From da0ad6a7cc2632594f04cb7872bc6f5fb158fd6b Mon Sep 17 00:00:00 2001 From: Alexandre Dutra Date: Tue, 31 Mar 2026 23:39:04 +0200 Subject: [PATCH 001/197] OpenAPI: Promote the S3 signing endpoint to the main spec (#15450) * REST: Promote the S3 signing endpoint to the main spec Dev ML discussion: https://lists.apache.org/thread/2kqdqb46j7jww36wwg4txv6pl2hqq9w7 This commit promotes the S3 remote signing endpoint from an AWS-specific implementation to a first-class REST catalog API endpoint. This enables other storage providers (GCS, Azure, etc.) to eventually reuse the same signing endpoint pattern without duplicating the API definition. Summary of changes: - Added `/v1/{prefix}/namespaces/{namespace}/tables/{table}/sign/{provider}` endpoint to the main REST catalog OpenAPI spec. - Defined `RemoteSignRequest`, `RemoteSignResult` and `RemoteSignResponse` schemas. - Defined a new `provider` request body parameter in order to disambiguate requests from different storage providers. - Deprecated the separate `s3-signer-open-api.yaml` spec from the AWS module (for removal). - Updated the Python client. --- .../main/resources/s3-signer-open-api.yaml | 19 ++- build.gradle | 1 + open-api/Makefile | 2 + open-api/rest-catalog-open-api.py | 45 ++++++- open-api/rest-catalog-open-api.yaml | 112 ++++++++++++++++-- 5 files changed, 165 insertions(+), 14 deletions(-) diff --git a/aws/src/main/resources/s3-signer-open-api.yaml b/aws/src/main/resources/s3-signer-open-api.yaml index 3d719c515b2a..0b98fcc59eff 100644 --- a/aws/src/main/resources/s3-signer-open-api.yaml +++ b/aws/src/main/resources/s3-signer-open-api.yaml @@ -17,19 +17,22 @@ # under the License. # +# ⚠️ WARNING: this API is deprecated. Use the new remote signing endpoint instead, +# see open-api/rest-catalog-open-api.yaml. 
+ --- openapi: 3.0.3 info: - title: Apache Iceberg S3 Signer API + title: "[DEPRECATED] Apache Iceberg S3 Signer API" license: name: Apache 2.0 url: https://www.apache.org/licenses/LICENSE-2.0.html version: 0.0.1 description: - Defines the specification for the S3 Signer API. + "[DEPRECATED] Defines the specification for the S3 Signer API." servers: - url: "{scheme}://{host}/{basePath}" - description: Server URL when the port can be inferred from the scheme + description: "[DEPRECATED] Server URL when the port can be inferred from the scheme" variables: scheme: description: The scheme of the URI, either http or https. @@ -41,7 +44,7 @@ servers: description: Optional prefix to be prepended to all routes default: "" - url: "{scheme}://{host}:{port}/{basePath}" - description: Generic base server URL, with all parts configurable + description: "[DEPRECATED] Generic base server URL, with all parts configurable" variables: scheme: description: The scheme of the URI, either http or https. @@ -61,9 +64,10 @@ paths: /v1/aws/s3/sign: post: + deprecated: true tags: - S3 Signer API - summary: Remotely signs S3 requests + summary: "[DEPRECATED] Remotely signs S3 requests" operationId: signS3Request requestBody: description: The request containing the headers to be signed @@ -95,6 +99,7 @@ components: schemas: S3Headers: + deprecated: true type: object additionalProperties: type: array @@ -102,6 +107,7 @@ components: type: string S3SignRequest: + deprecated: true required: - region - uri @@ -133,7 +139,8 @@ components: responses: S3SignResponse: - description: The response containing signed & unsigned headers. The server will also send + description: > + [DEPRECATED] The response containing signed & unsigned headers. The server will also send a Cache-Control header, indicating whether the response can be cached (Cache-Control = ["private"]) or not (Cache-Control = ["no-cache"]). 
content: diff --git a/build.gradle b/build.gradle index bd8c6c4bca72..35e1d6a002f3 100644 --- a/build.gradle +++ b/build.gradle @@ -560,6 +560,7 @@ project(':iceberg-aws') { jvmArgs += project.property('extraJvmArgs') } + // TODO delete once s3-signer-open-api.yaml is removed def s3SignerSpec = "$projectDir/src/main/resources/s3-signer-open-api.yaml" tasks.register('validateS3SignerSpec', org.openapitools.generator.gradle.plugin.tasks.ValidateTask) { inputSpec.set(s3SignerSpec) diff --git a/open-api/Makefile b/open-api/Makefile index 3c2c07936e41..797a2abd9293 100644 --- a/open-api/Makefile +++ b/open-api/Makefile @@ -21,10 +21,12 @@ install: validate-spec: uv run openapi-spec-validator --errors all rest-catalog-open-api.yaml + # TODO remove when s3-signer-open-api.yaml is removed uv run openapi-spec-validator --errors all ../aws/src/main/resources/s3-signer-open-api.yaml lint-spec: uv run yamllint --strict rest-catalog-open-api.yaml + # TODO remove when s3-signer-open-api.yaml is removed uv run yamllint --strict ../aws/src/main/resources/s3-signer-open-api.yaml lint: validate-spec lint-spec diff --git a/open-api/rest-catalog-open-api.py b/open-api/rest-catalog-open-api.py index 32cf975cf5b6..f8b3f5bd3771 100644 --- a/open-api/rest-catalog-open-api.py +++ b/open-api/rest-catalog-open-api.py @@ -1045,6 +1045,43 @@ class PlanTask(RootModel[str]): ) +class MultiValuedMap(RootModel[dict[str, list[str]]]): + """ + A map of string keys where each key can map to multiple string values. + """ + + root: dict[str, list[str]] + + +class RemoteSignRequest(BaseModel): + """ + The request to be signed remotely. + """ + + region: str + uri: str + method: Literal['PUT', 'GET', 'HEAD', 'POST', 'DELETE', 'PATCH', 'OPTIONS'] + headers: MultiValuedMap + properties: dict[str, str] | None = None + body: str | None = Field( + None, + description='Optional body of the request to send to the signing API. 
This should only be populated for requests where the body of the message contains content which must be validated before a request is signed, such as the S3 DeleteObjects call.', + ) + provider: str | None = Field( + None, + description='The storage provider for which the request is to be signed. The provider should correspond to the scheme used for a storage native URI. For example `s3` for AWS S3 paths. For backwards compatibility, if this is not specified, the provider is assumed to be `s3`.', + ) + + +class RemoteSignResult(BaseModel): + """ + The result of a remote request signing operation. + """ + + uri: str + headers: MultiValuedMap + + class CreateNamespaceRequest(BaseModel): namespace: Namespace properties: dict[str, str] | None = Field( @@ -1435,7 +1472,7 @@ class LoadTableResult(BaseModel): - `s3.access-key-id`: id for credentials that provide access to the data in S3 - `s3.secret-access-key`: secret for credentials that provide access to data in S3 - `s3.session-token`: if present, this value should be used for as the session token - - `s3.remote-signing-enabled`: if `true` remote signing should be performed as described in the `s3-signer-open-api.yaml` specification + - `s3.remote-signing-enabled`: if `true` remote signing should be performed as described in the `RemoteSignRequest` schema section of this spec document. - `s3.cross-region-access-enabled`: if `true`, S3 Cross-Region bucket access is enabled ## Storage Credentials @@ -1443,6 +1480,12 @@ class LoadTableResult(BaseModel): Credentials for ADLS / GCS / S3 / ... are provided through the `storage-credentials` field. Clients must first check whether the respective credentials exist in the `storage-credentials` field before checking the `config` for credentials. + ## Remote Signing + + If remote signing for a specific storage provider is enabled, clients must respect the following configurations when creating a remote signer client: + - `signer.endpoint`: the remote signer endpoint. Required. 
Can either be a relative path (to be resolved against `signer.uri`) or an absolute URI. + - `signer.uri`: the base URI to resolve `signer.endpoint` against. Optional. Only meaningful if `signer.endpoint` is a relative path. Defaults to the catalog's base URI if not set. + """ metadata_location: str | None = Field( diff --git a/open-api/rest-catalog-open-api.yaml b/open-api/rest-catalog-open-api.yaml index ee0097042534..4b4c9f6730ec 100644 --- a/open-api/rest-catalog-open-api.yaml +++ b/open-api/rest-catalog-open-api.yaml @@ -1254,6 +1254,40 @@ paths: 5XX: $ref: '#/components/responses/ServerErrorResponse' + /v1/{prefix}/namespaces/{namespace}/tables/{table}/sign: + parameters: + - $ref: '#/components/parameters/prefix' + - $ref: '#/components/parameters/namespace' + - $ref: '#/components/parameters/table' + + post: + tags: + - Catalog API + summary: Remotely signs requests to object storage + operationId: signRequest + requestBody: + description: The request to be signed + content: + application/json: + schema: + $ref: '#/components/schemas/RemoteSignRequest' + required: true + responses: + 200: + $ref: '#/components/responses/RemoteSignResponse' + 400: + $ref: '#/components/responses/BadRequestErrorResponse' + 401: + $ref: '#/components/responses/UnauthorizedResponse' + 403: + $ref: '#/components/responses/ForbiddenResponse' + 419: + $ref: '#/components/responses/AuthenticationTimeoutResponse' + 503: + $ref: '#/components/responses/ServiceUnavailableResponse' + 5XX: + $ref: '#/components/responses/ServerErrorResponse' + /v1/{prefix}/tables/rename: parameters: - $ref: '#/components/parameters/prefix' @@ -1963,12 +1997,8 @@ components: to supply access via any or none of the requested mechanisms. - Specific properties and handling for `vended-credentials` is documented - in the `LoadTableResult` schema section of this spec document. 
- - - The protocol and specification for `remote-signing` is documented in - the `s3-signer-open-api.yaml` OpenApi spec in the `aws` module. + Specific properties and handling for `vended-credentials` and `remote-signing` + are documented in the `LoadTableResult` schema section of this spec document. required: false schema: @@ -3479,13 +3509,19 @@ components: - `s3.access-key-id`: id for credentials that provide access to the data in S3 - `s3.secret-access-key`: secret for credentials that provide access to data in S3 - `s3.session-token`: if present, this value should be used for as the session token - - `s3.remote-signing-enabled`: if `true` remote signing should be performed as described in the `s3-signer-open-api.yaml` specification + - `s3.remote-signing-enabled`: if `true` remote signing should be performed as described in the `RemoteSignRequest` schema section of this spec document. - `s3.cross-region-access-enabled`: if `true`, S3 Cross-Region bucket access is enabled ## Storage Credentials Credentials for ADLS / GCS / S3 / ... are provided through the `storage-credentials` field. Clients must first check whether the respective credentials exist in the `storage-credentials` field before checking the `config` for credentials. + + ## Remote Signing + + If remote signing for a specific storage provider is enabled, clients must respect the following configurations when creating a remote signer client: + - `signer.endpoint`: the remote signer endpoint. Required. Can either be a relative path (to be resolved against `signer.uri`) or an absolute URI. + - `signer.uri`: the base URI to resolve `signer.endpoint` against. Optional. Only meaningful if `signer.endpoint` is a relative path. Defaults to the catalog's base URI if not set. type: object required: - metadata @@ -4696,6 +4732,59 @@ components: allOf: - $ref: '#/components/schemas/Expression' + MultiValuedMap: + description: A map of string keys where each key can map to multiple string values. 
+ type: object + additionalProperties: + type: array + items: + type: string + + RemoteSignRequest: + description: The request to be signed remotely. + type: object + required: + - region + - uri + - method + - headers + properties: + region: + type: string + uri: + type: string + method: + type: string + enum: ["PUT", "GET", "HEAD", "POST", "DELETE", "PATCH", "OPTIONS"] + headers: + $ref: '#/components/schemas/MultiValuedMap' + properties: + type: object + additionalProperties: + type: string + body: + type: string + description: Optional body of the request to send to the signing API. This should only be populated + for requests where the body of the message contains content which must be validated before a request is + signed, such as the S3 DeleteObjects call. + provider: + type: string + description: The storage provider for which the request is to be signed. The provider should correspond to + the scheme used for a storage native URI. For example `s3` for AWS S3 paths. For backwards compatibility, + if this is not specified, the provider is assumed to be `s3`. + + RemoteSignResult: + description: The result of a remote request signing operation. + type: object + required: + - uri + - headers + properties: + uri: + type: string + headers: + $ref: '#/components/schemas/MultiValuedMap' + ############################# # Reusable Response Objects # ############################# @@ -4977,6 +5066,15 @@ components: schema: $ref: '#/components/schemas/LoadCredentialsResponse' + RemoteSignResponse: + description: The response containing signed & unsigned headers. The server will also send + a Cache-Control header, indicating whether the response can be cached (Cache-Control = ["private"]) + or not (Cache-Control = ["no-cache"]). 
+ content: + application/json: + schema: + $ref: '#/components/schemas/RemoteSignResult' + ####################################### # Common examples of different values # ####################################### From ee1878f3d87c0610e6046bcccf297e459cdab650 Mon Sep 17 00:00:00 2001 From: Anoop Johnson Date: Tue, 31 Mar 2026 17:04:19 -0700 Subject: [PATCH 002/197] API, Core: Introduce foundational types for V4 manifest support (#15049) Introduces foundational types for V4 manifest support These types follow the https://s.apache.org/iceberg-single-file-commit and will be used by subsequent PRs for manifest reading/writing. For now, we are adding these as package-private interfaces in core, and eventually we will move them into api. --- .../java/org/apache/iceberg/FileContent.java | 6 +- .../org/apache/iceberg/DeletionVector.java | 64 +++++++ .../java/org/apache/iceberg/EntryStatus.java | 38 ++++ .../java/org/apache/iceberg/ManifestInfo.java | 113 ++++++++++++ .../java/org/apache/iceberg/TrackedFile.java | 173 ++++++++++++++++++ .../java/org/apache/iceberg/Tracking.java | 109 +++++++++++ .../java/org/apache/iceberg/V2Metadata.java | 18 ++ .../java/org/apache/iceberg/V3Metadata.java | 18 ++ .../org/apache/iceberg/TestTrackedFile.java | 103 +++++++++++ 9 files changed, 640 insertions(+), 2 deletions(-) create mode 100644 core/src/main/java/org/apache/iceberg/DeletionVector.java create mode 100644 core/src/main/java/org/apache/iceberg/EntryStatus.java create mode 100644 core/src/main/java/org/apache/iceberg/ManifestInfo.java create mode 100644 core/src/main/java/org/apache/iceberg/TrackedFile.java create mode 100644 core/src/main/java/org/apache/iceberg/Tracking.java create mode 100644 core/src/test/java/org/apache/iceberg/TestTrackedFile.java diff --git a/api/src/main/java/org/apache/iceberg/FileContent.java b/api/src/main/java/org/apache/iceberg/FileContent.java index 2c9a2fa51bd2..f977b02a9426 100644 --- a/api/src/main/java/org/apache/iceberg/FileContent.java +++ 
b/api/src/main/java/org/apache/iceberg/FileContent.java @@ -18,11 +18,13 @@ */ package org.apache.iceberg; -/** Content type stored in a file, one of DATA, POSITION_DELETES, or EQUALITY_DELETES. */ +/** Content type stored in a file. */ public enum FileContent { DATA(0), POSITION_DELETES(1), - EQUALITY_DELETES(2); + EQUALITY_DELETES(2), + DATA_MANIFEST(3), + DELETE_MANIFEST(4); private final int id; diff --git a/core/src/main/java/org/apache/iceberg/DeletionVector.java b/core/src/main/java/org/apache/iceberg/DeletionVector.java new file mode 100644 index 000000000000..55bd38dc97be --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/DeletionVector.java @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg; + +import org.apache.iceberg.types.Types; + +/** + * Metadata about a deletion vector. + * + *

Tracks where a DV blob can be read. The DV blob follows the format defined by the + * deletion-vector-v1 blob type in the Puffin spec. + */ +interface DeletionVector { + Types.NestedField LOCATION = + Types.NestedField.required( + 155, "location", Types.StringType.get(), "Location of the file containing the DV"); + Types.NestedField OFFSET = + Types.NestedField.required( + 144, "offset", Types.LongType.get(), "Offset in the file where the DV content starts"); + Types.NestedField SIZE_IN_BYTES = + Types.NestedField.required( + 145, + "size_in_bytes", + Types.LongType.get(), + "Length of the referenced DV content stored in the file"); + Types.NestedField CARDINALITY = + Types.NestedField.required( + 156, + "cardinality", + Types.LongType.get(), + "Number of set bits (deleted rows) in the vector"); + + static Types.StructType schema() { + return Types.StructType.of(LOCATION, OFFSET, SIZE_IN_BYTES, CARDINALITY); + } + + /** Returns the location of the file containing the deletion vector. */ + String location(); + + /** Returns the offset in the file where the deletion vector content starts. */ + long offset(); + + /** Returns the size in bytes of the deletion vector content. */ + long sizeInBytes(); + + /** Returns the number of set bits (deleted rows) in the vector. */ + long cardinality(); +} diff --git a/core/src/main/java/org/apache/iceberg/EntryStatus.java b/core/src/main/java/org/apache/iceberg/EntryStatus.java new file mode 100644 index 000000000000..a013f263d015 --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/EntryStatus.java @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg; + +/** Status of an entry in a manifest file. */ +enum EntryStatus { + EXISTING(0), + ADDED(1), + DELETED(2), + /** Indicates an entry that has been replaced by a column update or DV change. Added in v4. */ + REPLACED(3); + + private final int id; + + EntryStatus(int id) { + this.id = id; + } + + public int id() { + return id; + } +} diff --git a/core/src/main/java/org/apache/iceberg/ManifestInfo.java b/core/src/main/java/org/apache/iceberg/ManifestInfo.java new file mode 100644 index 000000000000..d9a23837c456 --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/ManifestInfo.java @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg; + +import java.nio.ByteBuffer; +import org.apache.iceberg.types.Types; + +/** Summary information about a manifest referenced by a v4 root manifest entry. */ +interface ManifestInfo { + Types.NestedField ADDED_FILES_COUNT = + Types.NestedField.required( + 504, "added_files_count", Types.IntegerType.get(), "Number of files added"); + Types.NestedField EXISTING_FILES_COUNT = + Types.NestedField.required( + 505, "existing_files_count", Types.IntegerType.get(), "Number of existing files"); + Types.NestedField DELETED_FILES_COUNT = + Types.NestedField.required( + 506, "deleted_files_count", Types.IntegerType.get(), "Number of deleted files"); + Types.NestedField REPLACED_FILES_COUNT = + Types.NestedField.required( + 520, "replaced_files_count", Types.IntegerType.get(), "Number of replaced files"); + Types.NestedField ADDED_ROWS_COUNT = + Types.NestedField.required( + 512, "added_rows_count", Types.LongType.get(), "Number of rows in added files"); + Types.NestedField EXISTING_ROWS_COUNT = + Types.NestedField.required( + 513, "existing_rows_count", Types.LongType.get(), "Number of rows in existing files"); + Types.NestedField DELETED_ROWS_COUNT = + Types.NestedField.required( + 514, "deleted_rows_count", Types.LongType.get(), "Number of rows in deleted files"); + Types.NestedField REPLACED_ROWS_COUNT = + Types.NestedField.required( + 521, "replaced_rows_count", Types.LongType.get(), "Number of rows in replaced files"); + Types.NestedField MIN_SEQUENCE_NUMBER = + Types.NestedField.required( + 516, + "min_sequence_number", + Types.LongType.get(), + "Minimum sequence number of files in this manifest"); + Types.NestedField DV = + Types.NestedField.optional( + 522, "dv", Types.BinaryType.get(), "Deletion vector for manifest entries"); + Types.NestedField DV_CARDINALITY = + Types.NestedField.optional( + 523, + "dv_cardinality", + Types.LongType.get(), + "Number of entries marked as deleted in the DV"); + + static Types.StructType schema() { + 
return Types.StructType.of( + ADDED_FILES_COUNT, + EXISTING_FILES_COUNT, + DELETED_FILES_COUNT, + REPLACED_FILES_COUNT, + ADDED_ROWS_COUNT, + EXISTING_ROWS_COUNT, + DELETED_ROWS_COUNT, + REPLACED_ROWS_COUNT, + MIN_SEQUENCE_NUMBER, + DV, + DV_CARDINALITY); + } + + /** Returns the number of files added by this manifest. */ + int addedFilesCount(); + + /** Returns the number of existing files referenced by this manifest. */ + int existingFilesCount(); + + /** Returns the number of deleted files in this manifest. */ + int deletedFilesCount(); + + /** Returns the number of replaced files in this manifest. */ + int replacedFilesCount(); + + /** Returns the number of rows in added files. */ + long addedRowsCount(); + + /** Returns the number of rows in existing files. */ + long existingRowsCount(); + + /** Returns the number of rows in deleted files. */ + long deletedRowsCount(); + + /** Returns the number of rows in replaced files. */ + long replacedRowsCount(); + + /** Returns the minimum sequence number of files in this manifest. */ + long minSequenceNumber(); + + /** Returns the deletion vector bitmap, or null if not present. */ + ByteBuffer dv(); + + /** Returns the number of entries marked as deleted in the DV, or null if not present. */ + Long dvCardinality(); +} diff --git a/core/src/main/java/org/apache/iceberg/TrackedFile.java b/core/src/main/java/org/apache/iceberg/TrackedFile.java new file mode 100644 index 000000000000..314e79014ae5 --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/TrackedFile.java @@ -0,0 +1,173 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg; + +import java.nio.ByteBuffer; +import java.util.Collections; +import java.util.List; +import java.util.Set; +import org.apache.iceberg.stats.ContentStats; +import org.apache.iceberg.types.Types; + +/** A content file with optional deletion vector, tracked by a v4 manifest. */ +interface TrackedFile { + Types.NestedField TRACKING = + Types.NestedField.required( + 147, "tracking", Tracking.schema(), "Tracking information for this entry"); + Types.NestedField CONTENT_TYPE = + Types.NestedField.required( + 134, + "content_type", + Types.IntegerType.get(), + "Type of content: 0=DATA, 2=EQUALITY_DELETES, 3=DATA_MANIFEST, 4=DELETE_MANIFEST"); + Types.NestedField LOCATION = + Types.NestedField.required(100, "location", Types.StringType.get(), "Location of the file"); + Types.NestedField FILE_FORMAT = + Types.NestedField.required( + 101, + "file_format", + Types.StringType.get(), + "String file format name: avro, orc, or parquet"); + Types.NestedField RECORD_COUNT = + Types.NestedField.required( + 103, "record_count", Types.LongType.get(), "Number of records in this file"); + Types.NestedField FILE_SIZE_IN_BYTES = + Types.NestedField.required( + 104, "file_size_in_bytes", Types.LongType.get(), "Total file size in bytes"); + Types.NestedField SPEC_ID = + Types.NestedField.optional( + 141, "spec_id", Types.IntegerType.get(), "Spec ID used to partition the file"); + + int CONTENT_STATS_ID = 146; + String CONTENT_STATS_NAME = "content_stats"; + String CONTENT_STATS_DOC = "Content statistics for this entry"; + + 
Types.NestedField SORT_ORDER_ID = + Types.NestedField.optional( + 140, "sort_order_id", Types.IntegerType.get(), "ID of the sort order for this file"); + Types.NestedField DELETION_VECTOR = + Types.NestedField.optional( + 148, "deletion_vector", DeletionVector.schema(), "Deletion vector for the data file"); + Types.NestedField MANIFEST_INFO = + Types.NestedField.optional( + 150, + "manifest_info", + ManifestInfo.schema(), + "Metadata fields specific to manifest files"); + Types.NestedField KEY_METADATA = + Types.NestedField.optional( + 131, + "key_metadata", + Types.BinaryType.get(), + "Implementation-specific key metadata for encryption"); + Types.NestedField SPLIT_OFFSETS = + Types.NestedField.optional( + 132, + "split_offsets", + Types.ListType.ofRequired(133, Types.LongType.get()), + "Split offsets for the data file"); + Types.NestedField EQUALITY_IDS = + Types.NestedField.optional( + 135, + "equality_ids", + Types.ListType.ofRequired(136, Types.IntegerType.get()), + "Field ids used to determine row equality in equality delete files"); + + static Types.StructType schemaWithContentStats(Types.StructType contentStatsType) { + return Types.StructType.of( + TRACKING, + CONTENT_TYPE, + LOCATION, + FILE_FORMAT, + RECORD_COUNT, + FILE_SIZE_IN_BYTES, + SPEC_ID, + Types.NestedField.optional( + CONTENT_STATS_ID, CONTENT_STATS_NAME, contentStatsType, CONTENT_STATS_DOC), + SORT_ORDER_ID, + DELETION_VECTOR, + MANIFEST_INFO, + KEY_METADATA, + SPLIT_OFFSETS, + EQUALITY_IDS); + } + + /** Returns the tracking information for this entry. */ + Tracking tracking(); + + /** Returns the type of content stored by this entry. */ + FileContent contentType(); + + /** Returns the location of the file. */ + String location(); + + /** Returns the format of the file. */ + FileFormat fileFormat(); + + /** Returns the number of records in this file. */ + long recordCount(); + + /** Returns the total file size in bytes. 
*/ + long fileSizeInBytes(); + + /** Returns the ID of the partition spec used to partition this file, or null. */ + Integer specId(); + + /** Returns the content stats for this entry. */ + ContentStats contentStats(); + + /** Returns the ID of the sort order for this file, or null. */ + Integer sortOrderId(); + + /** Returns the deletion vector for this entry, or null if there is no deletion vector. */ + DeletionVector deletionVector(); + + /** Returns the manifest summary information, or null for non-manifest entries. */ + ManifestInfo manifestInfo(); + + /** Returns encryption key metadata, or null if the file is not encrypted. */ + ByteBuffer keyMetadata(); + + /** Returns the list of recommended split locations, or null. */ + List splitOffsets(); + + /** Returns the set of field IDs used for equality comparison in equality delete files. */ + List equalityIds(); + + /** Copies this tracked file. */ + TrackedFile copy(); + + /** + * Copies this tracked file with stats only for specific columns. + * + * @param requestedColumnIds table field IDs for which to keep stats + */ + TrackedFile copyWithStats(Set requestedColumnIds); + + /** Copies this tracked file without stats. */ + default TrackedFile copyWithoutStats() { + return copyWithStats(Collections.emptySet()); + } + + /** Returns the manifest location this entry was read from, or null. */ + String manifestLocation(); + + /** Returns the ordinal position of this entry within the manifest. */ + long manifestPos(); +} diff --git a/core/src/main/java/org/apache/iceberg/Tracking.java b/core/src/main/java/org/apache/iceberg/Tracking.java new file mode 100644 index 000000000000..c9467da85fdd --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/Tracking.java @@ -0,0 +1,109 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg; + +import java.nio.ByteBuffer; +import org.apache.iceberg.types.Types; + +/** Tracking information for a v4 manifest entry. */ +interface Tracking { + Types.NestedField STATUS = + Types.NestedField.required( + 0, + "status", + Types.IntegerType.get(), + "Entry status: 0=existing, 1=added, 2=deleted, 3=replaced"); + Types.NestedField SNAPSHOT_ID = + Types.NestedField.optional( + 1, + "snapshot_id", + Types.LongType.get(), + "Snapshot ID where the file was added or deleted"); + Types.NestedField SEQUENCE_NUMBER = + Types.NestedField.optional( + 3, "sequence_number", Types.LongType.get(), "Data sequence number of the file"); + Types.NestedField FILE_SEQUENCE_NUMBER = + Types.NestedField.optional( + 4, + "file_sequence_number", + Types.LongType.get(), + "File sequence number indicating when the file was added"); + Types.NestedField DV_SNAPSHOT_ID = + Types.NestedField.optional( + 5, + "dv_snapshot_id", + Types.LongType.get(), + "Snapshot ID where the DV was added; null if there is no DV"); + Types.NestedField FIRST_ROW_ID = + Types.NestedField.optional( + 142, "first_row_id", Types.LongType.get(), "ID of the first row in the data file"); + Types.NestedField DELETED_POSITIONS = + Types.NestedField.optional( + 6, + "deleted_positions", + Types.BinaryType.get(), + "Bitmap of positions deleted in this snapshot"); + Types.NestedField REPLACED_POSITIONS = + 
Types.NestedField.optional( + 7, + "replaced_positions", + Types.BinaryType.get(), + "Bitmap of positions replaced in this snapshot"); + + static Types.StructType schema() { + return Types.StructType.of( + STATUS, + SNAPSHOT_ID, + SEQUENCE_NUMBER, + FILE_SEQUENCE_NUMBER, + DV_SNAPSHOT_ID, + FIRST_ROW_ID, + DELETED_POSITIONS, + REPLACED_POSITIONS); + } + + /** Returns the status of the entry. */ + EntryStatus status(); + + /** Returns whether this entry is live. */ + default boolean isLive() { + return status() == EntryStatus.ADDED || status() == EntryStatus.EXISTING; + } + + /** Returns the snapshot ID where the file was added or deleted. */ + Long snapshotId(); + + /** Returns the data sequence number of the file. */ + Long dataSequenceNumber(); + + /** Returns the file sequence number indicating when the file was added. */ + Long fileSequenceNumber(); + + /** Returns the snapshot ID where the DV was added; null if there is no DV. */ + Long dvSnapshotId(); + + /** Returns the ID of the first row in the data file. */ + Long firstRowId(); + + /** Returns the bitmap of positions deleted in this snapshot. */ + ByteBuffer deletedPositions(); + + /** Returns the bitmap of positions replaced in this snapshot. 
*/ + ByteBuffer replacedPositions(); +} diff --git a/core/src/main/java/org/apache/iceberg/V2Metadata.java b/core/src/main/java/org/apache/iceberg/V2Metadata.java index 832e5c383fe5..803905f6b42e 100644 --- a/core/src/main/java/org/apache/iceberg/V2Metadata.java +++ b/core/src/main/java/org/apache/iceberg/V2Metadata.java @@ -93,6 +93,7 @@ private Object get(int pos) { case 2: return wrapped.partitionSpecId(); case 3: + checkContentType(wrapped.content()); return wrapped.content().id(); case 4: if (wrapped.sequenceNumber() == ManifestWriter.UNASSIGNED_SEQ) { @@ -428,6 +429,7 @@ public T get(int pos, Class javaClass) { private Object get(int pos) { switch (pos) { case 0: + checkContentType(wrapped.content()); return wrapped.content().id(); case 1: return wrapped.location(); @@ -589,4 +591,20 @@ public F copyWithoutStats() { throw new UnsupportedOperationException("Cannot copy IndexedDataFile wrapper"); } } + + private static void checkContentType(ManifestContent content) { + Preconditions.checkArgument( + content == ManifestContent.DATA || content == ManifestContent.DELETES, + "Unsupported manifest content type for v2: %s", + content); + } + + private static void checkContentType(FileContent content) { + Preconditions.checkArgument( + content == FileContent.DATA + || content == FileContent.POSITION_DELETES + || content == FileContent.EQUALITY_DELETES, + "Unsupported file content type for v2: %s", + content); + } } diff --git a/core/src/main/java/org/apache/iceberg/V3Metadata.java b/core/src/main/java/org/apache/iceberg/V3Metadata.java index 8529d68501d4..4e67d9977e64 100644 --- a/core/src/main/java/org/apache/iceberg/V3Metadata.java +++ b/core/src/main/java/org/apache/iceberg/V3Metadata.java @@ -94,6 +94,7 @@ private Object get(int pos) { case 2: return wrapped.partitionSpecId(); case 3: + checkContentType(wrapped.content()); return wrapped.content().id(); case 4: if (wrapped.sequenceNumber() == ManifestWriter.UNASSIGNED_SEQ) { @@ -454,6 +455,7 @@ public T get(int 
pos, Class javaClass) { private Object get(int pos) { switch (pos) { case 0: + checkContentType(wrapped.content()); return wrapped.content().id(); case 1: return wrapped.location(); @@ -523,4 +525,20 @@ public Long pos() { return null; } } + + private static void checkContentType(ManifestContent content) { + Preconditions.checkArgument( + content == ManifestContent.DATA || content == ManifestContent.DELETES, + "Unsupported manifest content type for v3: %s", + content); + } + + private static void checkContentType(FileContent content) { + Preconditions.checkArgument( + content == FileContent.DATA + || content == FileContent.POSITION_DELETES + || content == FileContent.EQUALITY_DELETES, + "Unsupported file content type for v3: %s", + content); + } } diff --git a/core/src/test/java/org/apache/iceberg/TestTrackedFile.java b/core/src/test/java/org/apache/iceberg/TestTrackedFile.java new file mode 100644 index 000000000000..d468c9352d0e --- /dev/null +++ b/core/src/test/java/org/apache/iceberg/TestTrackedFile.java @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg; + +import static org.apache.iceberg.types.Types.NestedField.optional; +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.List; +import org.apache.iceberg.stats.StatsUtil; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.Test; + +public class TestTrackedFile { + + private static final Schema TABLE_SCHEMA = + new Schema( + optional(1, "id", Types.IntegerType.get()), optional(2, "data", Types.StringType.get())); + + private static final Types.StructType CONTENT_STATS_TYPE = + StatsUtil.contentStatsFor(TABLE_SCHEMA).type().asStructType(); + + @Test + public void schemaWithContentStatsFieldOrder() { + Types.StructType type = TrackedFile.schemaWithContentStats(CONTENT_STATS_TYPE); + List fields = type.fields(); + + assertThat(fields) + .extracting(Types.NestedField::name) + .containsExactly( + "tracking", + "content_type", + "location", + "file_format", + "record_count", + "file_size_in_bytes", + "spec_id", + "content_stats", + "sort_order_id", + "deletion_vector", + "manifest_info", + "key_metadata", + "split_offsets", + "equality_ids"); + } + + @Test + public void schemaWithContentStatsFieldIds() { + Types.StructType type = TrackedFile.schemaWithContentStats(CONTENT_STATS_TYPE); + List fields = type.fields(); + + assertThat(fields) + .extracting(Types.NestedField::fieldId) + .containsExactly(147, 134, 100, 101, 103, 104, 141, 146, 140, 148, 150, 131, 132, 135); + } + + @Test + public void schemaWithContentStatsUsesProvidedType() { + Types.StructType type = TrackedFile.schemaWithContentStats(CONTENT_STATS_TYPE); + Types.NestedField contentStatsField = type.field(TrackedFile.CONTENT_STATS_ID); + + assertThat(contentStatsField.type().asStructType()).isEqualTo(CONTENT_STATS_TYPE); + } + + @Test + public void schemaWithContentStatsReflectsInput() { + Schema smallSchema = new Schema(optional(1, "id", Types.IntegerType.get())); + Schema largeSchema = + new Schema( + optional(1, "id", 
Types.IntegerType.get()), + optional(2, "data", Types.StringType.get()), + optional(3, "ts", Types.TimestampType.withoutZone())); + + Types.StructType smallStats = StatsUtil.contentStatsFor(smallSchema).type().asStructType(); + Types.StructType largeStats = StatsUtil.contentStatsFor(largeSchema).type().asStructType(); + + Types.StructType smallType = TrackedFile.schemaWithContentStats(smallStats); + Types.StructType largeType = TrackedFile.schemaWithContentStats(largeStats); + + Types.StructType smallResult = + smallType.field(TrackedFile.CONTENT_STATS_ID).type().asStructType(); + Types.StructType largeResult = + largeType.field(TrackedFile.CONTENT_STATS_ID).type().asStructType(); + + assertThat(smallResult.fields()).hasSize(1); + assertThat(largeResult.fields()).hasSize(3); + } +} From 850480018ed371bfbc7c28a68fb0a21a21a23f6b Mon Sep 17 00:00:00 2001 From: Ruijing Li Date: Tue, 31 Mar 2026 17:40:06 -0700 Subject: [PATCH 003/197] Spark 4.1: Fix async microbatch plan bugs (#15670) --- .../source/AsyncSparkMicroBatchPlanner.java | 92 +++++----- .../TestAsyncSparkMicroBatchPlanner.java | 61 +++++++ .../source/TestStructuredStreamingRead3.java | 161 ++++++++++++++++++ 3 files changed, 276 insertions(+), 38 deletions(-) create mode 100644 spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestAsyncSparkMicroBatchPlanner.java diff --git a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/AsyncSparkMicroBatchPlanner.java b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/AsyncSparkMicroBatchPlanner.java index 527b41cdcff2..3e442f9917d4 100644 --- a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/AsyncSparkMicroBatchPlanner.java +++ b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/AsyncSparkMicroBatchPlanner.java @@ -23,7 +23,7 @@ import java.util.LinkedList; import java.util.List; import java.util.concurrent.Executors; -import java.util.concurrent.LinkedBlockingQueue; +import 
java.util.concurrent.LinkedBlockingDeque; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicLong; @@ -31,6 +31,7 @@ import org.apache.iceberg.MicroBatches; import org.apache.iceberg.Snapshot; import org.apache.iceberg.Table; +import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.spark.SparkReadConf; @@ -52,7 +53,7 @@ class AsyncSparkMicroBatchPlanner extends BaseSparkMicroBatchPlanner implements private final Cache, List> planFilesCache; // Queue to buffer pre-fetched file scan tasks - private final LinkedBlockingQueue> queue; + private final LinkedBlockingDeque> queue; // Background executor for async operations private final ScheduledExecutorService executor; @@ -64,7 +65,6 @@ class AsyncSparkMicroBatchPlanner extends BaseSparkMicroBatchPlanner implements // Tracking queue state private final AtomicLong queuedFileCount = new AtomicLong(0); private final AtomicLong queuedRowCount = new AtomicLong(0); - private volatile Pair tail; private Snapshot lastQueuedSnapshot; private boolean stopped; @@ -90,10 +90,14 @@ class AsyncSparkMicroBatchPlanner extends BaseSparkMicroBatchPlanner implements this.minQueuedRows = readConf().maxRecordsPerMicroBatch(); this.lastOffsetForTriggerAvailableNow = lastOffsetForTriggerAvailableNow; this.planFilesCache = Caffeine.newBuilder().maximumSize(PLAN_FILES_CACHE_MAX_SIZE).build(); - this.queue = new LinkedBlockingQueue<>(); + this.queue = new LinkedBlockingDeque<>(); table().refresh(); - // Synchronously add data to the queue to meet our initial constraints + + // Synchronously add data to the queue to meet our initial constraints. + // For Trigger.AvailableNow, constructor-time preload is normally initialized from + // latestOffset(...) 
with no explicit end offset, so bounded preload must stop at + // Trigger.AvailableNow snapshot. fillQueue(initialOffset, maybeEndOffset); this.executor = @@ -172,17 +176,11 @@ public synchronized List planFiles( long rowsInPlan = 0; do { - // Synchronize here since we are polling, checking for empty and updating tail - synchronized (queue) { - try { - elem = queue.poll(QUEUE_POLL_TIMEOUT_MS, TimeUnit.MILLISECONDS); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - throw new RuntimeException("Interrupted while polling queue", e); - } - if (queue.isEmpty()) { - tail = null; - } + try { + elem = queue.pollFirst(QUEUE_POLL_TIMEOUT_MS, TimeUnit.MILLISECONDS); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + throw new RuntimeException("Interrupted while polling queue", e); } if (elem != null) { @@ -197,7 +195,7 @@ public synchronized List planFiles( result.add(currentTask); // try to peek at the next entry of the queue and see if we should stop - Pair nextElem = queue.peek(); + Pair nextElem = queue.peekFirst(); boolean endOffsetPeek = false; if (nextElem != null) { endOffsetPeek = endOffset.equals(nextElem.first()); @@ -210,12 +208,18 @@ public synchronized List planFiles( } else { LOG.trace("planFiles hasn't reached {}, waiting", endOffset); } - } while (!shouldTerminate && refreshFailedThrowable == null); + } while (!shouldTerminate + && refreshFailedThrowable == null + && fillQueueFailedThrowable == null); if (refreshFailedThrowable != null) { throw new RuntimeException("Table refresh failed", refreshFailedThrowable); } + if (fillQueueFailedThrowable != null) { + throw new RuntimeException("Queue filling failed", fillQueueFailedThrowable); + } + LOG.info( "completed planFiles for {}, startOffset: {}, endOffset: {}, files: {}, rows: {}", table().name(), @@ -293,10 +297,12 @@ private StreamingOffset computeLimitedOffset(ReadLimit limit) { queuedFileCount.get(), queuedRowCount.get()); - // Convert to list for 
indexed access - List> queueList = Lists.newArrayList(queue); - for (int i = 0; i < queueList.size(); i++) { - Pair elem = queueList.get(i); + List> queueSnapshot = Lists.newArrayList(queue); + Pair queueTail = + queueSnapshot.isEmpty() ? null : queueSnapshot.get(queueSnapshot.size() - 1); + + for (int i = 0; i < queueSnapshot.size(); i++) { + Pair elem = queueSnapshot.get(i); long fileRows = elem.second().file().recordCount(); // Hard limit on files - stop BEFORE exceeding @@ -329,13 +335,13 @@ private StreamingOffset computeLimitedOffset(ReadLimit limit) { unpackedLimits.getMaxRows()); } // Return the offset of the NEXT element (or synthesize tail+1) - if (i + 1 < queueList.size()) { + if (i + 1 < queueSnapshot.size()) { LOG.debug( "latestOffset hit row limit at {}, rows: {}, files: {}", - queueList.get(i + 1).first(), + queueSnapshot.get(i + 1).first(), rowsSeen, filesSeen); - return queueList.get(i + 1).first(); + return queueSnapshot.get(i + 1).first(); } else { // This is the last element - return tail+1 StreamingOffset current = elem.first(); @@ -353,8 +359,8 @@ private StreamingOffset computeLimitedOffset(ReadLimit limit) { } // if we got here there aren't enough files to exceed our limits - if (tail != null) { - StreamingOffset tailOffset = tail.first(); + if (queueTail != null) { + StreamingOffset tailOffset = queueTail.first(); // we have to increment the position by 1 since we want to include the tail in the read and // position is non-inclusive StreamingOffset latestOffset = @@ -405,11 +411,7 @@ private void addMicroBatchToQueue( Pair.of(new StreamingOffset(microBatch.snapshotId(), position, shouldScanAllFile), task); queuedFileCount.incrementAndGet(); queuedRowCount.addAndGet(task.file().recordCount()); - // I have to synchronize here so queue and tail can never be out of sync - synchronized (queue) { - queue.add(elem); - tail = elem; - } + queue.addLast(elem); position += 1; } if (LOG.isDebugEnabled()) { @@ -461,8 +463,8 @@ private void 
fillQueueInitialBuffer(Snapshot startSnapshot) { long targetRows = readConf().asyncQueuePreloadRowLimit(); long targetFiles = readConf().asyncQueuePreloadFileLimit(); - Snapshot tableCurrentSnapshot = table().currentSnapshot(); - if (tableCurrentSnapshot == null) { + Snapshot preloadEndSnapshot = initialPreloadEndSnapshot(); + if (preloadEndSnapshot == null) { return; // Empty table } @@ -478,7 +480,7 @@ private void fillQueueInitialBuffer(Snapshot startSnapshot) { // Continue loading more snapshots within safety limits if (current != null) { while ((queuedRowCount.get() < targetRows || queuedFileCount.get() < targetFiles) - && current.snapshotId() != tableCurrentSnapshot.snapshotId()) { + && current.snapshotId() != preloadEndSnapshot.snapshotId()) { current = nextValidSnapshot(current); if (current != null) { addMicroBatchToQueue( @@ -490,12 +492,26 @@ private void fillQueueInitialBuffer(Snapshot startSnapshot) { } } + private Snapshot initialPreloadEndSnapshot() { + if (lastOffsetForTriggerAvailableNow != null) { + return table().snapshot(lastOffsetForTriggerAvailableNow.snapshotId()); + } + + return table().currentSnapshot(); + } + + @VisibleForTesting + static boolean reachedAvailableNowCap( + Snapshot readFrom, StreamingOffset lastOffsetForTriggerAvailableNow) { + return lastOffsetForTriggerAvailableNow != null + && readFrom != null + && readFrom.snapshotId() == lastOffsetForTriggerAvailableNow.snapshotId(); + } + /** Try to populate the queue with data from unread snapshots */ private void fillQueue(Snapshot readFrom) { // Don't add beyond cap for Trigger.AvailableNow - if (this.lastOffsetForTriggerAvailableNow != null - && readFrom != null - && readFrom.snapshotId() >= this.lastOffsetForTriggerAvailableNow.snapshotId()) { + if (reachedAvailableNowCap(readFrom, lastOffsetForTriggerAvailableNow)) { LOG.debug( "Reached cap snapshot {}, not adding more", this.lastOffsetForTriggerAvailableNow.snapshotId()); diff --git 
a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestAsyncSparkMicroBatchPlanner.java b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestAsyncSparkMicroBatchPlanner.java new file mode 100644 index 000000000000..b6017e2001e7 --- /dev/null +++ b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestAsyncSparkMicroBatchPlanner.java @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.spark.source; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +import org.apache.iceberg.Snapshot; +import org.junit.jupiter.api.Test; + +class TestAsyncSparkMicroBatchPlanner { + + @Test + void reachedAvailableNowCapReturnsTrueOnlyForExactCapSnapshot() { + Snapshot capSnapshot = mockSnapshot(10L); + Snapshot laterSnapshotWithHigherId = mockSnapshot(20L); + Snapshot laterSnapshotWithLowerId = mockSnapshot(5L); + StreamingOffset capOffset = new StreamingOffset(10L, 3L, false); + + assertThat(AsyncSparkMicroBatchPlanner.reachedAvailableNowCap(capSnapshot, capOffset)).isTrue(); + assertThat( + AsyncSparkMicroBatchPlanner.reachedAvailableNowCap( + laterSnapshotWithHigherId, capOffset)) + .isFalse(); + assertThat( + AsyncSparkMicroBatchPlanner.reachedAvailableNowCap(laterSnapshotWithLowerId, capOffset)) + .isFalse(); + } + + @Test + void reachedAvailableNowCapReturnsFalseWhenCapOrSnapshotIsMissing() { + Snapshot readFrom = mockSnapshot(10L); + StreamingOffset capOffset = new StreamingOffset(10L, 1L, false); + + assertThat(AsyncSparkMicroBatchPlanner.reachedAvailableNowCap(readFrom, null)).isFalse(); + assertThat(AsyncSparkMicroBatchPlanner.reachedAvailableNowCap(null, capOffset)).isFalse(); + } + + private Snapshot mockSnapshot(long snapshotId) { + Snapshot snapshot = mock(Snapshot.class); + when(snapshot.snapshotId()).thenReturn(snapshotId); + return snapshot; + } +} diff --git a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreamingRead3.java b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreamingRead3.java index 98e83bdd17cc..3957872be721 100644 --- a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreamingRead3.java +++ b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreamingRead3.java @@ -37,6 +37,7 @@ import 
org.apache.iceberg.DataOperations; import org.apache.iceberg.DeleteFile; import org.apache.iceberg.FileFormat; +import org.apache.iceberg.FileScanTask; import org.apache.iceberg.Files; import org.apache.iceberg.Parameter; import org.apache.iceberg.ParameterizedTestExtension; @@ -53,22 +54,28 @@ import org.apache.iceberg.data.GenericRecord; import org.apache.iceberg.data.Record; import org.apache.iceberg.expressions.Expressions; +import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Iterables; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.spark.CatalogTestBase; import org.apache.iceberg.spark.SparkCatalogConfig; +import org.apache.iceberg.spark.SparkReadConf; import org.apache.iceberg.spark.SparkReadOptions; +import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.VoidFunction2; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.Row; +import org.apache.spark.sql.connector.read.InputPartition; +import org.apache.spark.sql.connector.read.streaming.Offset; import org.apache.spark.sql.internal.SQLConf; import org.apache.spark.sql.streaming.DataStreamWriter; import org.apache.spark.sql.streaming.OutputMode; import org.apache.spark.sql.streaming.StreamingQuery; import org.apache.spark.sql.streaming.Trigger; +import org.apache.spark.sql.util.CaseInsensitiveStringMap; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeEach; @@ -465,6 +472,143 @@ public void testTriggerAvailableNowDoesNotProcessNewDataWhileRunning() throws Ex assertThat(actualResults).containsExactlyInAnyOrderElementsOf(Iterables.concat(expectedData)); } + @TestTemplate + public void 
testTriggerAvailableNowCapsAsyncPreloadAfterPrepare() { + List> initialData = + List.of(List.of(new SimpleRecord(1, "one")), List.of(new SimpleRecord(2, "two"))); + appendDataAsMultipleSnapshots(initialData); + + table.refresh(); + long expectedCapSnapshotId = table.currentSnapshot().snapshotId(); + + SparkMicroBatchStream stream = + new SparkMicroBatchStream( + JavaSparkContext.fromSparkContext(spark.sparkContext()), + table, + table::io, + new SparkReadConf( + spark, + table, + new CaseInsensitiveStringMap( + ImmutableMap.of( + SparkReadOptions.ASYNC_MICRO_BATCH_PLANNING_ENABLED, + async.toString(), + SparkReadOptions.STREAMING_MAX_FILES_PER_MICRO_BATCH, + "1", + SparkReadOptions.STREAMING_SNAPSHOT_POLLING_INTERVAL_MS, + "1", + SparkReadOptions.ASYNC_QUEUE_PRELOAD_FILE_LIMIT, + "10", + SparkReadOptions.ASYNC_QUEUE_PRELOAD_ROW_LIMIT, + "10"))), + table.schema(), + temp.resolve("available-now-cap-checkpoint").toString()); + + try { + stream.prepareForTriggerAvailableNow(); + + appendData(List.of(new SimpleRecord(3, "three"))); + + Offset startOffset = stream.initialOffset(); + Offset firstEndOffset = stream.latestOffset(startOffset, stream.getDefaultReadLimit()); + assertThat(firstEndOffset).isNotNull(); + stream.planInputPartitions(startOffset, firstEndOffset); + + Offset secondEndOffset = stream.latestOffset(firstEndOffset, stream.getDefaultReadLimit()); + assertThat(secondEndOffset).isNotNull(); + stream.planInputPartitions(firstEndOffset, secondEndOffset); + + assertThat(stream.latestOffset(secondEndOffset, stream.getDefaultReadLimit())).isNull(); + assertThat(((StreamingOffset) secondEndOffset).snapshotId()).isEqualTo(expectedCapSnapshotId); + } finally { + stream.stop(); + } + } + + @TestTemplate + public void testLatestOffsetReturnsNullAfterFinalBatchIsConsumed() throws Exception { + appendDataAsMultipleSnapshots(TEST_DATA_MULTIPLE_SNAPSHOTS); + + table.refresh(); + int expectedBatchCount; + try (CloseableIterable tasks = table.newScan().planFiles()) { + 
expectedBatchCount = Iterables.size(tasks); + } + + SparkMicroBatchStream stream = + newMicroBatchStream( + ImmutableMap.of(SparkReadOptions.STREAMING_MAX_FILES_PER_MICRO_BATCH, "1"), + "drain-to-null-checkpoint"); + + try { + int plannedBatchCount = 0; + Offset startOffset = stream.initialOffset(); + Offset endOffset = stream.latestOffset(startOffset, stream.getDefaultReadLimit()); + while (endOffset != null) { + InputPartition[] partitions = stream.planInputPartitions(startOffset, endOffset); + assertThat(partitions).isNotEmpty(); + plannedBatchCount += 1; + startOffset = endOffset; + endOffset = stream.latestOffset(startOffset, stream.getDefaultReadLimit()); + } + + assertThat(endOffset).isNull(); + assertThat(plannedBatchCount).isEqualTo(expectedBatchCount); + } finally { + stream.stop(); + } + } + + @TestTemplate + public void testPlanInputPartitionsIsIdempotentForSameOffsets() { + appendDataAsMultipleSnapshots(TEST_DATA_MULTIPLE_SNAPSHOTS); + + SparkMicroBatchStream stream = + newMicroBatchStream( + ImmutableMap.of(SparkReadOptions.STREAMING_MAX_FILES_PER_MICRO_BATCH, "1"), + "idempotent-plan-files-checkpoint"); + + try { + Offset startOffset = stream.initialOffset(); + Offset endOffset = stream.latestOffset(startOffset, stream.getDefaultReadLimit()); + + assertThat(endOffset).isNotNull(); + + InputPartition[] firstPartitions = stream.planInputPartitions(startOffset, endOffset); + InputPartition[] secondPartitions = stream.planInputPartitions(startOffset, endOffset); + + List firstFileLocations = Lists.newArrayList(); + for (InputPartition partition : firstPartitions) { + SparkInputPartition sparkInputPartition = (SparkInputPartition) partition; + for (FileScanTask task : sparkInputPartition.taskGroup().tasks()) { + firstFileLocations.add(task.file().location()); + } + } + + List secondFileLocations = Lists.newArrayList(); + for (InputPartition partition : secondPartitions) { + SparkInputPartition sparkInputPartition = (SparkInputPartition) partition; + for 
(FileScanTask task : sparkInputPartition.taskGroup().tasks()) { + secondFileLocations.add(task.file().location()); + } + } + + assertThat(firstFileLocations).containsExactlyInAnyOrderElementsOf(secondFileLocations); + + startOffset = endOffset; + endOffset = stream.latestOffset(startOffset, stream.getDefaultReadLimit()); + while (endOffset != null) { + assertThat(stream.planInputPartitions(startOffset, endOffset)).isNotEmpty(); + startOffset = endOffset; + endOffset = stream.latestOffset(startOffset, stream.getDefaultReadLimit()); + } + + assertThat(endOffset).isNull(); + } finally { + stream.stop(); + } + } + @TestTemplate public void testReadStreamOnIcebergThenAddData() throws Exception { List> expected = TEST_DATA_MULTIPLE_SNAPSHOTS; @@ -1053,4 +1197,21 @@ private List rowsAvailable(StreamingQuery query) { .as(Encoders.bean(SimpleRecord.class)) .collectAsList(); } + + private SparkMicroBatchStream newMicroBatchStream( + Map options, String checkpointDirName) { + Map allOptions = Maps.newHashMap(options); + allOptions.put(SparkReadOptions.ASYNC_MICRO_BATCH_PLANNING_ENABLED, async.toString()); + if (async) { + allOptions.putIfAbsent(SparkReadOptions.STREAMING_SNAPSHOT_POLLING_INTERVAL_MS, "1"); + } + + return new SparkMicroBatchStream( + JavaSparkContext.fromSparkContext(spark.sparkContext()), + table, + table::io, + new SparkReadConf(spark, table, new CaseInsensitiveStringMap(allOptions)), + table.schema(), + temp.resolve(checkpointDirName).toString()); + } } From fd21d667fb51433f08df8e538054a1461822b328 Mon Sep 17 00:00:00 2001 From: Marius Grama Date: Wed, 1 Apr 2026 16:34:46 +0200 Subject: [PATCH 004/197] GCS: Throw NotFoundException for nonexisting input GCS file (#15734) Signal to the TableOperations that there is no retry needed for files which do not exist. 
--- .../apache/iceberg/gcp/gcs/TestGcsFileIO.java | 37 ++++++++++++--- .../iceberg/gcp/gcs/GCSExceptionUtil.java | 35 +++++++++++++++ .../apache/iceberg/gcp/gcs/GCSInputFile.java | 4 +- .../iceberg/gcp/gcs/GCSInputStream.java | 14 +++++- .../gcp/gcs/GcsInputStreamWrapper.java | 45 ++++++++++++++++--- .../iceberg/gcp/gcs/TestGCSInputStream.java | 3 ++ .../gcp/gcs/TestGcsInputStreamWrapper.java | 6 ++- 7 files changed, 127 insertions(+), 17 deletions(-) create mode 100644 gcp/src/main/java/org/apache/iceberg/gcp/gcs/GCSExceptionUtil.java diff --git a/gcp/src/integration/java/org/apache/iceberg/gcp/gcs/TestGcsFileIO.java b/gcp/src/integration/java/org/apache/iceberg/gcp/gcs/TestGcsFileIO.java index b377d24c6042..626aacd17d33 100644 --- a/gcp/src/integration/java/org/apache/iceberg/gcp/gcs/TestGcsFileIO.java +++ b/gcp/src/integration/java/org/apache/iceberg/gcp/gcs/TestGcsFileIO.java @@ -31,7 +31,6 @@ import com.google.cloud.storage.BlobInfo; import com.google.cloud.storage.BucketInfo; import com.google.cloud.storage.Storage; -import com.google.cloud.storage.StorageException; import com.google.cloud.storage.StorageOptions; import java.io.IOException; import java.io.InputStream; @@ -39,10 +38,12 @@ import java.util.List; import java.util.Random; import java.util.stream.Collectors; +import org.apache.iceberg.exceptions.NotFoundException; import org.apache.iceberg.gcp.GCPProperties; import org.apache.iceberg.io.FileInfo; import org.apache.iceberg.io.IOUtil; import org.apache.iceberg.io.InputFile; +import org.apache.iceberg.io.SeekableInputStream; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.junit.jupiter.api.AfterAll; @@ -222,14 +223,40 @@ public void deletePrefix() { } @Test - public void readMissingLocation() { + public void readMissingLocation() throws IOException { String location = String.format("gs://%s/path/to/data.parquet", BUCKET); + InputFile input = 
fileIO.newInputFile(location); + + // Creating an input stream or changing the read position in it are local operations + try (SeekableInputStream in = input.newStream()) { + in.seek(1); + } + + try (SeekableInputStream in = input.newStream()) { + assertThatThrownBy(in::read) + .isInstanceOf(NotFoundException.class) + .hasCauseInstanceOf(IOException.class) + .hasMessage("Location does not exist: gs://test-bucket/path/to/data.parquet"); + } + } + + @Test + public void readMissingLocationGcsAnalyticsCoreEnabled() throws IOException { + String location = String.format("gs://%s/path/to/data.parquet", BUCKET); + fileIO.initialize( + ImmutableMap.of( + GCPProperties.GCS_ANALYTICS_CORE_ENABLED, + "true", + GCPProperties.GCS_NO_AUTH, + "true", + GCPProperties.GCS_SERVICE_HOST, + String.format("http://localhost:%d", GCS_EMULATOR_PORT))); InputFile in = fileIO.newInputFile(location); assertThatThrownBy(() -> in.newStream().read()) - .isInstanceOf(IOException.class) - .hasCauseInstanceOf(StorageException.class) - .hasMessageContaining("404 Not Found"); + .isInstanceOf(NotFoundException.class) + .hasCauseInstanceOf(IOException.class) + .hasMessage("Location does not exist: gs://test-bucket/path/to/data.parquet"); } @Test diff --git a/gcp/src/main/java/org/apache/iceberg/gcp/gcs/GCSExceptionUtil.java b/gcp/src/main/java/org/apache/iceberg/gcp/gcs/GCSExceptionUtil.java new file mode 100644 index 000000000000..681a2436e622 --- /dev/null +++ b/gcp/src/main/java/org/apache/iceberg/gcp/gcs/GCSExceptionUtil.java @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.gcp.gcs; + +import com.google.cloud.storage.BlobId; +import com.google.cloud.storage.StorageException; +import java.io.IOException; +import org.apache.iceberg.exceptions.NotFoundException; + +final class GCSExceptionUtil { + private GCSExceptionUtil() {} + + static void throwNotFoundIfNotPresent(IOException ioException, BlobId blobId) { + if (ioException.getCause() instanceof StorageException storageException + && storageException.getCode() == 404) { + throw new NotFoundException(ioException, "Location does not exist: %s", blobId.toGsUtilUri()); + } + } +} diff --git a/gcp/src/main/java/org/apache/iceberg/gcp/gcs/GCSInputFile.java b/gcp/src/main/java/org/apache/iceberg/gcp/gcs/GCSInputFile.java index 497af03bcdaa..12dc71b5a181 100644 --- a/gcp/src/main/java/org/apache/iceberg/gcp/gcs/GCSInputFile.java +++ b/gcp/src/main/java/org/apache/iceberg/gcp/gcs/GCSInputFile.java @@ -94,11 +94,11 @@ public SeekableInputStream newStream() { private SeekableInputStream newGoogleCloudStorageInputStream() throws IOException { if (null == blobSize) { return new GcsInputStreamWrapper( - GoogleCloudStorageInputStream.create(gcsFileSystem(), gcsItemId()), metrics()); + GoogleCloudStorageInputStream.create(gcsFileSystem(), gcsItemId()), blobId(), metrics()); } return new GcsInputStreamWrapper( - GoogleCloudStorageInputStream.create(gcsFileSystem(), gcsFileInfo()), metrics()); + GoogleCloudStorageInputStream.create(gcsFileSystem(), gcsFileInfo()), blobId(), metrics()); } private GcsItemId gcsItemId() { diff --git 
a/gcp/src/main/java/org/apache/iceberg/gcp/gcs/GCSInputStream.java b/gcp/src/main/java/org/apache/iceberg/gcp/gcs/GCSInputStream.java index 3b41ae21d34e..910e97e0c178 100644 --- a/gcp/src/main/java/org/apache/iceberg/gcp/gcs/GCSInputStream.java +++ b/gcp/src/main/java/org/apache/iceberg/gcp/gcs/GCSInputStream.java @@ -127,7 +127,12 @@ public int read() throws IOException { singleByteBuffer.position(0); pos += 1; - channel.read(singleByteBuffer); + try { + channel.read(singleByteBuffer); + } catch (IOException e) { + GCSExceptionUtil.throwNotFoundIfNotPresent(e, blobId); + throw e; + } readBytes.increment(); readOperations.increment(); @@ -174,7 +179,12 @@ private int read(ReadChannel readChannel, ByteBuffer buffer, int off, int len) throws IOException { buffer.position(off); buffer.limit(Math.min(off + len, buffer.capacity())); - return readChannel.read(buffer); + try { + return readChannel.read(buffer); + } catch (IOException e) { + GCSExceptionUtil.throwNotFoundIfNotPresent(e, blobId); + throw e; + } } @Override diff --git a/gcp/src/main/java/org/apache/iceberg/gcp/gcs/GcsInputStreamWrapper.java b/gcp/src/main/java/org/apache/iceberg/gcp/gcs/GcsInputStreamWrapper.java index 2e1dfdd73c08..25ba7662dd55 100644 --- a/gcp/src/main/java/org/apache/iceberg/gcp/gcs/GcsInputStreamWrapper.java +++ b/gcp/src/main/java/org/apache/iceberg/gcp/gcs/GcsInputStreamWrapper.java @@ -21,6 +21,7 @@ import com.google.api.client.util.Preconditions; import com.google.cloud.gcs.analyticscore.client.GcsObjectRange; import com.google.cloud.gcs.analyticscore.core.GoogleCloudStorageInputStream; +import com.google.cloud.storage.BlobId; import java.io.IOException; import java.nio.ByteBuffer; import java.util.List; @@ -37,10 +38,14 @@ class GcsInputStreamWrapper extends SeekableInputStream implements RangeReadable private final Counter readBytes; private final Counter readOperations; private final GoogleCloudStorageInputStream stream; + private final BlobId blobId; - 
GcsInputStreamWrapper(GoogleCloudStorageInputStream stream, MetricsContext metrics) { + GcsInputStreamWrapper( + GoogleCloudStorageInputStream stream, BlobId blobId, MetricsContext metrics) { Preconditions.checkArgument(null != stream, "Invalid input stream : null"); + Preconditions.checkArgument(null != blobId, "Invalid blobId : null"); this.stream = stream; + this.blobId = blobId; this.readBytes = metrics.counter(FileIOMetricsContext.READ_BYTES, MetricsContext.Unit.BYTES); this.readOperations = metrics.counter(FileIOMetricsContext.READ_OPERATIONS); } @@ -57,7 +62,13 @@ public void seek(long newPos) throws IOException { @Override public int read() throws IOException { - int readByte = stream.read(); + int readByte; + try { + readByte = stream.read(); + } catch (IOException e) { + GCSExceptionUtil.throwNotFoundIfNotPresent(e, blobId); + throw e; + } readBytes.increment(); readOperations.increment(); return readByte; @@ -70,7 +81,13 @@ public int read(byte[] b) throws IOException { @Override public int read(byte[] b, int off, int len) throws IOException { - int bytesRead = stream.read(b, off, len); + int bytesRead; + try { + bytesRead = stream.read(b, off, len); + } catch (IOException e) { + GCSExceptionUtil.throwNotFoundIfNotPresent(e, blobId); + throw e; + } if (bytesRead > 0) { readBytes.increment(bytesRead); } @@ -80,12 +97,22 @@ public int read(byte[] b, int off, int len) throws IOException { @Override public void readFully(long position, byte[] buffer, int offset, int length) throws IOException { - stream.readFully(position, buffer, offset, length); + try { + stream.readFully(position, buffer, offset, length); + } catch (IOException e) { + GCSExceptionUtil.throwNotFoundIfNotPresent(e, blobId); + throw e; + } } @Override public int readTail(byte[] buffer, int offset, int length) throws IOException { - return stream.readTail(buffer, offset, length); + try { + return stream.readTail(buffer, offset, length); + } catch (IOException e) { + 
GCSExceptionUtil.throwNotFoundIfNotPresent(e, blobId); + throw e; + } } @Override @@ -101,8 +128,12 @@ public void readVectored(List ranges, IntFunction allocat .setByteBufferFuture(fileRange.byteBuffer()) .build()) .collect(Collectors.toList()); - - stream.readVectored(objectRanges, allocate); + try { + stream.readVectored(objectRanges, allocate); + } catch (IOException e) { + GCSExceptionUtil.throwNotFoundIfNotPresent(e, blobId); + throw e; + } } @Override diff --git a/gcp/src/test/java/org/apache/iceberg/gcp/gcs/TestGCSInputStream.java b/gcp/src/test/java/org/apache/iceberg/gcp/gcs/TestGCSInputStream.java index f367db94264a..8cc85fad72fd 100644 --- a/gcp/src/test/java/org/apache/iceberg/gcp/gcs/TestGCSInputStream.java +++ b/gcp/src/test/java/org/apache/iceberg/gcp/gcs/TestGCSInputStream.java @@ -163,6 +163,9 @@ private void readAndCheckRanges( @Test public void testClose() throws Exception { BlobId blobId = BlobId.fromGsUtilUri("gs://bucket/path/to/closed.dat"); + byte[] data = randomData(1024 * 1024); + writeGCSData(blobId, data); + SeekableInputStream closed = new GCSInputStream(storage, blobId, null, gcpProperties, MetricsContext.nullMetrics()); closed.close(); diff --git a/gcp/src/test/java/org/apache/iceberg/gcp/gcs/TestGcsInputStreamWrapper.java b/gcp/src/test/java/org/apache/iceberg/gcp/gcs/TestGcsInputStreamWrapper.java index 2320037bd017..c6eae113d52d 100644 --- a/gcp/src/test/java/org/apache/iceberg/gcp/gcs/TestGcsInputStreamWrapper.java +++ b/gcp/src/test/java/org/apache/iceberg/gcp/gcs/TestGcsInputStreamWrapper.java @@ -20,6 +20,7 @@ import com.google.cloud.gcs.analyticscore.client.GcsObjectRange; import com.google.cloud.gcs.analyticscore.core.GoogleCloudStorageInputStream; +import com.google.cloud.storage.BlobId; import java.io.IOException; import java.nio.ByteBuffer; import java.util.List; @@ -44,7 +45,10 @@ public class TestGcsInputStreamWrapper { @BeforeEach public void before() { inputStreamWrapper = - new 
GcsInputStreamWrapper(googleCloudStorageInputStream, MetricsContext.nullMetrics()); + new GcsInputStreamWrapper( + googleCloudStorageInputStream, + BlobId.of("mockbucket", "mockname"), + MetricsContext.nullMetrics()); } @Test From 05d7ece42acccc041f711dcac0dbb03a3c127396 Mon Sep 17 00:00:00 2001 From: Szehon Ho Date: Wed, 1 Apr 2026 09:19:04 -0700 Subject: [PATCH 005/197] Spark 4.1: Control merge schema evolution by table property (#15825) * Spark: Control merge schema evolution by table property Add a new table property write.spark.auto-schema-evolution (default true) that controls whether the AUTOMATIC_SCHEMA_EVOLUTION capability is reported to Spark. When set to false, Spark's MERGE WITH SCHEMA EVOLUTION no longer evolves the target table schema. Also add a guard in SparkWriteBuilder to reject mergeSchema write option when the property is disabled. * Remove unnecessary validation from SparkWriteBuilder The capability removal in SparkTable is sufficient to control schema evolution. The mergeSchema write option path already requires accept-any-schema, making a second gate redundant. 
* Address review comments - Rename property to write.spark.auto-schema-evolution.enabled - Rename caps to tableCapabilities in computeCapabilities - Add explicit = in ALTER TABLE SET TBLPROPERTIES test SQL --- .../org/apache/iceberg/TableProperties.java | 4 ++ .../extensions/TestMergeSchemaEvolution.java | 45 +++++++++++++++++++ .../iceberg/spark/source/SparkTable.java | 32 +++++++++---- 3 files changed, 73 insertions(+), 8 deletions(-) diff --git a/core/src/main/java/org/apache/iceberg/TableProperties.java b/core/src/main/java/org/apache/iceberg/TableProperties.java index 1f778984af17..71991f633d97 100644 --- a/core/src/main/java/org/apache/iceberg/TableProperties.java +++ b/core/src/main/java/org/apache/iceberg/TableProperties.java @@ -346,6 +346,10 @@ private TableProperties() {} public static final String SPARK_WRITE_ACCEPT_ANY_SCHEMA = "write.spark.accept-any-schema"; public static final boolean SPARK_WRITE_ACCEPT_ANY_SCHEMA_DEFAULT = false; + public static final String SPARK_WRITE_AUTO_SCHEMA_EVOLUTION = + "write.spark.auto-schema-evolution.enabled"; + public static final boolean SPARK_WRITE_AUTO_SCHEMA_EVOLUTION_DEFAULT = true; + public static final String SPARK_WRITE_ADVISORY_PARTITION_SIZE_BYTES = "write.spark.advisory-partition-size-bytes"; diff --git a/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMergeSchemaEvolution.java b/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMergeSchemaEvolution.java index d760d56b7a1d..782321b588a7 100644 --- a/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMergeSchemaEvolution.java +++ b/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMergeSchemaEvolution.java @@ -19,10 +19,12 @@ package org.apache.iceberg.spark.extensions; import static org.apache.spark.sql.functions.col; +import static org.assertj.core.api.Assertions.assertThat; import static 
org.assertj.core.api.Assumptions.assumeThat; import java.util.Map; import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.TableProperties; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.junit.jupiter.api.AfterEach; @@ -260,6 +262,49 @@ public void testMergeWithSchemaEvolutionTypeWidening() { sql("SELECT id, value FROM %s ORDER BY id", selectTarget())); } + @TestTemplate + public void testMergeWithSchemaEvolutionDisabledByTableProperty() { + assumeThat(branch).as("Schema evolution does not work for branches currently").isNull(); + + createAndInitTable( + "id INT, dep STRING", + "{ \"id\": 1, \"dep\": \"hr\" }\n" + "{ \"id\": 2, \"dep\": \"software\" }"); + + sql( + "ALTER TABLE %s SET TBLPROPERTIES ('%s' = 'false')", + tableName, TableProperties.SPARK_WRITE_AUTO_SCHEMA_EVOLUTION); + + createOrReplaceView( + "source", + "id INT, dep STRING, salary INT", + "{ \"id\": 1, \"dep\": \"hr\", \"salary\": 100 }\n" + + "{ \"id\": 3, \"dep\": \"finance\", \"salary\": 300 }"); + + sql( + "MERGE WITH SCHEMA EVOLUTION INTO %s AS t USING source AS s " + + "ON t.id == s.id " + + "WHEN MATCHED THEN " + + " UPDATE SET * " + + "WHEN NOT MATCHED THEN " + + " INSERT *", + commitTarget()); + + // Schema should NOT be evolved - 'salary' column should not be added + assertThat(sql("SELECT * FROM %s", selectTarget()).get(0).length) + .as("Table should still have only 2 columns (id, dep)") + .isEqualTo(2); + + ImmutableList expectedRows = + ImmutableList.of( + row(1, "hr"), // updated without salary + row(2, "software"), // kept + row(3, "finance")); // new without salary + assertEquals( + "Should have expected rows without schema evolution", + expectedRows, + sql("SELECT id, dep FROM %s ORDER BY id", selectTarget())); + } + @Override protected Map extraTableProperties() { return Map.of(); diff --git 
a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkTable.java b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkTable.java index ae3c0ce0c8bb..07db8c4ed3fe 100644 --- a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkTable.java +++ b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkTable.java @@ -83,20 +83,14 @@ public class SparkTable extends BaseSparkTable private static final Logger LOG = LoggerFactory.getLogger(SparkTable.class); - private static final Set CAPABILITIES = + private static final Set BASE_CAPABILITIES = ImmutableSet.of( - TableCapability.AUTOMATIC_SCHEMA_EVOLUTION, TableCapability.BATCH_READ, TableCapability.BATCH_WRITE, TableCapability.MICRO_BATCH_READ, TableCapability.STREAMING_WRITE, TableCapability.OVERWRITE_BY_FILTER, TableCapability.OVERWRITE_DYNAMIC); - private static final Set CAPABILITIES_WITH_ACCEPT_ANY_SCHEMA = - ImmutableSet.builder() - .addAll(CAPABILITIES) - .add(TableCapability.ACCEPT_ANY_SCHEMA) - .build(); private final Schema schema; // effective schema (not necessarily current table schema) private final Snapshot snapshot; // always set unless table is empty @@ -133,7 +127,7 @@ private SparkTable( this.snapshot = snapshot; this.branch = branch; this.timeTravel = timeTravel; - this.capabilities = acceptAnySchema(table) ? 
CAPABILITIES_WITH_ACCEPT_ANY_SCHEMA : CAPABILITIES; + this.capabilities = computeCapabilities(table); } public SparkTable copyWithBranch(String newBranch) { @@ -353,6 +347,21 @@ private static SparkTable createWithTimestamp(Table table, AsOfTimestamp timeTra return new SparkTable(table, snapshotId, timeTravel); } + private static Set computeCapabilities(Table table) { + ImmutableSet.Builder tableCapabilities = ImmutableSet.builder(); + tableCapabilities.addAll(BASE_CAPABILITIES); + + if (autoSchemaEvolution(table)) { + tableCapabilities.add(TableCapability.AUTOMATIC_SCHEMA_EVOLUTION); + } + + if (acceptAnySchema(table)) { + tableCapabilities.add(TableCapability.ACCEPT_ANY_SCHEMA); + } + + return tableCapabilities.build(); + } + private static boolean acceptAnySchema(Table table) { return PropertyUtil.propertyAsBoolean( table.properties(), @@ -360,6 +369,13 @@ private static boolean acceptAnySchema(Table table) { TableProperties.SPARK_WRITE_ACCEPT_ANY_SCHEMA_DEFAULT); } + private static boolean autoSchemaEvolution(Table table) { + return PropertyUtil.propertyAsBoolean( + table.properties(), + TableProperties.SPARK_WRITE_AUTO_SCHEMA_EVOLUTION, + TableProperties.SPARK_WRITE_AUTO_SCHEMA_EVOLUTION_DEFAULT); + } + // returns latest snapshot for branch or current snapshot if branch is yet to be created private static Snapshot determineLatestSnapshot(Table table, String branch) { if (branch != null && table.refs().containsKey(branch)) { From 6103dab58f21ae6758cc8be8cea806e3b9970f48 Mon Sep 17 00:00:00 2001 From: Anoop Johnson Date: Wed, 1 Apr 2026 14:22:45 -0700 Subject: [PATCH 006/197] Remove v4 references from javadocs (#15851) This fixes Russell's feedback on https://github.com/apache/iceberg/pull/15049 to avoid version-specific language that will go stale. 
--- core/src/main/java/org/apache/iceberg/ManifestInfo.java | 2 +- core/src/main/java/org/apache/iceberg/TrackedFile.java | 2 +- core/src/main/java/org/apache/iceberg/Tracking.java | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/core/src/main/java/org/apache/iceberg/ManifestInfo.java b/core/src/main/java/org/apache/iceberg/ManifestInfo.java index d9a23837c456..a4651b0eadb0 100644 --- a/core/src/main/java/org/apache/iceberg/ManifestInfo.java +++ b/core/src/main/java/org/apache/iceberg/ManifestInfo.java @@ -21,7 +21,7 @@ import java.nio.ByteBuffer; import org.apache.iceberg.types.Types; -/** Summary information about a manifest referenced by a v4 root manifest entry. */ +/** Summary information about a manifest referenced by a root manifest entry. */ interface ManifestInfo { Types.NestedField ADDED_FILES_COUNT = Types.NestedField.required( diff --git a/core/src/main/java/org/apache/iceberg/TrackedFile.java b/core/src/main/java/org/apache/iceberg/TrackedFile.java index 314e79014ae5..78bb7e5288d3 100644 --- a/core/src/main/java/org/apache/iceberg/TrackedFile.java +++ b/core/src/main/java/org/apache/iceberg/TrackedFile.java @@ -25,7 +25,7 @@ import org.apache.iceberg.stats.ContentStats; import org.apache.iceberg.types.Types; -/** A content file with optional deletion vector, tracked by a v4 manifest. */ +/** A file tracked by a manifest. */ interface TrackedFile { Types.NestedField TRACKING = Types.NestedField.required( diff --git a/core/src/main/java/org/apache/iceberg/Tracking.java b/core/src/main/java/org/apache/iceberg/Tracking.java index c9467da85fdd..46b14e549a35 100644 --- a/core/src/main/java/org/apache/iceberg/Tracking.java +++ b/core/src/main/java/org/apache/iceberg/Tracking.java @@ -21,7 +21,7 @@ import java.nio.ByteBuffer; import org.apache.iceberg.types.Types; -/** Tracking information for a v4 manifest entry. */ +/** Tracking information for a manifest entry. 
*/ interface Tracking { Types.NestedField STATUS = Types.NestedField.required( From ff298a676d287f6bb065c030aca2ef66bbab4fed Mon Sep 17 00:00:00 2001 From: Alex Stephen <1325798+rambleraptor@users.noreply.github.com> Date: Wed, 1 Apr 2026 14:38:54 -0700 Subject: [PATCH 007/197] BigQuery: Fix dependency leak into runtime Jars (#15655) --- build.gradle | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/build.gradle b/build.gradle index 35e1d6a002f3..aeb613c8bade 100644 --- a/build.gradle +++ b/build.gradle @@ -707,10 +707,10 @@ project(':iceberg-bigquery') { implementation project(path: ':iceberg-bundled-guava', configuration: 'shadow') - implementation platform(libs.google.libraries.bom) + compileOnly platform(libs.google.libraries.bom) compileOnly "com.google.cloud:google-cloud-storage" - implementation "com.google.cloud:google-cloud-bigquery" - implementation "com.google.cloud:google-cloud-core" + compileOnly "com.google.cloud:google-cloud-bigquery" + compileOnly "com.google.cloud:google-cloud-core" testImplementation project(path: ':iceberg-core', configuration: 'testArtifacts') testImplementation project(path: ':iceberg-api', configuration: 'testArtifacts') From d204e5e2dff8a5a7da6aa39c4be995633d223b22 Mon Sep 17 00:00:00 2001 From: Eunbin Son <58901024+thswlsqls@users.noreply.github.com> Date: Thu, 2 Apr 2026 06:51:47 +0900 Subject: [PATCH 008/197] Spec: Fix typos and stray formatting in gcm-stream-spec and puffin-spec (#15813) --- format/gcm-stream-spec.md | 4 ++-- format/puffin-spec.md | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/format/gcm-stream-spec.md b/format/gcm-stream-spec.md index 4d241ca3ef24..8168780e118b 100644 --- a/format/gcm-stream-spec.md +++ b/format/gcm-stream-spec.md @@ -41,7 +41,7 @@ The output stream, produced by a metadata or data writer, is split into equal-si ## Encryption algorithm -AES GCM Stream uses the standard AEG GCM cipher, and supports all AES key sizes: 128, 192 and 256 bits. 
+AES GCM Stream uses the standard AES GCM cipher, and supports all AES key sizes: 128, 192 and 256 bits. AES GCM is an authenticated encryption. Besides data confidentiality (encryption), it supports two levels of integrity verification (authentication): of the data (default), and of the data combined with an optional AAD (“additional authenticated data”). An AAD is a free text to be authenticated, together with the data. The structure of AES GCM Stream AADs is described below. @@ -80,7 +80,7 @@ AES GCM Stream encrypts all blocks by the GCM cipher, without padding. The AES G ### Additional Authenticated Data -The AES GCM cipher protects against byte replacement inside a ciphertext block - but, without an AAD, it can't prevent replacement of one ciphertext block with another (encrypted with the same key). AES GCM Stream leverages AADs to protect against swapping ciphertext blocks inside a file or between files. AES GCM Stream can also protect against swapping full files - for example, replacement of a metadata file with an old version. AADs are built to reflects the identity of a file and of the blocks inside the file. +The AES GCM cipher protects against byte replacement inside a ciphertext block - but, without an AAD, it can't prevent replacement of one ciphertext block with another (encrypted with the same key). AES GCM Stream leverages AADs to protect against swapping ciphertext blocks inside a file or between files. AES GCM Stream can also protect against swapping full files - for example, replacement of a metadata file with an old version. AADs are built to reflect the identity of a file and of the blocks inside the file. AES GCM Stream constructs a block AAD from two components: an AAD prefix - a string provided by Iceberg for the file (with the file ID), and an AAD suffix - the block sequence number in the file, as an int in a 4-byte little-endian form. The block AAD is a direct concatenation of the prefix and suffix parts. 
diff --git a/format/puffin-spec.md b/format/puffin-spec.md index 06c7ad565dd7..8617ae6d8e96 100644 --- a/format/puffin-spec.md +++ b/format/puffin-spec.md @@ -188,7 +188,6 @@ codecs listed below. For maximal interoperability, other codecs are not supporte |------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| | lz4 | Single [LZ4 compression frame](https://github.com/lz4/lz4/blob/77d1b93f72628af7bbde0243b4bba9205c3138d9/doc/lz4_Frame_format.md), with content size present | | zstd | Single [Zstandard compression frame](https://github.com/facebook/zstd/blob/8af64f41161f6c2e0ba842006fe238c664a6a437/doc/zstd_compression_format.md#zstandard-frames), with content size present | -__ ### Common properties From 88d553899d0a6109a0c55cb5f6c2208106d3c268 Mon Sep 17 00:00:00 2001 From: Eunbin Son <58901024+thswlsqls@users.noreply.github.com> Date: Thu, 2 Apr 2026 06:54:27 +0900 Subject: [PATCH 009/197] Docs: Fix stale version label and missing integrations in mkdocs-dev.yml (#15810) --- site/mkdocs-dev.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/site/mkdocs-dev.yml b/site/mkdocs-dev.yml index 8891eb1a951d..eb5b34c0b274 100644 --- a/site/mkdocs-dev.yml +++ b/site/mkdocs-dev.yml @@ -30,7 +30,7 @@ nav: - Docs: - Java: - Nightly: '!include docs/docs/nightly/mkdocs.yml' - - Latest (1.10.0): '!include docs/docs/latest/mkdocs.yml' + - Latest (1.10.1): '!include docs/docs/latest/mkdocs.yml' - Other Implementations: - Python: https://py.iceberg.apache.org/ - Rust: https://rust.iceberg.apache.org/ @@ -52,6 +52,7 @@ nav: - Apache Amoro: integrations/amoro.md - Apache Doris: https://doris.apache.org/docs/dev/lakehouse/catalogs/iceberg-catalog - Apache Druid: https://druid.apache.org/docs/latest/development/extensions-contrib/iceberg/ + - Apache Fluss: 
https://fluss.apache.org/docs/next/streaming-lakehouse/integrate-data-lakes/iceberg/ - BladePipe: https://www.bladepipe.com/docs/dataMigrationAndSync/datasource_func/Iceberg/props_for_iceberg_ds - ClickHouse: https://clickhouse.com/docs/en/engines/table-engines/integrations/iceberg - Daft: integrations/daft.md @@ -63,6 +64,7 @@ nav: - Google BigQuery: https://cloud.google.com/bigquery/docs/iceberg-tables - Impala: https://impala.apache.org/docs/build/html/topics/impala_iceberg.html - Memiiso Debezium: https://memiiso.github.io/debezium-server-iceberg/ + - Microsoft OneLake: https://aka.ms/onelakeircdocs - Nimtable: https://github.com/nimtable/nimtable - OLake: https://olake.io/docs - Presto: https://prestodb.io/docs/current/connector/iceberg.html From 245637a62fd1fac1936c43bcb39cb72872f58768 Mon Sep 17 00:00:00 2001 From: Russell Spitzer Date: Wed, 1 Apr 2026 18:43:16 -0500 Subject: [PATCH 010/197] Build: Add runtime dependency guard for bundled artifacts (#15855) Adds a build-time check that prevents accidental transitive dependency leaks into shipped shadow JARs and distribution archives. A checked-in runtime-deps.txt baseline lists every dependency resolved into each bundled artifact. checkRuntimeDeps compares resolved deps against the baseline and fails the build with a clear diff on mismatch, wired into the check lifecycle so it runs in CI automatically. This guards all 11 bundled modules: Spark runtime (3.4, 3.5, 4.0, 4.1), Flink runtime (1.20, 2.0, 2.1), cloud bundles (AWS, Azure, GCP), and Kafka Connect runtime. 
--- .github/workflows/java-ci.yml | 13 ++++ aws-bundle/build.gradle | 2 + azure-bundle/build.gradle | 2 + build.gradle | 9 +++ dev/.rat-excludes | 1 + flink/v1.20/build.gradle | 2 + flink/v2.0/build.gradle | 2 + flink/v2.1/build.gradle | 2 + gcp-bundle/build.gradle | 2 + kafka-connect/build.gradle | 2 + runtime-deps.gradle | 130 ++++++++++++++++++++++++++++++++++ spark/v3.4/build.gradle | 2 + spark/v3.5/build.gradle | 2 + spark/v4.0/build.gradle | 2 + spark/v4.1/build.gradle | 2 + 15 files changed, 175 insertions(+) create mode 100644 runtime-deps.gradle diff --git a/.github/workflows/java-ci.yml b/.github/workflows/java-ci.yml index 4ef0a30b8225..e8ac497ab04a 100644 --- a/.github/workflows/java-ci.yml +++ b/.github/workflows/java-ci.yml @@ -126,3 +126,16 @@ jobs: java-version: ${{ matrix.jvm }} - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 # zizmor: ignore[cache-poisoning] -- cache writes are restricted to the default branch by setup-gradle - run: ./gradlew -Pquick=true javadoc + + check-runtime-deps: + runs-on: ubuntu-24.04 + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + with: + persist-credentials: false + - uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5 + with: + distribution: zulu + java-version: 17 + - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 # zizmor: ignore[cache-poisoning] -- cache writes are restricted to the default branch by setup-gradle + - run: ./gradlew checkAllRuntimeDeps -q diff --git a/aws-bundle/build.gradle b/aws-bundle/build.gradle index 5b9054812a50..c891ac5b439c 100644 --- a/aws-bundle/build.gradle +++ b/aws-bundle/build.gradle @@ -66,4 +66,6 @@ project(":iceberg-aws-bundle") { jar { enabled = false } + + apply from: "${rootDir}/runtime-deps.gradle" } diff --git a/azure-bundle/build.gradle b/azure-bundle/build.gradle index 0bdc30fdaa7e..dad563b67ab7 100644 --- a/azure-bundle/build.gradle +++ 
b/azure-bundle/build.gradle @@ -52,4 +52,6 @@ project(":iceberg-azure-bundle") { jar { enabled = false } + + apply from: "${rootDir}/runtime-deps.gradle" } diff --git a/build.gradle b/build.gradle index aeb613c8bade..d5f25bc77154 100644 --- a/build.gradle +++ b/build.gradle @@ -120,6 +120,15 @@ allprojects { } } +tasks.register('checkAllRuntimeDeps') { + description = 'Validates runtime dependency baselines for all subprojects that have them' + group = 'verification' + + dependsOn subprojects.collect { subproject -> + subproject.tasks.matching { it.name == 'checkRuntimeDeps' } + } +} + subprojects { if (it.name == 'iceberg-bom') { // the BOM does not build anything, the code below expects "source code" diff --git a/dev/.rat-excludes b/dev/.rat-excludes index 52a800723598..f94ef5bf8988 100644 --- a/dev/.rat-excludes +++ b/dev/.rat-excludes @@ -29,3 +29,4 @@ sitemap.xml .python-version **/*_index.md **/.venv/** +**/runtime-deps.txt diff --git a/flink/v1.20/build.gradle b/flink/v1.20/build.gradle index 3591bf37b1a7..772133c8e1d8 100644 --- a/flink/v1.20/build.gradle +++ b/flink/v1.20/build.gradle @@ -266,4 +266,6 @@ project(":iceberg-flink:iceberg-flink-runtime-${flinkMajorVersion}") { jar { enabled = false } + + apply from: "${rootDir}/runtime-deps.gradle" } diff --git a/flink/v2.0/build.gradle b/flink/v2.0/build.gradle index 5907f41b3544..b276cb90dd24 100644 --- a/flink/v2.0/build.gradle +++ b/flink/v2.0/build.gradle @@ -266,4 +266,6 @@ project(":iceberg-flink:iceberg-flink-runtime-${flinkMajorVersion}") { jar { enabled = false } + + apply from: "${rootDir}/runtime-deps.gradle" } diff --git a/flink/v2.1/build.gradle b/flink/v2.1/build.gradle index 91081bdc2e42..a08cb1d5ebdd 100644 --- a/flink/v2.1/build.gradle +++ b/flink/v2.1/build.gradle @@ -266,4 +266,6 @@ project(":iceberg-flink:iceberg-flink-runtime-${flinkMajorVersion}") { jar { enabled = false } + + apply from: "${rootDir}/runtime-deps.gradle" } diff --git a/gcp-bundle/build.gradle b/gcp-bundle/build.gradle 
index 6ebe05ccdbce..1f6642c9b2ce 100644 --- a/gcp-bundle/build.gradle +++ b/gcp-bundle/build.gradle @@ -59,4 +59,6 @@ project(":iceberg-gcp-bundle") { jar { enabled = false } + + apply from: "${rootDir}/runtime-deps.gradle" } diff --git a/kafka-connect/build.gradle b/kafka-connect/build.gradle index e93ad8641f79..009ae719bac5 100644 --- a/kafka-connect/build.gradle +++ b/kafka-connect/build.gradle @@ -254,6 +254,8 @@ project(':iceberg-kafka-connect:iceberg-kafka-connect-runtime') { check.dependsOn integrationTest assemble.dependsOn distZip, hiveDistZip + + apply from: "${rootDir}/runtime-deps.gradle" } project(':iceberg-kafka-connect:iceberg-kafka-connect-transforms') { diff --git a/runtime-deps.gradle b/runtime-deps.gradle new file mode 100644 index 000000000000..e08fce3fe47a --- /dev/null +++ b/runtime-deps.gradle @@ -0,0 +1,130 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +// Guards the runtime dependency surface for shadow JAR modules. +// +// Prevents accidental transitive dependency growth in shipped shadow JARs. 
+// Without this guard, adding a single catalog module as 'implementation' +// instead of 'compileOnly' can silently leak dozens of transitive artifacts +// into the runtime JAR, inflating its size and introducing unlicensed code. +// +// Apply this script in any project that ships a bundled artifact: Spark and +// Flink runtime shadow JARs, cloud bundles (aws, azure, gcp), and Kafka +// Connect runtime distribution. +// +// It adds two tasks: +// +// generateRuntimeDeps - resolves runtimeClasspath and writes a sorted +// baseline of group:artifact:version coordinates +// to runtime-deps.txt in the project directory. +// +// checkRuntimeDeps - compares the resolved dependencies against the +// checked-in baseline and fails with a diff if +// they don't match. Patch-level version changes are +// ignored so that routine Dependabot bumps don't +// require a baseline update. Wired into the 'check' +// lifecycle. +// +// Workflow: +// 1. ./gradlew check -- fails if deps changed +// 2. ./gradlew generateRuntimeDeps -- auto-updates all baselines +// 3. Update LICENSE and NOTICE if dependency licenses changed -- This is a Manual Step +// 4. 
Commit + +def depsFile = file("${projectDir}/runtime-deps.txt") + +def resolveRuntimeDeps = { + configurations.runtimeClasspath.resolvedConfiguration + .resolvedArtifacts + .collect { "${it.moduleVersion.id.group}:${it.moduleVersion.id.name}:${it.moduleVersion.id.version}" } + .findAll { !it.startsWith('org.apache.iceberg:') } + .toSorted() + .toUnique() +} + +tasks.register('generateRuntimeDeps') { + group = 'verification' + description = 'Regenerate the runtime dependency baseline after intentional dependency changes' + outputs.file(depsFile) + doLast { + def deps = resolveRuntimeDeps() + depsFile.text = deps.join('\n') + '\n' + logger.lifecycle("Wrote ${deps.size()} dependencies to ${depsFile}") + logger.lifecycle("Review the diff, then update LICENSE and NOTICE if licenses changed.") + } +} + +tasks.register('checkRuntimeDeps') { + group = 'verification' + description = 'Verify runtime dependencies match the checked-in baseline' + inputs.files(configurations.runtimeClasspath) + outputs.file(depsFile) + doLast { + if (!depsFile.exists()) { + logger.warn("WARNING: Missing ${depsFile.name} in ${projectDir}. " + + "Run: ./gradlew ${project.path}:generateRuntimeDeps") + return + } + + def actual = resolveRuntimeDeps() + def expected = depsFile.readLines().findAll { it.trim() }.toSorted() + + def groupArtifact = { coord -> coord.substring(0, coord.lastIndexOf(':')) } + def majorMinor = { coord -> + def ver = coord.substring(coord.lastIndexOf(':') + 1) + def parts = ver.split('\\.') + parts.length >= 2 ? 
"${parts[0]}.${parts[1]}" : ver + } + + def actualByModule = actual.collectEntries { [(groupArtifact(it)): it] } + def expectedByModule = expected.collectEntries { [(groupArtifact(it)): it] } + + def added = actualByModule.keySet() - expectedByModule.keySet() + def removed = expectedByModule.keySet() - actualByModule.keySet() + def shared = actualByModule.keySet().intersect(expectedByModule.keySet()) + def versionChanged = shared.findAll { + majorMinor(actualByModule[it]) != majorMinor(expectedByModule[it]) + } + + if (added || removed || versionChanged) { + def msg = new StringBuilder() + msg.append("Runtime dependency baseline mismatch for ${project.name}!\n") + if (versionChanged) { + msg.append("\n Version changed (${versionChanged.size()}):\n") + versionChanged.toSorted().each { module -> + msg.append(" ~ ${expectedByModule[module]} -> ${actualByModule[module]}\n") + } + } + if (added) { + msg.append("\n Added (${added.size()}):\n") + added.toSorted().each { module -> msg.append(" + ${actualByModule[module]}\n") } + } + if (removed) { + msg.append("\n Removed (${removed.size()}):\n") + removed.toSorted().each { module -> msg.append(" - ${expectedByModule[module]}\n") } + } + msg.append("\nTo update the baseline run:\n") + msg.append(" ./gradlew ${project.path}:generateRuntimeDeps\n") + msg.append("\nThen update LICENSE and NOTICE to reflect the dependency changes.") + throw new GradleException(msg.toString()) + } + } +} + +check.dependsOn checkRuntimeDeps diff --git a/spark/v3.4/build.gradle b/spark/v3.4/build.gradle index bfe84b08dfa4..bb8270e3d303 100644 --- a/spark/v3.4/build.gradle +++ b/spark/v3.4/build.gradle @@ -333,5 +333,7 @@ project(":iceberg-spark:iceberg-spark-runtime-${sparkMajorVersion}_${scalaVersio jar { enabled = false } + + apply from: "${rootDir}/runtime-deps.gradle" } diff --git a/spark/v3.5/build.gradle b/spark/v3.5/build.gradle index 2fe3deb0ce5e..18fca51be251 100644 --- a/spark/v3.5/build.gradle +++ b/spark/v3.5/build.gradle @@ -335,5 
+335,7 @@ project(":iceberg-spark:iceberg-spark-runtime-${sparkMajorVersion}_${scalaVersio jar { enabled = false } + + apply from: "${rootDir}/runtime-deps.gradle" } diff --git a/spark/v4.0/build.gradle b/spark/v4.0/build.gradle index acc4d7529f37..62111e104e26 100644 --- a/spark/v4.0/build.gradle +++ b/spark/v4.0/build.gradle @@ -335,5 +335,7 @@ project(":iceberg-spark:iceberg-spark-runtime-${sparkMajorVersion}_${scalaVersio jar { enabled = false } + + apply from: "${rootDir}/runtime-deps.gradle" } diff --git a/spark/v4.1/build.gradle b/spark/v4.1/build.gradle index 6a46cb4b2063..355a85ab81a9 100644 --- a/spark/v4.1/build.gradle +++ b/spark/v4.1/build.gradle @@ -335,5 +335,7 @@ project(":iceberg-spark:iceberg-spark-runtime-${sparkMajorVersion}_${scalaVersio jar { enabled = false } + + apply from: "${rootDir}/runtime-deps.gradle" } From e8b619148b557c5d72522e7f24bdadc9170eabfe Mon Sep 17 00:00:00 2001 From: Ryan Blue Date: Wed, 1 Apr 2026 19:44:39 -0700 Subject: [PATCH 011/197] Aliyun: Remove leaked transitive dependencies. 
(#15858) --- build.gradle | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/build.gradle b/build.gradle index d5f25bc77154..1ff423618378 100644 --- a/build.gradle +++ b/build.gradle @@ -468,8 +468,8 @@ project(':iceberg-aliyun') { implementation project(':iceberg-common') compileOnly libs.aliyun.sdk.oss - implementation libs.aliyun.credentials.java - implementation libs.aliyun.tea + compileOnly libs.aliyun.credentials.java + compileOnly libs.aliyun.tea compileOnly libs.jaxb.api compileOnly libs.activation compileOnly libs.jaxb.runtime From 3e22f850bd21f3f8f4ecb340a5d4ffe468fb1ced Mon Sep 17 00:00:00 2001 From: Atsuo Yamaguchi Date: Wed, 1 Apr 2026 21:32:29 -0700 Subject: [PATCH 012/197] Docs: Fix missing semicolons in Java API Quickstart imports (#15864) --- docs/docs/java-api-quickstart.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/docs/java-api-quickstart.md b/docs/docs/java-api-quickstart.md index 430450fc87c2..31f97bd441c8 100644 --- a/docs/docs/java-api-quickstart.md +++ b/docs/docs/java-api-quickstart.md @@ -31,8 +31,8 @@ You can initialize a Hive catalog with a name and some properties. 
(see: [Catalog properties](configuration.md#catalog-properties)) ```java -import java.util.HashMap -import java.util.Map +import java.util.HashMap; +import java.util.Map; import org.apache.iceberg.hive.HiveCatalog; From 1a2a8a5ecc701629183405ca842f2d1eb23505ee Mon Sep 17 00:00:00 2001 From: jbewing Date: Thu, 2 Apr 2026 12:54:55 -0400 Subject: [PATCH 013/197] Spark (4.0, 3.5): Set data file sort_order_id in manifest for writes from Spark (#15832) --- .../extensions/TestCopyOnWriteDelete.java | 19 +++ .../extensions/TestCopyOnWriteMerge.java | 28 +++++ .../extensions/TestCopyOnWriteUpdate.java | 19 +++ .../extensions/TestMergeOnReadMerge.java | 29 +++++ .../extensions/TestMergeOnReadUpdate.java | 20 +++ .../apache/iceberg/spark/SparkWriteConf.java | 20 +++ .../iceberg/spark/SparkWriteOptions.java | 1 + .../SparkShufflingFileRewriteRunner.java | 16 +++ .../spark/source/SparkPositionDeltaWrite.java | 11 +- .../iceberg/spark/source/SparkWrite.java | 10 +- .../iceberg/spark/TestSparkWriteConf.java | 46 +++++++ .../actions/TestRewriteDataFilesAction.java | 57 ++++++++- .../spark/source/TestSparkDataWrite.java | 115 ++++++++++++++++++ .../spark/source/TestStructuredStreaming.java | 48 ++++++++ .../extensions/TestCopyOnWriteDelete.java | 19 +++ .../extensions/TestCopyOnWriteMerge.java | 28 +++++ .../extensions/TestCopyOnWriteUpdate.java | 19 +++ .../extensions/TestMergeOnReadMerge.java | 29 +++++ .../extensions/TestMergeOnReadUpdate.java | 20 +++ .../apache/iceberg/spark/SparkWriteConf.java | 20 +++ .../iceberg/spark/SparkWriteOptions.java | 1 + .../SparkShufflingFileRewriteRunner.java | 16 +++ .../spark/source/SparkPositionDeltaWrite.java | 11 +- .../iceberg/spark/source/SparkWrite.java | 10 +- .../iceberg/spark/TestSparkWriteConf.java | 46 +++++++ .../actions/TestRewriteDataFilesAction.java | 57 ++++++++- .../spark/source/TestSparkDataWrite.java | 115 ++++++++++++++++++ .../spark/source/TestStructuredStreaming.java | 48 ++++++++ 28 files changed, 860 insertions(+), 18 
deletions(-) diff --git a/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteDelete.java b/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteDelete.java index f7ded0c4d7d2..d39dff060c9a 100644 --- a/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteDelete.java +++ b/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteDelete.java @@ -162,6 +162,25 @@ public synchronized void testDeleteWithConcurrentTableRefresh() throws Exception assertThat(executorService.awaitTermination(2, TimeUnit.MINUTES)).as("Timeout").isTrue(); } + @TestTemplate + public void testCopyOnWriteDeleteSetsSortOrderIdOnRewrittenDataFiles() { + createAndInitTable( + "id INT, dep STRING", + "PARTITIONED BY (dep)", + "{ \"id\": 1, \"dep\": \"hr\" }\n" + "{ \"id\": 2, \"dep\": \"hr\" }"); + + sql("ALTER TABLE %s WRITE ORDERED BY id", tableName); + + sql("DELETE FROM %s WHERE id = 1", commitTarget()); + + Table table = validationCatalog.loadTable(tableIdent); + Snapshot snapshot = SnapshotUtil.latestSnapshot(table, branch); + assertThat(snapshot.addedDataFiles(table.io())) + .extracting(DataFile::sortOrderId) + .as("Rewritten data files should carry the table sort order id") + .containsOnly(table.sortOrder().orderId()); + } + @TestTemplate public void testRuntimeFilteringWithPreservedDataGrouping() throws NoSuchTableException { createAndInitPartitionedTable(); diff --git a/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteMerge.java b/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteMerge.java index fef8b28c689a..394dbbda1a3d 100644 --- a/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteMerge.java +++ b/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteMerge.java @@ 
-151,6 +151,34 @@ public synchronized void testMergeWithConcurrentTableRefresh() throws Exception assertThat(executorService.awaitTermination(2, TimeUnit.MINUTES)).as("Timeout").isTrue(); } + @TestTemplate + public void testCopyOnWriteMergeSetsSortOrderIdOnRewrittenDataFiles() { + createAndInitTable("id INT, dep STRING"); + sql("ALTER TABLE %s ADD PARTITION FIELD dep", tableName); + sql("ALTER TABLE %s WRITE ORDERED BY id", tableName); + + append(tableName, "{ \"id\": 1, \"dep\": \"hr\" }\n" + "{ \"id\": 2, \"dep\": \"hr\" }"); + createBranchIfNeeded(); + + createOrReplaceView("source", Collections.singletonList(1), Encoders.INT()); + + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.value " + + "WHEN MATCHED THEN " + + " UPDATE SET dep = 'changed' " + + "WHEN NOT MATCHED THEN " + + " INSERT (id, dep) VALUES (s.value, 'new')", + commitTarget()); + + Table table = validationCatalog.loadTable(tableIdent); + Snapshot snapshot = SnapshotUtil.latestSnapshot(table, branch); + assertThat(snapshot.addedDataFiles(table.io())) + .extracting(DataFile::sortOrderId) + .as("Rewritten data files should carry the table sort order id") + .containsOnly(table.sortOrder().orderId()); + } + @TestTemplate public void testRuntimeFilteringWithReportedPartitioning() { createAndInitTable("id INT, dep STRING"); diff --git a/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteUpdate.java b/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteUpdate.java index 21d1377b2b98..b547218acbd4 100644 --- a/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteUpdate.java +++ b/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteUpdate.java @@ -149,6 +149,25 @@ public synchronized void testUpdateWithConcurrentTableRefresh() throws Exception assertThat(executorService.awaitTermination(2, TimeUnit.MINUTES)).as("Timeout").isTrue(); } + 
@TestTemplate + public void testCopyOnWriteUpdateSetsSortOrderIdOnRewrittenDataFiles() { + createAndInitTable( + "id INT, dep STRING", + "PARTITIONED BY (dep)", + "{ \"id\": 1, \"dep\": \"hr\" }\n" + "{ \"id\": 2, \"dep\": \"hr\" }"); + + sql("ALTER TABLE %s WRITE ORDERED BY id", tableName); + + sql("UPDATE %s SET dep = 'changed' WHERE id = 1", commitTarget()); + + Table table = validationCatalog.loadTable(tableIdent); + Snapshot snapshot = SnapshotUtil.latestSnapshot(table, branch); + assertThat(snapshot.addedDataFiles(table.io())) + .extracting(DataFile::sortOrderId) + .as("Rewritten data files should carry the table sort order id") + .containsOnly(table.sortOrder().orderId()); + } + @TestTemplate public void testRuntimeFilteringWithReportedPartitioning() { createAndInitTable("id INT, dep STRING"); diff --git a/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMergeOnReadMerge.java b/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMergeOnReadMerge.java index 737f19e86a95..9a42b58e3434 100644 --- a/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMergeOnReadMerge.java +++ b/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMergeOnReadMerge.java @@ -26,6 +26,7 @@ import java.util.Set; import java.util.stream.Collectors; import java.util.stream.IntStream; +import org.apache.iceberg.DataFile; import org.apache.iceberg.DeleteFile; import org.apache.iceberg.FileFormat; import org.apache.iceberg.ParameterizedTestExtension; @@ -136,6 +137,34 @@ public void testMergeWithDVAndHistoricalPositionDeletes() { assertThat(dvs).allMatch(dv -> FileFormat.fromFileName(dv.location()) == FileFormat.PUFFIN); } + @TestTemplate + public void testMergeOnReadMergeSetsSortOrderIdOnNewDataFiles() { + createAndInitTable( + "id INT, dep STRING", + "PARTITIONED BY (dep)", + "{ \"id\": 1, \"dep\": \"hr\" }\n" + "{ \"id\": 2, \"dep\": \"hr\" }"); + + sql("ALTER TABLE 
%s WRITE ORDERED BY id", tableName); + + createOrReplaceView("source", ImmutableList.of(1, 3), Encoders.INT()); + + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.value " + + "WHEN MATCHED THEN " + + " UPDATE SET id = id + 10 " + + "WHEN NOT MATCHED THEN " + + " INSERT (id, dep) VALUES (s.value, 'hr')", + commitTarget()); + + Table table = validationCatalog.loadTable(tableIdent); + Snapshot snapshot = SnapshotUtil.latestSnapshot(table, branch); + assertThat(snapshot.addedDataFiles(table.io())) + .extracting(DataFile::sortOrderId) + .as("All new data files should carry the table sort order id") + .containsOnly(table.sortOrder().orderId()); + } + private void checkMergeDeleteGranularity(DeleteGranularity deleteGranularity) { createTableWithDeleteGranularity( "id INT, dep STRING", "PARTITIONED BY (dep)", deleteGranularity); diff --git a/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMergeOnReadUpdate.java b/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMergeOnReadUpdate.java index 2398bc45b19b..d1c336d5ddeb 100644 --- a/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMergeOnReadUpdate.java +++ b/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMergeOnReadUpdate.java @@ -25,6 +25,7 @@ import java.util.Map; import java.util.Set; import java.util.stream.Collectors; +import org.apache.iceberg.DataFile; import org.apache.iceberg.DeleteFile; import org.apache.iceberg.FileFormat; import org.apache.iceberg.ParameterizedTestExtension; @@ -224,6 +225,25 @@ public void testUpdateWithDVAndHistoricalPositionDeletes() { assertThat(dvs).allMatch(dv -> FileFormat.fromFileName(dv.location()) == FileFormat.PUFFIN); } + @TestTemplate + public void testMergeOnReadUpdateSetsSortOrderIdOnNewDataFiles() { + createAndInitTable( + "id INT, dep STRING", + "PARTITIONED BY (dep)", + "{ \"id\": 1, \"dep\": \"hr\" }\n" + "{ \"id\": 2, \"dep\": 
\"hr\" }"); + + sql("ALTER TABLE %s WRITE ORDERED BY id", tableName); + + sql("UPDATE %s SET id = id + 10 WHERE id = 1", commitTarget()); + + Table table = validationCatalog.loadTable(tableIdent); + Snapshot snapshot = SnapshotUtil.latestSnapshot(table, branch); + assertThat(snapshot.addedDataFiles(table.io())) + .extracting(DataFile::sortOrderId) + .as("All new data files should carry the table sort order id") + .containsOnly(table.sortOrder().orderId()); + } + private void initTable(String partitionedBy, DeleteGranularity deleteGranularity) { createTableWithDeleteGranularity("id INT, dep STRING", partitionedBy, deleteGranularity); diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/SparkWriteConf.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/SparkWriteConf.java index b3e8af5fe056..9da48ae51e5c 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/SparkWriteConf.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/SparkWriteConf.java @@ -42,6 +42,7 @@ import org.apache.iceberg.FileFormat; import org.apache.iceberg.IsolationLevel; import org.apache.iceberg.SnapshotSummary; +import org.apache.iceberg.SortOrder; import org.apache.iceberg.Table; import org.apache.iceberg.TableProperties; import org.apache.iceberg.TableUtil; @@ -162,6 +163,25 @@ public int outputSpecId() { return outputSpecId; } + public int outputSortOrderId(SparkWriteRequirements writeRequirements) { + Integer explicitId = + confParser.intConf().option(SparkWriteOptions.OUTPUT_SORT_ORDER_ID).parseOptional(); + + if (explicitId != null) { + Preconditions.checkArgument( + table.sortOrders().containsKey(explicitId), + "Cannot use output sort order id %s because the table does not contain a sort order with that id", + explicitId); + return explicitId; + } + + if (writeRequirements.hasOrdering()) { + return table.sortOrder().orderId(); + } + + return SortOrder.unsorted().orderId(); + } + public FileFormat dataFileFormat() { String 
valueAsString = confParser diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/SparkWriteOptions.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/SparkWriteOptions.java index 33db70bae587..1be02feaf0c0 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/SparkWriteOptions.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/SparkWriteOptions.java @@ -54,6 +54,7 @@ private SparkWriteOptions() {} public static final String REWRITTEN_FILE_SCAN_TASK_SET_ID = "rewritten-file-scan-task-set-id"; public static final String OUTPUT_SPEC_ID = "output-spec-id"; + public static final String OUTPUT_SORT_ORDER_ID = "output-sort-order-id"; public static final String OVERWRITE_MODE = "overwrite-mode"; diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkShufflingFileRewriteRunner.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkShufflingFileRewriteRunner.java index b1c5a5c0901a..346abaee5e63 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkShufflingFileRewriteRunner.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkShufflingFileRewriteRunner.java @@ -47,10 +47,14 @@ import org.apache.spark.sql.connector.expressions.SortOrder; import org.apache.spark.sql.connector.write.RequiresDistributionAndOrdering; import org.apache.spark.sql.execution.datasources.v2.DistributionAndOrderingUtils$; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import scala.Option; abstract class SparkShufflingFileRewriteRunner extends SparkDataFileRewriteRunner { + private static final Logger LOG = LoggerFactory.getLogger(SparkShufflingFileRewriteRunner.class); + /** * The number of shuffle partitions to use for each output file. By default, this file rewriter * assumes each shuffle partition would become a separate output file. 
Attempting to generate @@ -119,6 +123,17 @@ public void doRewrite(String groupId, RewriteFileGroup fileGroup) { spec(fileGroup.outputSpecId()), fileGroup.expectedOutputFiles())); + org.apache.iceberg.SortOrder sortOrderInJobSpec = sortOrder(); + + org.apache.iceberg.SortOrder maybeMatchingTableSortOrder = + SortOrderUtil.findTableSortOrder(table(), sortOrder()); + + if (sortOrderInJobSpec.isSorted() && maybeMatchingTableSortOrder.isUnsorted()) { + LOG.warn( + "Sort order specified for job {} doesn't match any table sort orders, rewritten files will not be marked as sorted in the manifest files", + Spark3Util.describe(sortOrderInJobSpec)); + } + sortedDF .write() .format("iceberg") @@ -126,6 +141,7 @@ public void doRewrite(String groupId, RewriteFileGroup fileGroup) { .option(SparkWriteOptions.TARGET_FILE_SIZE_BYTES, fileGroup.maxOutputFileSize()) .option(SparkWriteOptions.USE_TABLE_DISTRIBUTION_AND_ORDERING, "false") .option(SparkWriteOptions.OUTPUT_SPEC_ID, fileGroup.outputSpecId()) + .option(SparkWriteOptions.OUTPUT_SORT_ORDER_ID, maybeMatchingTableSortOrder.orderId()) .mode("append") .save(groupId); } diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkPositionDeltaWrite.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkPositionDeltaWrite.java index ddad1a749aa9..f926bd96389a 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkPositionDeltaWrite.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkPositionDeltaWrite.java @@ -110,6 +110,7 @@ class SparkPositionDeltaWrite implements DeltaWrite, RequiresDistributionAndOrde private final String branch; private final Map extraSnapshotMetadata; private final SparkWriteRequirements writeRequirements; + private final int sortOrderId; private final Context context; private final Map writeProperties; @@ -135,6 +136,7 @@ class SparkPositionDeltaWrite implements DeltaWrite, RequiresDistributionAndOrde this.branch = 
writeConf.branch(); this.extraSnapshotMetadata = writeConf.extraSnapshotMetadata(); this.writeRequirements = writeConf.positionDeltaRequirements(command); + this.sortOrderId = writeConf.outputSortOrderId(writeRequirements); this.context = new Context(dataSchema, writeConf, info, writeRequirements); this.writeProperties = writeConf.writeProperties(); } @@ -180,7 +182,8 @@ public DeltaWriterFactory createBatchWriterFactory(PhysicalWriteInfo info) { broadcastRewritableDeletes(), command, context, - writeProperties); + writeProperties, + sortOrderId); } private Broadcast> broadcastRewritableDeletes() { @@ -390,18 +393,21 @@ private static class PositionDeltaWriteFactory implements DeltaWriterFactory { private final Command command; private final Context context; private final Map writeProperties; + private final int sortOrderId; PositionDeltaWriteFactory( Broadcast tableBroadcast, Broadcast> rewritableDeletesBroadcast, Command command, Context context, - Map writeProperties) { + Map writeProperties, + int sortOrderId) { this.tableBroadcast = tableBroadcast; this.rewritableDeletesBroadcast = rewritableDeletesBroadcast; this.command = command; this.context = context; this.writeProperties = writeProperties; + this.sortOrderId = sortOrderId; } @Override @@ -428,6 +434,7 @@ public DeltaWriter createWriter(int partitionId, long taskId) { .deleteFileFormat(context.deleteFileFormat()) .positionDeleteSparkType(context.deleteSparkType()) .writeProperties(writeProperties) + .dataSortOrder(table.sortOrders().get(sortOrderId)) .build(); if (command == DELETE) { diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkWrite.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkWrite.java index 15c70e4a6621..aff8864b6d2a 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkWrite.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkWrite.java @@ -193,6 +193,7 @@ private WriterFactory 
createWriterFactory() { // broadcast the table metadata as the writer factory will be sent to executors Broadcast
tableBroadcast = sparkContext.broadcast(SerializableTableWithSize.copyOf(table)); + int sortOrderId = writeConf.outputSortOrderId(writeRequirements); return new WriterFactory( tableBroadcast, queryId, @@ -202,7 +203,8 @@ private WriterFactory createWriterFactory() { writeSchema, dsSchema, useFanoutWriter, - writeProperties); + writeProperties, + sortOrderId); } private void commitOperation(SnapshotUpdate operation, String description) { @@ -672,6 +674,7 @@ private static class WriterFactory implements DataWriterFactory, StreamingDataWr private final boolean useFanoutWriter; private final String queryId; private final Map writeProperties; + private final int sortOrderId; protected WriterFactory( Broadcast
tableBroadcast, @@ -682,7 +685,8 @@ protected WriterFactory( Schema writeSchema, StructType dsSchema, boolean useFanoutWriter, - Map writeProperties) { + Map writeProperties, + int sortOrderId) { this.tableBroadcast = tableBroadcast; this.format = format; this.outputSpecId = outputSpecId; @@ -692,6 +696,7 @@ protected WriterFactory( this.useFanoutWriter = useFanoutWriter; this.queryId = queryId; this.writeProperties = writeProperties; + this.sortOrderId = sortOrderId; } @Override @@ -716,6 +721,7 @@ public DataWriter createWriter(int partitionId, long taskId, long e .dataSchema(writeSchema) .dataSparkType(dsSchema) .writeProperties(writeProperties) + .dataSortOrder(table.sortOrders().get(sortOrderId)) .build(); if (spec.isUnpartitioned()) { diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/TestSparkWriteConf.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/TestSparkWriteConf.java index a9b5d1a237b4..89daf195ca73 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/TestSparkWriteConf.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/TestSparkWriteConf.java @@ -45,6 +45,7 @@ import static org.apache.spark.sql.connector.write.RowLevelOperation.Command.MERGE; import static org.apache.spark.sql.connector.write.RowLevelOperation.Command.UPDATE; import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatIllegalArgumentException; import static org.assertj.core.api.Assertions.assertThatThrownBy; import java.time.Duration; @@ -552,6 +553,51 @@ public void testDVWriteConf() { assertThat(writeConf.deleteFileFormat()).isEqualTo(FileFormat.PUFFIN); } + @TestTemplate + public void testSortOrderWriteConf() { + Table table = validationCatalog.loadTable(tableIdent); + + table.replaceSortOrder().asc("id").commit(); + + SparkWriteConf writeConf = + new SparkWriteConf( + spark, table, ImmutableMap.of(SparkWriteOptions.OUTPUT_SORT_ORDER_ID, "1")); + + 
assertThat(writeConf.outputSortOrderId(SparkWriteRequirements.EMPTY)) + .isEqualTo(table.sortOrder().orderId()); + } + + @TestTemplate + public void testSortOrderWriteConfWithInvalidId() { + Table table = validationCatalog.loadTable(tableIdent); + + table.replaceSortOrder().asc("id").commit(); + + SparkWriteConf writeConfForUnknownSortOrder = + new SparkWriteConf( + spark, table, ImmutableMap.of(SparkWriteOptions.OUTPUT_SORT_ORDER_ID, "999")); + + assertThatIllegalArgumentException() + .isThrownBy( + () -> writeConfForUnknownSortOrder.outputSortOrderId(SparkWriteRequirements.EMPTY)) + .withMessage( + "Cannot use output sort order id 999 because the table does not contain a sort order with that id"); + } + + @TestTemplate + public void testSortOrderWriteConfWithNoOption() { + Table table = validationCatalog.loadTable(tableIdent); + + table.replaceSortOrder().asc("id").commit(); + + SparkWriteConf writeConfNoOption = new SparkWriteConf(spark, table, ImmutableMap.of()); + + assertThat(writeConfNoOption.outputSortOrderId(writeConfNoOption.writeRequirements())) + .isEqualTo(table.sortOrder().orderId()); + + assertThat(writeConfNoOption.outputSortOrderId(SparkWriteRequirements.EMPTY)).isEqualTo(0); + } + private void testWriteProperties(List> propertiesSuite) { withSQLConf( propertiesSuite.get(0), diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java index 411b7e78116f..bcaa40d13c8a 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java @@ -1514,7 +1514,7 @@ public void testSortMultipleGroups() { } @TestTemplate - public void testSimpleSort() { + public void testSimpleSort() throws IOException { Table table = createTable(20); shouldHaveFiles(table, 20); 
table.replaceSortOrder().asc("c2").commit(); @@ -1542,10 +1542,11 @@ public void testSimpleSort() { shouldHaveACleanCache(table); shouldHaveMultipleFiles(table); shouldHaveLastCommitSorted(table, "c2"); + dataFilesSortOrderShouldMatchTableSortOrder(table); } @TestTemplate - public void testSortAfterPartitionChange() { + public void testSortAfterPartitionChange() throws IOException { Table table = createTable(20); shouldHaveFiles(table, 20); table.updateSpec().addField(Expressions.bucket("c1", 4)).commit(); @@ -1576,10 +1577,11 @@ public void testSortAfterPartitionChange() { shouldHaveACleanCache(table); shouldHaveMultipleFiles(table); shouldHaveLastCommitSorted(table, "c2"); + dataFilesSortOrderShouldMatchTableSortOrder(table); } @TestTemplate - public void testSortCustomSortOrder() { + public void testSortCustomSortOrder() throws IOException { Table table = createTable(20); shouldHaveLastCommitUnsorted(table, "c2"); shouldHaveFiles(table, 20); @@ -1605,10 +1607,11 @@ public void testSortCustomSortOrder() { shouldHaveACleanCache(table); shouldHaveMultipleFiles(table); shouldHaveLastCommitSorted(table, "c2"); + dataFilesShouldHaveSortOrderIdMatching(table, SortOrder.unsorted()); } @TestTemplate - public void testSortCustomSortOrderRequiresRepartition() { + public void testSortCustomSortOrderRequiresRepartition() throws IOException { int partitions = 4; Table table = createTable(); writeRecords(20, SCALE, partitions); @@ -1644,10 +1647,40 @@ public void testSortCustomSortOrderRequiresRepartition() { shouldHaveMultipleFiles(table); shouldHaveLastCommitUnsorted(table, "c2"); shouldHaveLastCommitSorted(table, "c3"); + dataFilesShouldHaveSortOrderIdMatching(table, SortOrder.unsorted()); } @TestTemplate - public void testAutoSortShuffleOutput() { + public void testSortPastTableSortOrderGetsAppliedToFiles() throws IOException { + Table table = createTable(1); + + table.replaceSortOrder().asc("c3").commit(); + SortOrder c3SortOrder = table.sortOrder(); + + 
table.replaceSortOrder().asc("c2").commit(); + + List originalData = currentData(); + + RewriteDataFiles.Result result = + basicRewrite(table) + .sort(SortOrder.builderFor(table.schema()).asc("c3").build()) + .option(SizeBasedFileRewritePlanner.REWRITE_ALL, "true") + .execute(); + + assertThat(result.rewriteResults()).as("Should have 1 fileGroups").hasSize(1); + + table.refresh(); + + List postRewriteData = currentData(); + assertEquals("We shouldn't have changed the data", originalData, postRewriteData); + + shouldHaveSnapshots(table, 2); + shouldHaveACleanCache(table); + dataFilesShouldHaveSortOrderIdMatching(table, c3SortOrder); + } + + @TestTemplate + public void testAutoSortShuffleOutput() throws IOException { Table table = createTable(20); shouldHaveLastCommitUnsorted(table, "c2"); shouldHaveFiles(table, 20); @@ -1682,6 +1715,7 @@ public void testAutoSortShuffleOutput() { shouldHaveACleanCache(table); shouldHaveMultipleFiles(table); shouldHaveLastCommitSorted(table, "c2"); + dataFilesShouldHaveSortOrderIdMatching(table, SortOrder.unsorted()); } @TestTemplate @@ -2600,4 +2634,17 @@ public boolean matches(RewriteFileGroup argument) { return groupIDs.contains(argument.info().globalIndex()); } } + + private void dataFilesSortOrderShouldMatchTableSortOrder(Table table) throws IOException { + dataFilesShouldHaveSortOrderIdMatching(table, table.sortOrder()); + } + + private void dataFilesShouldHaveSortOrderIdMatching(Table table, SortOrder sortOrder) + throws IOException { + try (CloseableIterable files = table.newScan().planFiles()) { + assertThat(files) + .extracting(fileScanTask -> fileScanTask.file().sortOrderId()) + .containsOnly(sortOrder.orderId()); + } + } } diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java index 2ad84f41fd87..81cb54cceabc 100644 --- 
a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java @@ -27,6 +27,7 @@ import static org.mockito.Mockito.when; import java.io.File; +import java.io.IOException; import java.net.InetAddress; import java.nio.file.Path; import java.util.List; @@ -35,6 +36,7 @@ import org.apache.iceberg.AppendFiles; import org.apache.iceberg.DataFile; import org.apache.iceberg.FileFormat; +import org.apache.iceberg.FileScanTask; import org.apache.iceberg.ManifestFile; import org.apache.iceberg.ManifestFiles; import org.apache.iceberg.Parameter; @@ -43,10 +45,12 @@ import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Schema; import org.apache.iceberg.SnapshotRef; +import org.apache.iceberg.SortOrder; import org.apache.iceberg.Table; import org.apache.iceberg.TableProperties; import org.apache.iceberg.exceptions.CommitStateUnknownException; import org.apache.iceberg.hadoop.HadoopTables; +import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Lists; @@ -153,6 +157,7 @@ public void testBasicWrite() { assertThat(file.splitOffsets()).as("Split offsets not present").isNotNull(); } assertThat(file.recordCount()).as("Should have reported record count as 1").isEqualTo(1); + assertThat(file.sortOrderId()).isEqualTo(SortOrder.unsorted().orderId()); // TODO: append more metric info if (format.equals(FileFormat.PARQUET)) { assertThat(file.columnSizes()).as("Column sizes metric not present").isNotNull(); @@ -473,6 +478,116 @@ public void testViewsReturnRecentResults() { assertThat(actual2).hasSameSizeAs(expected2).isEqualTo(expected2); } + @TestTemplate + public void testWriteDataFilesInTableSortOrder() throws IOException { + File parent = 
temp.resolve(format.toString()).toFile(); + File location = new File(parent, "test"); + HadoopTables tables = new HadoopTables(CONF); + PartitionSpec spec = PartitionSpec.unpartitioned(); + SortOrder sortOrder = SortOrder.builderFor(SCHEMA).asc("id").build(); + Table table = tables.create(SCHEMA, spec, sortOrder, ImmutableMap.of(), location.toString()); + + List expected = Lists.newArrayListWithCapacity(10); + for (int i = 0; i < 10; i++) { + expected.add(new SimpleRecord(i, "a")); + } + + Dataset df = spark.createDataFrame(expected, SimpleRecord.class); + + df.select("id", "data") + .write() + .format("iceberg") + .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) + .mode(SaveMode.Append) + .save(location.toString()); + + Dataset result = spark.read().format("iceberg").load(location.toString()); + + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + assertThat(actual).hasSameSizeAs(expected).isEqualTo(expected); + + try (CloseableIterable fileScanTasks = table.newScan().planFiles()) { + assertThat(fileScanTasks) + .extracting(task -> task.file().sortOrderId()) + .as("All DataFiles are written with the table sort order id") + .containsOnly(table.sortOrder().orderId()); + } + } + + @TestTemplate + public void testWriteDataFilesUnsortedTable() throws IOException { + File parent = temp.resolve(format.toString()).toFile(); + File location = new File(parent, "test"); + + HadoopTables tables = new HadoopTables(CONF); + PartitionSpec spec = PartitionSpec.unpartitioned(); + Table table = tables.create(SCHEMA, spec, location.toString()); + + List expected = Lists.newArrayList(new SimpleRecord(1, "a")); + Dataset df = spark.createDataFrame(expected, SimpleRecord.class); + + df.select("id", "data") + .write() + .format("iceberg") + .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) + .mode(SaveMode.Append) + .save(location.toString()); + + try (CloseableIterable tasks = table.newScan().planFiles()) { + 
assertThat(tasks) + .extracting(task -> task.file().sortOrderId()) + .as("All DataFiles should have unsorted sort order id") + .containsOnly(SortOrder.unsorted().orderId()); + } + } + + @TestTemplate + public void testWriteDataFilesAfterSortOrderChange() throws IOException { + File parent = temp.resolve(format.toString()).toFile(); + File location = new File(parent, "test"); + + HadoopTables tables = new HadoopTables(CONF); + PartitionSpec spec = PartitionSpec.unpartitioned(); + Table table = tables.create(SCHEMA, spec, location.toString()); + + List records = Lists.newArrayList(new SimpleRecord(1, "a")); + Dataset df = spark.createDataFrame(records, SimpleRecord.class); + + df.select("id", "data") + .write() + .format("iceberg") + .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) + .mode(SaveMode.Append) + .save(location.toString()); + + table.refresh(); + int unsortedId = SortOrder.unsorted().orderId(); + + try (CloseableIterable tasks = table.newScan().planFiles()) { + assertThat(tasks).extracting(task -> task.file().sortOrderId()).containsOnly(unsortedId); + } + + table.replaceSortOrder().asc("id").commit(); + int sortedId = table.sortOrder().orderId(); + + df.select("id", "data") + .write() + .format("iceberg") + .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) + .mode(SaveMode.Append) + .save(location.toString()); + + table.refresh(); + + try (CloseableIterable tasks = table.newScan().planFiles()) { + assertThat(tasks) + .extracting(task -> task.file().sortOrderId()) + .as("Should contain both unsorted and sorted files") + .containsOnly(unsortedId, sortedId); + } + } + public void partitionedCreateWithTargetFileSizeViaOption(IcebergOptionsType option) { File parent = temp.resolve(format.toString()).toFile(); File location = new File(parent, "test"); diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java 
b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java index 54048bbf218a..2a178a74137c 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java @@ -29,10 +29,14 @@ import java.nio.file.Paths; import java.util.List; import org.apache.hadoop.conf.Configuration; +import org.apache.iceberg.FileScanTask; import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Schema; +import org.apache.iceberg.SortOrder; import org.apache.iceberg.Table; import org.apache.iceberg.hadoop.HadoopTables; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.types.Types; import org.apache.spark.sql.Dataset; @@ -263,6 +267,50 @@ public void testStreamingWriteCompleteModeWithProjection() throws Exception { } } + @Test + public void testStreamingWriteDataFilesInTableSortOrder() throws Exception { + File parent = temp.resolve("parquet").toFile(); + File location = new File(parent, "test-table"); + File checkpoint = new File(parent, "checkpoint"); + + HadoopTables tables = new HadoopTables(CONF); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("data").build(); + SortOrder sortOrder = SortOrder.builderFor(SCHEMA).asc("id").build(); + Table table = tables.create(SCHEMA, spec, sortOrder, ImmutableMap.of(), location.toString()); + + MemoryStream inputStream = newMemoryStream(1, spark.sqlContext(), Encoders.INT()); + DataStreamWriter streamWriter = + inputStream + .toDF() + .selectExpr("value AS id", "CAST (value AS STRING) AS data") + .writeStream() + .outputMode("append") + .format("iceberg") + .option("checkpointLocation", checkpoint.toString()) + .option("path", location.toString()); + + try { + StreamingQuery query = 
streamWriter.start(); + List batch1 = Lists.newArrayList(1, 2); + send(batch1, inputStream); + query.processAllAvailable(); + query.stop(); + + table.refresh(); + + try (CloseableIterable tasks = table.newScan().planFiles()) { + assertThat(tasks) + .extracting(task -> task.file().sortOrderId()) + .as("All DataFiles are written with the table sort order id") + .containsOnly(table.sortOrder().orderId()); + } + } finally { + for (StreamingQuery query : spark.streams().active()) { + query.stop(); + } + } + } + @Test public void testStreamingWriteUpdateMode() throws Exception { File parent = temp.resolve("parquet").toFile(); diff --git a/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteDelete.java b/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteDelete.java index f7ded0c4d7d2..d39dff060c9a 100644 --- a/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteDelete.java +++ b/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteDelete.java @@ -162,6 +162,25 @@ public synchronized void testDeleteWithConcurrentTableRefresh() throws Exception assertThat(executorService.awaitTermination(2, TimeUnit.MINUTES)).as("Timeout").isTrue(); } + @TestTemplate + public void testCopyOnWriteDeleteSetsSortOrderIdOnRewrittenDataFiles() { + createAndInitTable( + "id INT, dep STRING", + "PARTITIONED BY (dep)", + "{ \"id\": 1, \"dep\": \"hr\" }\n" + "{ \"id\": 2, \"dep\": \"hr\" }"); + + sql("ALTER TABLE %s WRITE ORDERED BY id", tableName); + + sql("DELETE FROM %s WHERE id = 1", commitTarget()); + + Table table = validationCatalog.loadTable(tableIdent); + Snapshot snapshot = SnapshotUtil.latestSnapshot(table, branch); + assertThat(snapshot.addedDataFiles(table.io())) + .extracting(DataFile::sortOrderId) + .as("Rewritten data files should carry the table sort order id") + .containsOnly(table.sortOrder().orderId()); + } + 
@TestTemplate public void testRuntimeFilteringWithPreservedDataGrouping() throws NoSuchTableException { createAndInitPartitionedTable(); diff --git a/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteMerge.java b/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteMerge.java index fef8b28c689a..394dbbda1a3d 100644 --- a/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteMerge.java +++ b/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteMerge.java @@ -151,6 +151,34 @@ public synchronized void testMergeWithConcurrentTableRefresh() throws Exception assertThat(executorService.awaitTermination(2, TimeUnit.MINUTES)).as("Timeout").isTrue(); } + @TestTemplate + public void testCopyOnWriteMergeSetsSortOrderIdOnRewrittenDataFiles() { + createAndInitTable("id INT, dep STRING"); + sql("ALTER TABLE %s ADD PARTITION FIELD dep", tableName); + sql("ALTER TABLE %s WRITE ORDERED BY id", tableName); + + append(tableName, "{ \"id\": 1, \"dep\": \"hr\" }\n" + "{ \"id\": 2, \"dep\": \"hr\" }"); + createBranchIfNeeded(); + + createOrReplaceView("source", Collections.singletonList(1), Encoders.INT()); + + sql( + "MERGE INTO %s t USING source s " + + "ON t.id == s.value " + + "WHEN MATCHED THEN " + + " UPDATE SET dep = 'changed' " + + "WHEN NOT MATCHED THEN " + + " INSERT (id, dep) VALUES (s.value, 'new')", + commitTarget()); + + Table table = validationCatalog.loadTable(tableIdent); + Snapshot snapshot = SnapshotUtil.latestSnapshot(table, branch); + assertThat(snapshot.addedDataFiles(table.io())) + .extracting(DataFile::sortOrderId) + .as("Rewritten data files should carry the table sort order id") + .containsOnly(table.sortOrder().orderId()); + } + @TestTemplate public void testRuntimeFilteringWithReportedPartitioning() { createAndInitTable("id INT, dep STRING"); diff --git 
a/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteUpdate.java b/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteUpdate.java index 21d1377b2b98..b547218acbd4 100644 --- a/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteUpdate.java +++ b/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestCopyOnWriteUpdate.java @@ -149,6 +149,25 @@ public synchronized void testUpdateWithConcurrentTableRefresh() throws Exception assertThat(executorService.awaitTermination(2, TimeUnit.MINUTES)).as("Timeout").isTrue(); } + @TestTemplate + public void testCopyOnWriteUpdateSetsSortOrderIdOnRewrittenDataFiles() { + createAndInitTable( + "id INT, dep STRING", + "PARTITIONED BY (dep)", + "{ \"id\": 1, \"dep\": \"hr\" }\n" + "{ \"id\": 2, \"dep\": \"hr\" }"); + + sql("ALTER TABLE %s WRITE ORDERED BY id", tableName); + + sql("UPDATE %s SET dep = 'changed' WHERE id = 1", commitTarget()); + + Table table = validationCatalog.loadTable(tableIdent); + Snapshot snapshot = SnapshotUtil.latestSnapshot(table, branch); + assertThat(snapshot.addedDataFiles(table.io())) + .extracting(DataFile::sortOrderId) + .as("Rewritten data files should carry the table sort order id") + .containsOnly(table.sortOrder().orderId()); + } + @TestTemplate public void testRuntimeFilteringWithReportedPartitioning() { createAndInitTable("id INT, dep STRING"); diff --git a/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMergeOnReadMerge.java b/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMergeOnReadMerge.java index 737f19e86a95..9a42b58e3434 100644 --- a/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMergeOnReadMerge.java +++ b/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMergeOnReadMerge.java @@ -26,6 +26,7 @@ import 
java.util.Set; import java.util.stream.Collectors; import java.util.stream.IntStream; +import org.apache.iceberg.DataFile; import org.apache.iceberg.DeleteFile; import org.apache.iceberg.FileFormat; import org.apache.iceberg.ParameterizedTestExtension; @@ -136,6 +137,34 @@ public void testMergeWithDVAndHistoricalPositionDeletes() { assertThat(dvs).allMatch(dv -> FileFormat.fromFileName(dv.location()) == FileFormat.PUFFIN); } + @TestTemplate + public void testMergeOnReadMergeSetsSortOrderIdOnNewDataFiles() { + createAndInitTable( + "id INT, dep STRING", + "PARTITIONED BY (dep)", + "{ \"id\": 1, \"dep\": \"hr\" }\n" + "{ \"id\": 2, \"dep\": \"hr\" }"); + + sql("ALTER TABLE %s WRITE ORDERED BY id", tableName); + + createOrReplaceView("source", ImmutableList.of(1, 3), Encoders.INT()); + + sql( + "MERGE INTO %s AS t USING source AS s " + + "ON t.id == s.value " + + "WHEN MATCHED THEN " + + " UPDATE SET id = id + 10 " + + "WHEN NOT MATCHED THEN " + + " INSERT (id, dep) VALUES (s.value, 'hr')", + commitTarget()); + + Table table = validationCatalog.loadTable(tableIdent); + Snapshot snapshot = SnapshotUtil.latestSnapshot(table, branch); + assertThat(snapshot.addedDataFiles(table.io())) + .extracting(DataFile::sortOrderId) + .as("All new data files should carry the table sort order id") + .containsOnly(table.sortOrder().orderId()); + } + private void checkMergeDeleteGranularity(DeleteGranularity deleteGranularity) { createTableWithDeleteGranularity( "id INT, dep STRING", "PARTITIONED BY (dep)", deleteGranularity); diff --git a/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMergeOnReadUpdate.java b/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMergeOnReadUpdate.java index 2398bc45b19b..d1c336d5ddeb 100644 --- a/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMergeOnReadUpdate.java +++ 
b/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestMergeOnReadUpdate.java @@ -25,6 +25,7 @@ import java.util.Map; import java.util.Set; import java.util.stream.Collectors; +import org.apache.iceberg.DataFile; import org.apache.iceberg.DeleteFile; import org.apache.iceberg.FileFormat; import org.apache.iceberg.ParameterizedTestExtension; @@ -224,6 +225,25 @@ public void testUpdateWithDVAndHistoricalPositionDeletes() { assertThat(dvs).allMatch(dv -> FileFormat.fromFileName(dv.location()) == FileFormat.PUFFIN); } + @TestTemplate + public void testMergeOnReadUpdateSetsSortOrderIdOnNewDataFiles() { + createAndInitTable( + "id INT, dep STRING", + "PARTITIONED BY (dep)", + "{ \"id\": 1, \"dep\": \"hr\" }\n" + "{ \"id\": 2, \"dep\": \"hr\" }"); + + sql("ALTER TABLE %s WRITE ORDERED BY id", tableName); + + sql("UPDATE %s SET id = id + 10 WHERE id = 1", commitTarget()); + + Table table = validationCatalog.loadTable(tableIdent); + Snapshot snapshot = SnapshotUtil.latestSnapshot(table, branch); + assertThat(snapshot.addedDataFiles(table.io())) + .extracting(DataFile::sortOrderId) + .as("All new data files should carry the table sort order id") + .containsOnly(table.sortOrder().orderId()); + } + private void initTable(String partitionedBy, DeleteGranularity deleteGranularity) { createTableWithDeleteGranularity("id INT, dep STRING", partitionedBy, deleteGranularity); diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkWriteConf.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkWriteConf.java index 96131e0e56dd..aba7e4dda082 100644 --- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkWriteConf.java +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkWriteConf.java @@ -42,6 +42,7 @@ import org.apache.iceberg.FileFormat; import org.apache.iceberg.IsolationLevel; import org.apache.iceberg.SnapshotSummary; +import org.apache.iceberg.SortOrder; import org.apache.iceberg.Table; import 
org.apache.iceberg.TableProperties; import org.apache.iceberg.TableUtil; @@ -164,6 +165,25 @@ public int outputSpecId() { return outputSpecId; } + public int outputSortOrderId(SparkWriteRequirements writeRequirements) { + Integer explicitId = + confParser.intConf().option(SparkWriteOptions.OUTPUT_SORT_ORDER_ID).parseOptional(); + + if (explicitId != null) { + Preconditions.checkArgument( + table.sortOrders().containsKey(explicitId), + "Cannot use output sort order id %s because the table does not contain a sort order with that id", + explicitId); + return explicitId; + } + + if (writeRequirements.hasOrdering()) { + return table.sortOrder().orderId(); + } + + return SortOrder.unsorted().orderId(); + } + public FileFormat dataFileFormat() { String valueAsString = confParser diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkWriteOptions.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkWriteOptions.java index 33db70bae587..1be02feaf0c0 100644 --- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkWriteOptions.java +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkWriteOptions.java @@ -54,6 +54,7 @@ private SparkWriteOptions() {} public static final String REWRITTEN_FILE_SCAN_TASK_SET_ID = "rewritten-file-scan-task-set-id"; public static final String OUTPUT_SPEC_ID = "output-spec-id"; + public static final String OUTPUT_SORT_ORDER_ID = "output-sort-order-id"; public static final String OVERWRITE_MODE = "overwrite-mode"; diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/actions/SparkShufflingFileRewriteRunner.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/actions/SparkShufflingFileRewriteRunner.java index 569eb252cba5..f1d45a4b142b 100644 --- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/actions/SparkShufflingFileRewriteRunner.java +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/actions/SparkShufflingFileRewriteRunner.java @@ -47,10 +47,14 
@@ import org.apache.spark.sql.connector.expressions.SortOrder; import org.apache.spark.sql.connector.write.RequiresDistributionAndOrdering; import org.apache.spark.sql.execution.datasources.v2.DistributionAndOrderingUtils$; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import scala.Option; abstract class SparkShufflingFileRewriteRunner extends SparkDataFileRewriteRunner { + private static final Logger LOG = LoggerFactory.getLogger(SparkShufflingFileRewriteRunner.class); + /** * The number of shuffle partitions to use for each output file. By default, this file rewriter * assumes each shuffle partition would become a separate output file. Attempting to generate @@ -119,6 +123,17 @@ public void doRewrite(String groupId, RewriteFileGroup fileGroup) { spec(fileGroup.outputSpecId()), fileGroup.expectedOutputFiles())); + org.apache.iceberg.SortOrder sortOrderInJobSpec = sortOrder(); + + org.apache.iceberg.SortOrder maybeMatchingTableSortOrder = + SortOrderUtil.findTableSortOrder(table(), sortOrder()); + + if (sortOrderInJobSpec.isSorted() && maybeMatchingTableSortOrder.isUnsorted()) { + LOG.warn( + "Sort order specified for job {} doesn't match any table sort orders, rewritten files will not be marked as sorted in the manifest files", + Spark3Util.describe(sortOrderInJobSpec)); + } + sortedDF .write() .format("iceberg") @@ -126,6 +141,7 @@ public void doRewrite(String groupId, RewriteFileGroup fileGroup) { .option(SparkWriteOptions.TARGET_FILE_SIZE_BYTES, fileGroup.maxOutputFileSize()) .option(SparkWriteOptions.USE_TABLE_DISTRIBUTION_AND_ORDERING, "false") .option(SparkWriteOptions.OUTPUT_SPEC_ID, fileGroup.outputSpecId()) + .option(SparkWriteOptions.OUTPUT_SORT_ORDER_ID, maybeMatchingTableSortOrder.orderId()) .mode("append") .save(groupId); } diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkPositionDeltaWrite.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkPositionDeltaWrite.java index 
a1cb31bd3720..f0a58fc42107 100644 --- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkPositionDeltaWrite.java +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkPositionDeltaWrite.java @@ -117,6 +117,7 @@ class SparkPositionDeltaWrite implements DeltaWrite, RequiresDistributionAndOrde private final String branch; private final Map extraSnapshotMetadata; private final SparkWriteRequirements writeRequirements; + private final int sortOrderId; private final Context context; private final Map writeProperties; @@ -143,6 +144,7 @@ class SparkPositionDeltaWrite implements DeltaWrite, RequiresDistributionAndOrde this.branch = writeConf.branch(); this.extraSnapshotMetadata = writeConf.extraSnapshotMetadata(); this.writeRequirements = writeConf.positionDeltaRequirements(command); + this.sortOrderId = writeConf.outputSortOrderId(writeRequirements); this.context = new Context(dataSchema, writeConf, info, writeRequirements); this.writeProperties = writeConf.writeProperties(); @@ -203,7 +205,8 @@ public DeltaWriterFactory createBatchWriterFactory(PhysicalWriteInfo info) { broadcastRewritableDeletes(), command, context, - writeProperties); + writeProperties, + sortOrderId); } private Broadcast> broadcastRewritableDeletes() { @@ -413,18 +416,21 @@ private static class PositionDeltaWriteFactory implements DeltaWriterFactory { private final Command command; private final Context context; private final Map writeProperties; + private final int sortOrderId; PositionDeltaWriteFactory( Broadcast
tableBroadcast, Broadcast> rewritableDeletesBroadcast, Command command, Context context, - Map writeProperties) { + Map writeProperties, + int sortOrderId) { this.tableBroadcast = tableBroadcast; this.rewritableDeletesBroadcast = rewritableDeletesBroadcast; this.command = command; this.context = context; this.writeProperties = writeProperties; + this.sortOrderId = sortOrderId; } @Override @@ -451,6 +457,7 @@ public DeltaWriter createWriter(int partitionId, long taskId) { .deleteFileFormat(context.deleteFileFormat()) .positionDeleteSparkType(context.deleteSparkType()) .writeProperties(writeProperties) + .dataSortOrder(table.sortOrders().get(sortOrderId)) .build(); if (command == DELETE) { diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkWrite.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkWrite.java index 5f81689f41ed..c73a37ba3426 100644 --- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkWrite.java +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkWrite.java @@ -212,6 +212,7 @@ private WriterFactory createWriterFactory() { // broadcast the table metadata as the writer factory will be sent to executors Broadcast
tableBroadcast = sparkContext.broadcast(SerializableTableWithSize.copyOf(table)); + int sortOrderId = writeConf.outputSortOrderId(writeRequirements); return new WriterFactory( tableBroadcast, queryId, @@ -221,7 +222,8 @@ private WriterFactory createWriterFactory() { writeSchema, dsSchema, useFanoutWriter, - writeProperties); + writeProperties, + sortOrderId); } private void commitOperation(SnapshotUpdate operation, String description) { @@ -696,6 +698,7 @@ private static class WriterFactory implements DataWriterFactory, StreamingDataWr private final boolean useFanoutWriter; private final String queryId; private final Map writeProperties; + private final int sortOrderId; protected WriterFactory( Broadcast
tableBroadcast, @@ -706,7 +709,8 @@ protected WriterFactory( Schema writeSchema, StructType dsSchema, boolean useFanoutWriter, - Map writeProperties) { + Map writeProperties, + int sortOrderId) { this.tableBroadcast = tableBroadcast; this.format = format; this.outputSpecId = outputSpecId; @@ -716,6 +720,7 @@ protected WriterFactory( this.useFanoutWriter = useFanoutWriter; this.queryId = queryId; this.writeProperties = writeProperties; + this.sortOrderId = sortOrderId; } @Override @@ -740,6 +745,7 @@ public DataWriter createWriter(int partitionId, long taskId, long e .dataSchema(writeSchema) .dataSparkType(dsSchema) .writeProperties(writeProperties) + .dataSortOrder(table.sortOrders().get(sortOrderId)) .build(); Function rowLineageExtractor = new ExtractRowLineage(writeSchema); diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/TestSparkWriteConf.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/TestSparkWriteConf.java index 61aacfa4589d..c83b1b6e26ac 100644 --- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/TestSparkWriteConf.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/TestSparkWriteConf.java @@ -45,6 +45,7 @@ import static org.apache.spark.sql.connector.write.RowLevelOperation.Command.MERGE; import static org.apache.spark.sql.connector.write.RowLevelOperation.Command.UPDATE; import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatIllegalArgumentException; import static org.assertj.core.api.Assertions.assertThatThrownBy; import java.time.Duration; @@ -600,6 +601,51 @@ public void testDVWriteConf() { assertThat(writeConf.deleteFileFormat()).isEqualTo(FileFormat.PUFFIN); } + @TestTemplate + public void testSortOrderWriteConf() { + Table table = validationCatalog.loadTable(tableIdent); + + table.replaceSortOrder().asc("id").commit(); + + SparkWriteConf writeConf = + new SparkWriteConf( + spark, table, 
ImmutableMap.of(SparkWriteOptions.OUTPUT_SORT_ORDER_ID, "1")); + + assertThat(writeConf.outputSortOrderId(SparkWriteRequirements.EMPTY)) + .isEqualTo(table.sortOrder().orderId()); + } + + @TestTemplate + public void testSortOrderWriteConfWithInvalidId() { + Table table = validationCatalog.loadTable(tableIdent); + + table.replaceSortOrder().asc("id").commit(); + + SparkWriteConf writeConfForUnknownSortOrder = + new SparkWriteConf( + spark, table, ImmutableMap.of(SparkWriteOptions.OUTPUT_SORT_ORDER_ID, "999")); + + assertThatIllegalArgumentException() + .isThrownBy( + () -> writeConfForUnknownSortOrder.outputSortOrderId(SparkWriteRequirements.EMPTY)) + .withMessage( + "Cannot use output sort order id 999 because the table does not contain a sort order with that id"); + } + + @TestTemplate + public void testSortOrderWriteConfWithNoOption() { + Table table = validationCatalog.loadTable(tableIdent); + + table.replaceSortOrder().asc("id").commit(); + + SparkWriteConf writeConfNoOption = new SparkWriteConf(spark, table, ImmutableMap.of()); + + assertThat(writeConfNoOption.outputSortOrderId(writeConfNoOption.writeRequirements())) + .isEqualTo(table.sortOrder().orderId()); + + assertThat(writeConfNoOption.outputSortOrderId(SparkWriteRequirements.EMPTY)).isEqualTo(0); + } + private void testWriteProperties(List> propertiesSuite) { withSQLConf( propertiesSuite.get(0), diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java index 1645d0c84e35..b18f20c44427 100644 --- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java @@ -1517,7 +1517,7 @@ public void testSortMultipleGroups() { } @TestTemplate - public void testSimpleSort() { + public void testSimpleSort() throws IOException { Table 
table = createTable(20); shouldHaveFiles(table, 20); table.replaceSortOrder().asc("c2").commit(); @@ -1545,10 +1545,11 @@ public void testSimpleSort() { shouldHaveACleanCache(table); shouldHaveMultipleFiles(table); shouldHaveLastCommitSorted(table, "c2"); + dataFilesSortOrderShouldMatchTableSortOrder(table); } @TestTemplate - public void testSortAfterPartitionChange() { + public void testSortAfterPartitionChange() throws IOException { Table table = createTable(20); shouldHaveFiles(table, 20); table.updateSpec().addField(Expressions.bucket("c1", 4)).commit(); @@ -1579,10 +1580,11 @@ public void testSortAfterPartitionChange() { shouldHaveACleanCache(table); shouldHaveMultipleFiles(table); shouldHaveLastCommitSorted(table, "c2"); + dataFilesSortOrderShouldMatchTableSortOrder(table); } @TestTemplate - public void testSortCustomSortOrder() { + public void testSortCustomSortOrder() throws IOException { Table table = createTable(20); shouldHaveLastCommitUnsorted(table, "c2"); shouldHaveFiles(table, 20); @@ -1608,10 +1610,11 @@ public void testSortCustomSortOrder() { shouldHaveACleanCache(table); shouldHaveMultipleFiles(table); shouldHaveLastCommitSorted(table, "c2"); + dataFilesShouldHaveSortOrderIdMatching(table, SortOrder.unsorted()); } @TestTemplate - public void testSortCustomSortOrderRequiresRepartition() { + public void testSortCustomSortOrderRequiresRepartition() throws IOException { int partitions = 4; Table table = createTable(); writeRecords(20, SCALE, partitions); @@ -1647,10 +1650,40 @@ public void testSortCustomSortOrderRequiresRepartition() { shouldHaveMultipleFiles(table); shouldHaveLastCommitUnsorted(table, "c2"); shouldHaveLastCommitSorted(table, "c3"); + dataFilesShouldHaveSortOrderIdMatching(table, SortOrder.unsorted()); } @TestTemplate - public void testAutoSortShuffleOutput() { + public void testSortPastTableSortOrderGetsAppliedToFiles() throws IOException { + Table table = createTable(1); + + table.replaceSortOrder().asc("c3").commit(); + SortOrder 
c3SortOrder = table.sortOrder(); + + table.replaceSortOrder().asc("c2").commit(); + + List originalData = currentData(); + + RewriteDataFiles.Result result = + basicRewrite(table) + .sort(SortOrder.builderFor(table.schema()).asc("c3").build()) + .option(SizeBasedFileRewritePlanner.REWRITE_ALL, "true") + .execute(); + + assertThat(result.rewriteResults()).as("Should have 1 fileGroups").hasSize(1); + + table.refresh(); + + List postRewriteData = currentData(); + assertEquals("We shouldn't have changed the data", originalData, postRewriteData); + + shouldHaveSnapshots(table, 2); + shouldHaveACleanCache(table); + dataFilesShouldHaveSortOrderIdMatching(table, c3SortOrder); + } + + @TestTemplate + public void testAutoSortShuffleOutput() throws IOException { Table table = createTable(20); shouldHaveLastCommitUnsorted(table, "c2"); shouldHaveFiles(table, 20); @@ -1685,6 +1718,7 @@ public void testAutoSortShuffleOutput() { shouldHaveACleanCache(table); shouldHaveMultipleFiles(table); shouldHaveLastCommitSorted(table, "c2"); + dataFilesShouldHaveSortOrderIdMatching(table, SortOrder.unsorted()); } @TestTemplate @@ -2623,4 +2657,17 @@ public boolean matches(RewriteFileGroup argument) { return groupIDs.contains(argument.info().globalIndex()); } } + + private void dataFilesSortOrderShouldMatchTableSortOrder(Table table) throws IOException { + dataFilesShouldHaveSortOrderIdMatching(table, table.sortOrder()); + } + + private void dataFilesShouldHaveSortOrderIdMatching(Table table, SortOrder sortOrder) + throws IOException { + try (CloseableIterable files = table.newScan().planFiles()) { + assertThat(files) + .extracting(fileScanTask -> fileScanTask.file().sortOrderId()) + .containsOnly(sortOrder.orderId()); + } + } } diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java index 2ad84f41fd87..81cb54cceabc 100644 --- 
a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java @@ -27,6 +27,7 @@ import static org.mockito.Mockito.when; import java.io.File; +import java.io.IOException; import java.net.InetAddress; import java.nio.file.Path; import java.util.List; @@ -35,6 +36,7 @@ import org.apache.iceberg.AppendFiles; import org.apache.iceberg.DataFile; import org.apache.iceberg.FileFormat; +import org.apache.iceberg.FileScanTask; import org.apache.iceberg.ManifestFile; import org.apache.iceberg.ManifestFiles; import org.apache.iceberg.Parameter; @@ -43,10 +45,12 @@ import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Schema; import org.apache.iceberg.SnapshotRef; +import org.apache.iceberg.SortOrder; import org.apache.iceberg.Table; import org.apache.iceberg.TableProperties; import org.apache.iceberg.exceptions.CommitStateUnknownException; import org.apache.iceberg.hadoop.HadoopTables; +import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Lists; @@ -153,6 +157,7 @@ public void testBasicWrite() { assertThat(file.splitOffsets()).as("Split offsets not present").isNotNull(); } assertThat(file.recordCount()).as("Should have reported record count as 1").isEqualTo(1); + assertThat(file.sortOrderId()).isEqualTo(SortOrder.unsorted().orderId()); // TODO: append more metric info if (format.equals(FileFormat.PARQUET)) { assertThat(file.columnSizes()).as("Column sizes metric not present").isNotNull(); @@ -473,6 +478,116 @@ public void testViewsReturnRecentResults() { assertThat(actual2).hasSameSizeAs(expected2).isEqualTo(expected2); } + @TestTemplate + public void testWriteDataFilesInTableSortOrder() throws IOException { + File parent = 
temp.resolve(format.toString()).toFile(); + File location = new File(parent, "test"); + HadoopTables tables = new HadoopTables(CONF); + PartitionSpec spec = PartitionSpec.unpartitioned(); + SortOrder sortOrder = SortOrder.builderFor(SCHEMA).asc("id").build(); + Table table = tables.create(SCHEMA, spec, sortOrder, ImmutableMap.of(), location.toString()); + + List expected = Lists.newArrayListWithCapacity(10); + for (int i = 0; i < 10; i++) { + expected.add(new SimpleRecord(i, "a")); + } + + Dataset df = spark.createDataFrame(expected, SimpleRecord.class); + + df.select("id", "data") + .write() + .format("iceberg") + .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) + .mode(SaveMode.Append) + .save(location.toString()); + + Dataset result = spark.read().format("iceberg").load(location.toString()); + + List actual = + result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList(); + assertThat(actual).hasSameSizeAs(expected).isEqualTo(expected); + + try (CloseableIterable fileScanTasks = table.newScan().planFiles()) { + assertThat(fileScanTasks) + .extracting(task -> task.file().sortOrderId()) + .as("All DataFiles are written with the table sort order id") + .containsOnly(table.sortOrder().orderId()); + } + } + + @TestTemplate + public void testWriteDataFilesUnsortedTable() throws IOException { + File parent = temp.resolve(format.toString()).toFile(); + File location = new File(parent, "test"); + + HadoopTables tables = new HadoopTables(CONF); + PartitionSpec spec = PartitionSpec.unpartitioned(); + Table table = tables.create(SCHEMA, spec, location.toString()); + + List expected = Lists.newArrayList(new SimpleRecord(1, "a")); + Dataset df = spark.createDataFrame(expected, SimpleRecord.class); + + df.select("id", "data") + .write() + .format("iceberg") + .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) + .mode(SaveMode.Append) + .save(location.toString()); + + try (CloseableIterable tasks = table.newScan().planFiles()) { + 
assertThat(tasks) + .extracting(task -> task.file().sortOrderId()) + .as("All DataFiles should have unsorted sort order id") + .containsOnly(SortOrder.unsorted().orderId()); + } + } + + @TestTemplate + public void testWriteDataFilesAfterSortOrderChange() throws IOException { + File parent = temp.resolve(format.toString()).toFile(); + File location = new File(parent, "test"); + + HadoopTables tables = new HadoopTables(CONF); + PartitionSpec spec = PartitionSpec.unpartitioned(); + Table table = tables.create(SCHEMA, spec, location.toString()); + + List records = Lists.newArrayList(new SimpleRecord(1, "a")); + Dataset df = spark.createDataFrame(records, SimpleRecord.class); + + df.select("id", "data") + .write() + .format("iceberg") + .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) + .mode(SaveMode.Append) + .save(location.toString()); + + table.refresh(); + int unsortedId = SortOrder.unsorted().orderId(); + + try (CloseableIterable tasks = table.newScan().planFiles()) { + assertThat(tasks).extracting(task -> task.file().sortOrderId()).containsOnly(unsortedId); + } + + table.replaceSortOrder().asc("id").commit(); + int sortedId = table.sortOrder().orderId(); + + df.select("id", "data") + .write() + .format("iceberg") + .option(SparkWriteOptions.WRITE_FORMAT, format.toString()) + .mode(SaveMode.Append) + .save(location.toString()); + + table.refresh(); + + try (CloseableIterable tasks = table.newScan().planFiles()) { + assertThat(tasks) + .extracting(task -> task.file().sortOrderId()) + .as("Should contain both unsorted and sorted files") + .containsOnly(unsortedId, sortedId); + } + } + public void partitionedCreateWithTargetFileSizeViaOption(IcebergOptionsType option) { File parent = temp.resolve(format.toString()).toFile(); File location = new File(parent, "test"); diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java 
b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java index 54048bbf218a..2a178a74137c 100644 --- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java @@ -29,10 +29,14 @@ import java.nio.file.Paths; import java.util.List; import org.apache.hadoop.conf.Configuration; +import org.apache.iceberg.FileScanTask; import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Schema; +import org.apache.iceberg.SortOrder; import org.apache.iceberg.Table; import org.apache.iceberg.hadoop.HadoopTables; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.types.Types; import org.apache.spark.sql.Dataset; @@ -263,6 +267,50 @@ public void testStreamingWriteCompleteModeWithProjection() throws Exception { } } + @Test + public void testStreamingWriteDataFilesInTableSortOrder() throws Exception { + File parent = temp.resolve("parquet").toFile(); + File location = new File(parent, "test-table"); + File checkpoint = new File(parent, "checkpoint"); + + HadoopTables tables = new HadoopTables(CONF); + PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("data").build(); + SortOrder sortOrder = SortOrder.builderFor(SCHEMA).asc("id").build(); + Table table = tables.create(SCHEMA, spec, sortOrder, ImmutableMap.of(), location.toString()); + + MemoryStream inputStream = newMemoryStream(1, spark.sqlContext(), Encoders.INT()); + DataStreamWriter streamWriter = + inputStream + .toDF() + .selectExpr("value AS id", "CAST (value AS STRING) AS data") + .writeStream() + .outputMode("append") + .format("iceberg") + .option("checkpointLocation", checkpoint.toString()) + .option("path", location.toString()); + + try { + StreamingQuery query = 
streamWriter.start(); + List batch1 = Lists.newArrayList(1, 2); + send(batch1, inputStream); + query.processAllAvailable(); + query.stop(); + + table.refresh(); + + try (CloseableIterable tasks = table.newScan().planFiles()) { + assertThat(tasks) + .extracting(task -> task.file().sortOrderId()) + .as("All DataFiles are written with the table sort order id") + .containsOnly(table.sortOrder().orderId()); + } + } finally { + for (StreamingQuery query : spark.streams().active()) { + query.stop(); + } + } + } + @Test public void testStreamingWriteUpdateMode() throws Exception { File parent = temp.resolve("parquet").toFile(); From 3550bcef7a6979e425f7eec7964e7fd0c07c9f7c Mon Sep 17 00:00:00 2001 From: Eduard Tudenhoefner Date: Fri, 3 Apr 2026 16:14:19 +0200 Subject: [PATCH 014/197] Core: Upgrade Jetty to 12.1.5 (#10837) Co-authored-by: manuzhang --- .../aws/s3/signer/TestS3RestSigner.java | 5 +- .../iceberg/rest/TestBaseWithRESTServer.java | 5 +- .../apache/iceberg/rest/TestRESTCatalog.java | 14 ++++- .../iceberg/rest/TestRESTViewCatalog.java | 5 +- ...RESTViewCatalogWithAssumedViewSupport.java | 5 +- gradle/libs.versions.toml | 4 +- .../RESTCompatibilityKitCatalogTests.java | 9 +++ .../iceberg/rest/RESTCatalogServer.java | 5 +- spark/v3.4/build.gradle | 21 +------ .../iceberg/DeleteFileIndexBenchmark.java | 3 +- .../spark/MergeCardinalityCheckBenchmark.java | 2 +- .../iceberg/spark/PlanningBenchmark.java | 2 +- .../spark/TaskGroupPlanningBenchmark.java | 2 +- .../spark/UpdateProjectionBenchmark.java | 2 +- .../spark/extensions/ExtensionsTestBase.java | 1 + .../action/DeleteOrphanFilesBenchmark.java | 2 + .../IcebergSortCompactionBenchmark.java | 2 + .../spark/source/IcebergSourceBenchmark.java | 3 +- .../SparkDistributedDataScanTestBase.java | 2 + .../TestSparkDistributedDataScanDeletes.java | 2 + ...stSparkDistributedDataScanFilterFiles.java | 2 + ...TestSparkDistributedDataScanReporting.java | 2 + .../iceberg/spark/DummyMetricsServlet.java | 62 +++++++++++++++++++ 
.../org/apache/iceberg/spark/TestBase.java | 8 +++ .../iceberg/spark/source/ScanTestBase.java | 2 + .../spark/source/TestFilteredScan.java | 2 + .../source/TestForwardCompatibility.java | 2 + .../spark/source/TestIcebergSpark.java | 2 + .../spark/source/TestPartitionPruning.java | 2 + .../spark/source/TestPartitionValues.java | 2 + .../spark/source/TestSnapshotSelection.java | 2 + .../spark/source/TestSparkDataFile.java | 2 + .../spark/source/TestSparkDataWrite.java | 2 + .../spark/source/TestSparkReadProjection.java | 2 + .../spark/source/TestSparkReaderDeletes.java | 2 + .../TestSparkReaderWithBloomFilter.java | 2 + .../spark/source/TestStructuredStreaming.java | 2 + .../spark/source/TestWriteMetricsConfig.java | 2 + .../spark/sql/TestAggregatePushDown.java | 1 + spark/v3.5/build.gradle | 21 +------ .../iceberg/DeleteFileIndexBenchmark.java | 3 +- .../spark/MergeCardinalityCheckBenchmark.java | 2 +- .../iceberg/spark/PlanningBenchmark.java | 2 +- .../spark/TaskGroupPlanningBenchmark.java | 2 +- .../spark/UpdateProjectionBenchmark.java | 2 +- .../iceberg/spark/TestExtendedParser.java | 7 ++- .../spark/extensions/ExtensionsTestBase.java | 1 + .../action/DeleteOrphanFilesBenchmark.java | 2 + .../IcebergSortCompactionBenchmark.java | 2 + .../spark/source/DVReaderBenchmark.java | 3 +- .../spark/source/DVWriterBenchmark.java | 3 +- .../spark/source/IcebergSourceBenchmark.java | 3 +- .../SparkDistributedDataScanTestBase.java | 2 + .../TestSparkDistributedDataScanDeletes.java | 2 + ...stSparkDistributedDataScanFilterFiles.java | 2 + ...TestSparkDistributedDataScanReporting.java | 2 + .../iceberg/spark/DummyMetricsServlet.java | 62 +++++++++++++++++++ .../org/apache/iceberg/spark/TestBase.java | 8 +++ ...rquetDictionaryEncodedVectorizedReads.java | 2 + .../iceberg/spark/source/ScanTestBase.java | 2 + .../spark/source/TestFilteredScan.java | 2 + .../source/TestForwardCompatibility.java | 2 + .../spark/source/TestIcebergSpark.java | 2 + 
.../spark/source/TestPartitionPruning.java | 2 + .../spark/source/TestPartitionValues.java | 2 + .../spark/source/TestSnapshotSelection.java | 2 + .../spark/source/TestSparkDataFile.java | 2 + .../spark/source/TestSparkDataWrite.java | 2 + .../spark/source/TestSparkReadProjection.java | 2 + .../spark/source/TestSparkReaderDeletes.java | 2 + .../TestSparkReaderWithBloomFilter.java | 2 + .../spark/source/TestStructuredStreaming.java | 2 + .../spark/source/TestWriteMetricsConfig.java | 2 + .../spark/sql/TestAggregatePushDown.java | 1 + spark/v4.0/build.gradle | 21 +------ .../iceberg/DeleteFileIndexBenchmark.java | 3 +- .../spark/MergeCardinalityCheckBenchmark.java | 2 +- .../iceberg/spark/PlanningBenchmark.java | 2 +- .../spark/TaskGroupPlanningBenchmark.java | 2 +- .../spark/UpdateProjectionBenchmark.java | 2 +- .../iceberg/spark/TestExtendedParser.java | 7 ++- .../spark/extensions/ExtensionsTestBase.java | 1 + .../action/DeleteOrphanFilesBenchmark.java | 2 + .../IcebergSortCompactionBenchmark.java | 2 + .../spark/source/DVReaderBenchmark.java | 3 +- .../spark/source/DVWriterBenchmark.java | 3 +- .../spark/source/IcebergSourceBenchmark.java | 3 +- .../SparkDistributedDataScanTestBase.java | 2 + .../TestSparkDistributedDataScanDeletes.java | 2 + ...stSparkDistributedDataScanFilterFiles.java | 2 + ...TestSparkDistributedDataScanReporting.java | 2 + .../iceberg/spark/DummyMetricsServlet.java | 62 +++++++++++++++++++ .../org/apache/iceberg/spark/TestBase.java | 8 +++ ...rquetDictionaryEncodedVectorizedReads.java | 2 + .../iceberg/spark/source/ScanTestBase.java | 2 + .../spark/source/TestFilteredScan.java | 2 + .../source/TestForwardCompatibility.java | 2 + .../spark/source/TestIcebergSpark.java | 2 + .../spark/source/TestPartitionPruning.java | 2 + .../spark/source/TestPartitionValues.java | 2 + .../spark/source/TestSnapshotSelection.java | 2 + .../spark/source/TestSparkDataFile.java | 2 + .../spark/source/TestSparkDataWrite.java | 2 + 
.../spark/source/TestSparkReadProjection.java | 2 + .../spark/source/TestSparkReaderDeletes.java | 2 + .../TestSparkReaderWithBloomFilter.java | 2 + .../spark/source/TestStructuredStreaming.java | 2 + .../spark/source/TestWriteMetricsConfig.java | 2 + .../spark/sql/TestAggregatePushDown.java | 1 + spark/v4.1/build.gradle | 21 +------ .../iceberg/DeleteFileIndexBenchmark.java | 3 +- .../spark/MergeCardinalityCheckBenchmark.java | 2 +- .../iceberg/spark/PlanningBenchmark.java | 2 +- .../spark/TaskGroupPlanningBenchmark.java | 2 +- .../spark/UpdateProjectionBenchmark.java | 2 +- .../iceberg/spark/TestExtendedParser.java | 7 ++- .../spark/extensions/ExtensionsTestBase.java | 1 + .../TestPartitionedWritesToWapBranch.java | 1 + .../action/DeleteOrphanFilesBenchmark.java | 2 + .../IcebergSortCompactionBenchmark.java | 2 + .../spark/source/DVReaderBenchmark.java | 3 +- .../spark/source/DVWriterBenchmark.java | 3 +- .../spark/source/IcebergSourceBenchmark.java | 3 +- .../SparkDistributedDataScanTestBase.java | 2 + .../TestSparkDistributedDataScanDeletes.java | 2 + ...stSparkDistributedDataScanFilterFiles.java | 2 + ...TestSparkDistributedDataScanReporting.java | 2 + .../iceberg/spark/DummyMetricsServlet.java | 62 +++++++++++++++++++ .../org/apache/iceberg/spark/TestBase.java | 9 +++ ...rquetDictionaryEncodedVectorizedReads.java | 2 + .../iceberg/spark/source/ScanTestBase.java | 2 + .../spark/source/TestFilteredScan.java | 2 + .../source/TestForwardCompatibility.java | 2 + .../spark/source/TestIcebergSpark.java | 2 + .../spark/source/TestPartitionPruning.java | 2 + .../spark/source/TestPartitionValues.java | 2 + .../spark/source/TestSnapshotSelection.java | 2 + .../spark/source/TestSparkDataFile.java | 2 + .../spark/source/TestSparkDataWrite.java | 2 + .../spark/source/TestSparkReadProjection.java | 2 + .../spark/source/TestSparkReaderDeletes.java | 2 + .../TestSparkReaderWithBloomFilter.java | 2 + .../spark/source/TestStructuredStreaming.java | 2 + 
.../spark/source/TestWriteMetricsConfig.java | 2 + .../spark/sql/TestAggregatePushDown.java | 1 + 145 files changed, 568 insertions(+), 119 deletions(-) create mode 100644 spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/DummyMetricsServlet.java create mode 100644 spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/DummyMetricsServlet.java create mode 100644 spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/DummyMetricsServlet.java create mode 100644 spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/DummyMetricsServlet.java diff --git a/aws/src/integration/java/org/apache/iceberg/aws/s3/signer/TestS3RestSigner.java b/aws/src/integration/java/org/apache/iceberg/aws/s3/signer/TestS3RestSigner.java index b51d97cc611a..f09360915725 100644 --- a/aws/src/integration/java/org/apache/iceberg/aws/s3/signer/TestS3RestSigner.java +++ b/aws/src/integration/java/org/apache/iceberg/aws/s3/signer/TestS3RestSigner.java @@ -37,10 +37,10 @@ import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.rest.auth.OAuth2Properties; import org.apache.iceberg.util.ThreadPools; +import org.eclipse.jetty.ee10.servlet.ServletContextHandler; +import org.eclipse.jetty.ee10.servlet.ServletHolder; import org.eclipse.jetty.server.Server; import org.eclipse.jetty.server.handler.gzip.GzipHandler; -import org.eclipse.jetty.servlet.ServletContextHandler; -import org.eclipse.jetty.servlet.ServletHolder; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeAll; @@ -181,6 +181,7 @@ public void before() throws Exception { CreateMultipartUploadRequest.builder().bucket(BUCKET).key("random/multipart-key").build()); } + @SuppressWarnings("removal") private static Server initHttpServer() throws Exception { S3SignerServlet.SignRequestValidator deleteObjectsWithBody = new S3SignerServlet.SignRequestValidator( diff --git 
a/core/src/test/java/org/apache/iceberg/rest/TestBaseWithRESTServer.java b/core/src/test/java/org/apache/iceberg/rest/TestBaseWithRESTServer.java index a79977c2464e..c386ecf60f67 100644 --- a/core/src/test/java/org/apache/iceberg/rest/TestBaseWithRESTServer.java +++ b/core/src/test/java/org/apache/iceberg/rest/TestBaseWithRESTServer.java @@ -34,10 +34,10 @@ import org.apache.iceberg.inmemory.InMemoryCatalog; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.rest.responses.ErrorResponse; +import org.eclipse.jetty.ee10.servlet.ServletContextHandler; +import org.eclipse.jetty.ee10.servlet.ServletHolder; import org.eclipse.jetty.server.Server; import org.eclipse.jetty.server.handler.gzip.GzipHandler; -import org.eclipse.jetty.servlet.ServletContextHandler; -import org.eclipse.jetty.servlet.ServletHolder; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.io.TempDir; @@ -61,6 +61,7 @@ public abstract class TestBaseWithRESTServer { @TempDir private Path temp; + @SuppressWarnings("removal") @BeforeEach public void before() throws Exception { File warehouse = temp.toFile(); diff --git a/core/src/test/java/org/apache/iceberg/rest/TestRESTCatalog.java b/core/src/test/java/org/apache/iceberg/rest/TestRESTCatalog.java index 571b8002389f..f6050d133313 100644 --- a/core/src/test/java/org/apache/iceberg/rest/TestRESTCatalog.java +++ b/core/src/test/java/org/apache/iceberg/rest/TestRESTCatalog.java @@ -120,10 +120,10 @@ import org.apache.iceberg.util.Pair; import org.assertj.core.api.InstanceOfAssertFactories; import org.awaitility.Awaitility; +import org.eclipse.jetty.ee10.servlet.ServletContextHandler; +import org.eclipse.jetty.ee10.servlet.ServletHolder; import org.eclipse.jetty.server.Server; import org.eclipse.jetty.server.handler.gzip.GzipHandler; -import org.eclipse.jetty.servlet.ServletContextHandler; -import org.eclipse.jetty.servlet.ServletHolder; import 
org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -271,6 +271,7 @@ protected T execute( private Server httpServer; private HeaderValidatingAdapter adapterForRESTServer; + @SuppressWarnings("removal") @BeforeEach public void createCatalog() throws Exception { File warehouse = temp.toFile(); @@ -416,6 +417,15 @@ protected boolean requiresNamespaceCreate() { return true; } + @Override + protected boolean supportsNamesWithSlashes() { + // names with slashes are rejected and considered as suspicious characters after upgrading Jetty + // and the Servlet API. See also + // https://jakarta.ee/specifications/servlet/6.0/jakarta-servlet-spec-6.0.html#uri-path-canonicalization + // for additional details + return false; + } + /* RESTCatalog specific tests */ @Test diff --git a/core/src/test/java/org/apache/iceberg/rest/TestRESTViewCatalog.java b/core/src/test/java/org/apache/iceberg/rest/TestRESTViewCatalog.java index fd2faf55087c..f02ab2b9bbd4 100644 --- a/core/src/test/java/org/apache/iceberg/rest/TestRESTViewCatalog.java +++ b/core/src/test/java/org/apache/iceberg/rest/TestRESTViewCatalog.java @@ -57,10 +57,10 @@ import org.apache.iceberg.rest.responses.LoadViewResponse; import org.apache.iceberg.view.ViewCatalogTests; import org.apache.iceberg.view.ViewMetadata; +import org.eclipse.jetty.ee10.servlet.ServletContextHandler; +import org.eclipse.jetty.ee10.servlet.ServletHolder; import org.eclipse.jetty.server.Server; import org.eclipse.jetty.server.handler.gzip.GzipHandler; -import org.eclipse.jetty.servlet.ServletContextHandler; -import org.eclipse.jetty.servlet.ServletHolder; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -78,6 +78,7 @@ public class TestRESTViewCatalog extends ViewCatalogTests { protected InMemoryCatalog backendCatalog; protected Server httpServer; + @SuppressWarnings("removal") @BeforeEach public void createCatalog() 
throws Exception { File warehouse = temp.toFile(); diff --git a/core/src/test/java/org/apache/iceberg/rest/TestRESTViewCatalogWithAssumedViewSupport.java b/core/src/test/java/org/apache/iceberg/rest/TestRESTViewCatalogWithAssumedViewSupport.java index 1ba340cc56c2..fa999e803325 100644 --- a/core/src/test/java/org/apache/iceberg/rest/TestRESTViewCatalogWithAssumedViewSupport.java +++ b/core/src/test/java/org/apache/iceberg/rest/TestRESTViewCatalogWithAssumedViewSupport.java @@ -31,14 +31,15 @@ import org.apache.iceberg.inmemory.InMemoryCatalog; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.rest.responses.ConfigResponse; +import org.eclipse.jetty.ee10.servlet.ServletContextHandler; +import org.eclipse.jetty.ee10.servlet.ServletHolder; import org.eclipse.jetty.server.Server; import org.eclipse.jetty.server.handler.gzip.GzipHandler; -import org.eclipse.jetty.servlet.ServletContextHandler; -import org.eclipse.jetty.servlet.ServletHolder; import org.junit.jupiter.api.BeforeEach; public class TestRESTViewCatalogWithAssumedViewSupport extends TestRESTViewCatalog { + @SuppressWarnings("removal") @BeforeEach public void createCatalog() throws Exception { File warehouse = temp.toFile(); diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index 97bdd68a7831..3a2f277e4568 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -65,7 +65,7 @@ jakarta-el-api = "3.0.3" jakarta-servlet-api = "6.1.0" jaxb-api = "2.3.1" jaxb-runtime = "2.3.9" -jetty = "11.0.26" +jetty = "12.1.5" junit = "5.14.3" junit-platform = "1.14.3" junit-pioneer = "2.3.0" @@ -203,7 +203,7 @@ guava-testlib = { module = "com.google.guava:guava-testlib", version.ref = "guav jakarta-el-api = { module = "jakarta.el:jakarta.el-api", version.ref = "jakarta-el-api" } jakarta-servlet = {module = "jakarta.servlet:jakarta.servlet-api", version.ref = "jakarta-servlet-api"} jetty-server = { module = "org.eclipse.jetty:jetty-server", 
version.ref = "jetty" } -jetty-servlet = { module = "org.eclipse.jetty:jetty-servlet", version.ref = "jetty" } +jetty-servlet = { module = "org.eclipse.jetty.ee10:jetty-ee10-servlet", version.ref = "jetty" } junit-jupiter = { module = "org.junit.jupiter:junit-jupiter", version.ref = "junit" } junit-jupiter-engine = { module = "org.junit.jupiter:junit-jupiter-engine", version.ref = "junit" } junit-pioneer = { module = "org.junit-pioneer:junit-pioneer", version.ref = "junit-pioneer" } diff --git a/open-api/src/test/java/org/apache/iceberg/rest/RESTCompatibilityKitCatalogTests.java b/open-api/src/test/java/org/apache/iceberg/rest/RESTCompatibilityKitCatalogTests.java index 87ec90663db2..9a1f86706db6 100644 --- a/open-api/src/test/java/org/apache/iceberg/rest/RESTCompatibilityKitCatalogTests.java +++ b/open-api/src/test/java/org/apache/iceberg/rest/RESTCompatibilityKitCatalogTests.java @@ -97,4 +97,13 @@ protected boolean supportsNamesWithDot() { return PropertyUtil.propertyAsBoolean( restCatalog.properties(), RESTCompatibilityKitSuite.RCK_SUPPORTS_NAMES_WITH_DOT, false); } + + @Override + protected boolean supportsNamesWithSlashes() { + // names with slashes are rejected and considered as suspicious characters after upgrading Jetty + // and the Servlet API. 
See also + // https://jakarta.ee/specifications/servlet/6.0/jakarta-servlet-spec-6.0.html#uri-path-canonicalization + // for additional details + return false; + } } diff --git a/open-api/src/testFixtures/java/org/apache/iceberg/rest/RESTCatalogServer.java b/open-api/src/testFixtures/java/org/apache/iceberg/rest/RESTCatalogServer.java index 5f0f89d92646..34d8761a902b 100644 --- a/open-api/src/testFixtures/java/org/apache/iceberg/rest/RESTCatalogServer.java +++ b/open-api/src/testFixtures/java/org/apache/iceberg/rest/RESTCatalogServer.java @@ -28,12 +28,12 @@ import org.apache.iceberg.jdbc.JdbcCatalog; import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.util.PropertyUtil; +import org.eclipse.jetty.ee10.servlet.ServletContextHandler; +import org.eclipse.jetty.ee10.servlet.ServletHolder; import org.eclipse.jetty.server.Connector; import org.eclipse.jetty.server.Server; import org.eclipse.jetty.server.ServerConnector; import org.eclipse.jetty.server.handler.gzip.GzipHandler; -import org.eclipse.jetty.servlet.ServletContextHandler; -import org.eclipse.jetty.servlet.ServletHolder; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -106,6 +106,7 @@ private CatalogContext initializeBackendCatalog() throws IOException { catalogProperties); } + @SuppressWarnings("removal") public void start(boolean join) throws Exception { CatalogContext catalogContext = initializeBackendCatalog(); diff --git a/spark/v3.4/build.gradle b/spark/v3.4/build.gradle index bb8270e3d303..ead4a32f49b0 100644 --- a/spark/v3.4/build.gradle +++ b/spark/v3.4/build.gradle @@ -105,13 +105,8 @@ project(":iceberg-spark:iceberg-spark-${sparkMajorVersion}_${scalaVersion}") { testImplementation project(path: ':iceberg-api', configuration: 'testArtifacts') testImplementation project(path: ':iceberg-core', configuration: 'testArtifacts') testImplementation project(path: ':iceberg-data', configuration: 'testArtifacts') - testImplementation (project(path: 
':iceberg-open-api', configuration: 'testFixturesRuntimeElements')) { - transitive = false - } - testImplementation libs.sqlite.jdbc + testImplementation (project(path: ':iceberg-open-api', configuration: 'testFixturesRuntimeElements')) testImplementation libs.awaitility - // runtime dependencies for running REST Catalog based integration test - testRuntimeOnly libs.jetty.servlet } test { @@ -172,13 +167,7 @@ project(":iceberg-spark:iceberg-spark-extensions-${sparkMajorVersion}_${scalaVer testImplementation project(path: ':iceberg-core', configuration: 'testArtifacts') testImplementation project(path: ':iceberg-hive-metastore', configuration: 'testArtifacts') testImplementation project(path: ":iceberg-spark:iceberg-spark-${sparkMajorVersion}_${scalaVersion}", configuration: 'testArtifacts') - testImplementation (project(path: ':iceberg-open-api', configuration: 'testFixturesRuntimeElements')) { - transitive = false - } - // runtime dependencies for running REST Catalog based integration test - testRuntimeOnly libs.jetty.servlet - testRuntimeOnly libs.sqlite.jdbc - + testImplementation (project(path: ':iceberg-open-api', configuration: 'testFixturesRuntimeElements')) testImplementation libs.avro.avro testImplementation libs.parquet.hadoop testImplementation libs.awaitility @@ -269,11 +258,7 @@ project(":iceberg-spark:iceberg-spark-runtime-${sparkMajorVersion}_${scalaVersio integrationRuntimeOnly project(':iceberg-hive-metastore') // runtime dependencies for running REST Catalog based integration test integrationRuntimeOnly project(path: ':iceberg-core', configuration: 'testArtifacts') - integrationRuntimeOnly (project(path: ':iceberg-open-api', configuration: 'testFixturesRuntimeElements')) { - transitive = false - } - integrationRuntimeOnly libs.jetty.servlet - integrationRuntimeOnly libs.sqlite.jdbc + integrationRuntimeOnly (project(path: ':iceberg-open-api', configuration: 'testFixturesRuntimeElements')) // Not allowed on our classpath, only the runtime jar is 
allowed integrationCompileOnly project(":iceberg-spark:iceberg-spark-extensions-${sparkMajorVersion}_${scalaVersion}") diff --git a/spark/v3.4/spark-extensions/src/jmh/java/org/apache/iceberg/DeleteFileIndexBenchmark.java b/spark/v3.4/spark-extensions/src/jmh/java/org/apache/iceberg/DeleteFileIndexBenchmark.java index f48e39e500c0..86f3f19de937 100644 --- a/spark/v3.4/spark-extensions/src/jmh/java/org/apache/iceberg/DeleteFileIndexBenchmark.java +++ b/spark/v3.4/spark-extensions/src/jmh/java/org/apache/iceberg/DeleteFileIndexBenchmark.java @@ -31,6 +31,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.spark.Spark3Util; import org.apache.iceberg.spark.SparkSessionCatalog; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions; import org.apache.iceberg.util.ThreadPools; import org.apache.spark.sql.SparkSession; @@ -157,7 +158,7 @@ private void initDataAndDeletes() { private void setupSpark() { this.spark = SparkSession.builder() - .config("spark.ui.enabled", false) + .config(TestBase.DISABLE_UI) .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .config("spark.sql.extensions", IcebergSparkSessionExtensions.class.getName()) .config("spark.sql.catalog.spark_catalog", SparkSessionCatalog.class.getName()) diff --git a/spark/v3.4/spark-extensions/src/jmh/java/org/apache/iceberg/spark/MergeCardinalityCheckBenchmark.java b/spark/v3.4/spark-extensions/src/jmh/java/org/apache/iceberg/spark/MergeCardinalityCheckBenchmark.java index dc625d240769..97e6b86dabce 100644 --- a/spark/v3.4/spark-extensions/src/jmh/java/org/apache/iceberg/spark/MergeCardinalityCheckBenchmark.java +++ b/spark/v3.4/spark-extensions/src/jmh/java/org/apache/iceberg/spark/MergeCardinalityCheckBenchmark.java @@ -155,7 +155,7 @@ private void runBenchmark(RowLevelOperationMode mode, double updatePercentage) { private void setupSpark() { this.spark = SparkSession.builder() - 
.config("spark.ui.enabled", false) + .config(TestBase.DISABLE_UI) .config("spark.sql.extensions", IcebergSparkSessionExtensions.class.getName()) .config("spark.sql.catalog.spark_catalog", SparkSessionCatalog.class.getName()) .config("spark.sql.catalog.spark_catalog.type", "hadoop") diff --git a/spark/v3.4/spark-extensions/src/jmh/java/org/apache/iceberg/spark/PlanningBenchmark.java b/spark/v3.4/spark-extensions/src/jmh/java/org/apache/iceberg/spark/PlanningBenchmark.java index 05aa9602a323..52884bf10308 100644 --- a/spark/v3.4/spark-extensions/src/jmh/java/org/apache/iceberg/spark/PlanningBenchmark.java +++ b/spark/v3.4/spark-extensions/src/jmh/java/org/apache/iceberg/spark/PlanningBenchmark.java @@ -157,7 +157,7 @@ public void localPlanningWithoutFilterWithStats(Blackhole blackhole) { private void setupSpark() { this.spark = SparkSession.builder() - .config("spark.ui.enabled", false) + .config(TestBase.DISABLE_UI) .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .config("spark.sql.extensions", IcebergSparkSessionExtensions.class.getName()) .config("spark.sql.catalog.spark_catalog", SparkSessionCatalog.class.getName()) diff --git a/spark/v3.4/spark-extensions/src/jmh/java/org/apache/iceberg/spark/TaskGroupPlanningBenchmark.java b/spark/v3.4/spark-extensions/src/jmh/java/org/apache/iceberg/spark/TaskGroupPlanningBenchmark.java index 2b85a8e385ec..5e39596f6ac6 100644 --- a/spark/v3.4/spark-extensions/src/jmh/java/org/apache/iceberg/spark/TaskGroupPlanningBenchmark.java +++ b/spark/v3.4/spark-extensions/src/jmh/java/org/apache/iceberg/spark/TaskGroupPlanningBenchmark.java @@ -242,7 +242,7 @@ private Dataset randomDataDF(Schema schema, int numRows) { private void setupSpark() { this.spark = SparkSession.builder() - .config("spark.ui.enabled", false) + .config(TestBase.DISABLE_UI) .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .config("spark.sql.extensions", IcebergSparkSessionExtensions.class.getName()) 
.config("spark.sql.catalog.spark_catalog", SparkSessionCatalog.class.getName()) diff --git a/spark/v3.4/spark-extensions/src/jmh/java/org/apache/iceberg/spark/UpdateProjectionBenchmark.java b/spark/v3.4/spark-extensions/src/jmh/java/org/apache/iceberg/spark/UpdateProjectionBenchmark.java index d917eae5eb0f..caa23625fc44 100644 --- a/spark/v3.4/spark-extensions/src/jmh/java/org/apache/iceberg/spark/UpdateProjectionBenchmark.java +++ b/spark/v3.4/spark-extensions/src/jmh/java/org/apache/iceberg/spark/UpdateProjectionBenchmark.java @@ -138,7 +138,7 @@ private void runBenchmark(RowLevelOperationMode mode, double updatePercentage) { private void setupSpark() { this.spark = SparkSession.builder() - .config("spark.ui.enabled", false) + .config(TestBase.DISABLE_UI) .config("spark.sql.extensions", IcebergSparkSessionExtensions.class.getName()) .config("spark.sql.catalog.spark_catalog", SparkSessionCatalog.class.getName()) .config("spark.sql.catalog.spark_catalog.type", "hadoop") diff --git a/spark/v3.4/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/ExtensionsTestBase.java b/spark/v3.4/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/ExtensionsTestBase.java index 4c1a5095916c..834640e24328 100644 --- a/spark/v3.4/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/ExtensionsTestBase.java +++ b/spark/v3.4/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/ExtensionsTestBase.java @@ -59,6 +59,7 @@ public static void startMetastoreAndSpark() { .config("spark.sql.legacy.respectNullabilityInTextDatasetConversion", "true") .config( SQLConf.ADAPTIVE_EXECUTION_ENABLED().key(), String.valueOf(RANDOM.nextBoolean())) + .config(TestBase.DISABLE_UI) .enableHiveSupport() .getOrCreate(); diff --git a/spark/v3.4/spark/src/jmh/java/org/apache/iceberg/spark/action/DeleteOrphanFilesBenchmark.java b/spark/v3.4/spark/src/jmh/java/org/apache/iceberg/spark/action/DeleteOrphanFilesBenchmark.java index 68406a20e725..317bd96e7df1 
100644 --- a/spark/v3.4/spark/src/jmh/java/org/apache/iceberg/spark/action/DeleteOrphanFilesBenchmark.java +++ b/spark/v3.4/spark/src/jmh/java/org/apache/iceberg/spark/action/DeleteOrphanFilesBenchmark.java @@ -37,6 +37,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.spark.Spark3Util; import org.apache.iceberg.spark.SparkSessionCatalog; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.actions.SparkActions; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; @@ -179,6 +180,7 @@ private void setupSpark() { .config("spark.sql.catalog.spark_catalog", SparkSessionCatalog.class.getName()) .config("spark.sql.catalog.spark_catalog.type", "hadoop") .config("spark.sql.catalog.spark_catalog.warehouse", catalogWarehouse()) + .config(TestBase.DISABLE_UI) .master("local"); spark = builder.getOrCreate(); } diff --git a/spark/v3.4/spark/src/jmh/java/org/apache/iceberg/spark/action/IcebergSortCompactionBenchmark.java b/spark/v3.4/spark/src/jmh/java/org/apache/iceberg/spark/action/IcebergSortCompactionBenchmark.java index 2ac7c26992e3..073e8c9327df 100644 --- a/spark/v3.4/spark/src/jmh/java/org/apache/iceberg/spark/action/IcebergSortCompactionBenchmark.java +++ b/spark/v3.4/spark/src/jmh/java/org/apache/iceberg/spark/action/IcebergSortCompactionBenchmark.java @@ -41,6 +41,7 @@ import org.apache.iceberg.spark.Spark3Util; import org.apache.iceberg.spark.SparkSchemaUtil; import org.apache.iceberg.spark.SparkSessionCatalog; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.actions.SparkActions; import org.apache.iceberg.types.Types; import org.apache.spark.sql.Dataset; @@ -393,6 +394,7 @@ protected void setupSpark() { "spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog") .config("spark.sql.catalog.spark_catalog.type", "hadoop") .config("spark.sql.catalog.spark_catalog.warehouse", getCatalogWarehouse()) + .config(TestBase.DISABLE_UI) 
.master("local[*]"); spark = builder.getOrCreate(); Configuration sparkHadoopConf = spark.sessionState().newHadoopConf(); diff --git a/spark/v3.4/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceBenchmark.java b/spark/v3.4/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceBenchmark.java index 68c537e34a4a..debe37866ff7 100644 --- a/spark/v3.4/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceBenchmark.java +++ b/spark/v3.4/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceBenchmark.java @@ -30,6 +30,7 @@ import org.apache.iceberg.UpdateProperties; import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.spark.SparkSchemaUtil; +import org.apache.iceberg.spark.TestBase; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.SaveMode; @@ -94,7 +95,7 @@ protected void cleanupFiles() throws IOException { } protected void setupSpark(boolean enableDictionaryEncoding) { - SparkSession.Builder builder = SparkSession.builder().config("spark.ui.enabled", false); + SparkSession.Builder builder = SparkSession.builder().config(TestBase.DISABLE_UI); if (!enableDictionaryEncoding) { builder .config("parquet.dictionary.page.size", "1") diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/SparkDistributedDataScanTestBase.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/SparkDistributedDataScanTestBase.java index 404ba7284606..9b08d6f7ab1e 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/SparkDistributedDataScanTestBase.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/SparkDistributedDataScanTestBase.java @@ -25,6 +25,7 @@ import java.util.List; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.spark.SparkReadConf; +import org.apache.iceberg.spark.TestBase; import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.internal.SQLConf; import 
org.junit.jupiter.api.BeforeEach; @@ -90,6 +91,7 @@ protected static SparkSession initSpark(String serializer) { .master("local[2]") .config("spark.serializer", serializer) .config(SQLConf.SHUFFLE_PARTITIONS().key(), "4") + .config(TestBase.DISABLE_UI) .getOrCreate(); } } diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanDeletes.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanDeletes.java index 9361c63176e0..8eeb55171dbe 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanDeletes.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanDeletes.java @@ -25,6 +25,7 @@ import java.util.List; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.spark.SparkReadConf; +import org.apache.iceberg.spark.TestBase; import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.internal.SQLConf; import org.junit.jupiter.api.AfterAll; @@ -69,6 +70,7 @@ public static void startSpark() { .master("local[2]") .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .config(SQLConf.SHUFFLE_PARTITIONS().key(), "4") + .config(TestBase.DISABLE_UI) .getOrCreate(); } diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanFilterFiles.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanFilterFiles.java index a218f965ea65..eae640528f9e 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanFilterFiles.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanFilterFiles.java @@ -23,6 +23,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.spark.SparkReadConf; +import org.apache.iceberg.spark.TestBase; import org.apache.spark.sql.SparkSession; import 
org.apache.spark.sql.internal.SQLConf; import org.junit.jupiter.api.AfterAll; @@ -62,6 +63,7 @@ public static void startSpark() { .master("local[2]") .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .config(SQLConf.SHUFFLE_PARTITIONS().key(), "4") + .config(TestBase.DISABLE_UI) .getOrCreate(); } diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanReporting.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanReporting.java index acd4688440d1..6ad0907fffed 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanReporting.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanReporting.java @@ -25,6 +25,7 @@ import java.util.List; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.spark.SparkReadConf; +import org.apache.iceberg.spark.TestBase; import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.internal.SQLConf; import org.junit.jupiter.api.AfterAll; @@ -59,6 +60,7 @@ public static void startSpark() { .master("local[2]") .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .config(SQLConf.SHUFFLE_PARTITIONS().key(), "4") + .config(TestBase.DISABLE_UI) .getOrCreate(); } diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/DummyMetricsServlet.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/DummyMetricsServlet.java new file mode 100644 index 000000000000..ee1f29e56fb3 --- /dev/null +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/DummyMetricsServlet.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.spark; + +import com.codahale.metrics.MetricRegistry; +import java.util.Properties; +import org.apache.spark.SparkConf; +import org.apache.spark.metrics.sink.MetricsServlet; +import org.sparkproject.jetty.servlet.ServletContextHandler; + +/** + * A dummy implementation of {@link MetricsServlet} that does not start a server or report metrics. + * This is used in tests to avoid conflicts with Spark's jetty dependencies. + */ +public class DummyMetricsServlet extends MetricsServlet { + + /** + * Constructor required by Spark's reflection-based instantiation. 
+ * + * @param properties Metrics properties + * @param registry Metric registry + */ + public DummyMetricsServlet(Properties properties, MetricRegistry registry) { + super(properties, registry); + } + + @Override + public ServletContextHandler[] getHandlers(SparkConf conf) { + return new ServletContextHandler[] {}; + } + + @Override + public void start() { + // No-op for tests + } + + @Override + public void stop() { + // No-op for tests + } + + @Override + public void report() { + // No-op for tests + } +} diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/TestBase.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/TestBase.java index 3c32b4693684..b89109174d90 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/TestBase.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/TestBase.java @@ -65,6 +65,13 @@ public abstract class TestBase extends SparkTestHelperBase { protected static SparkSession spark = null; protected static JavaSparkContext sparkContext = null; protected static HiveCatalog catalog = null; + // disable Spark UI and use dummy servlet to avoid dependency conflicts with Spark's Jetty version + public static final Map DISABLE_UI = + ImmutableMap.of( + "spark.ui.enabled", + "false", + "spark.metrics.conf.*.sink.servlet.class", + "org.apache.iceberg.spark.DummyMetricsServlet"); @BeforeAll public static void startMetastoreAndSpark() { @@ -79,6 +86,7 @@ public static void startMetastoreAndSpark() { .config(SQLConf.PARTITION_OVERWRITE_MODE().key(), "dynamic") .config("spark.hadoop." 
+ METASTOREURIS.varname, hiveConf.get(METASTOREURIS.varname)) .config("spark.sql.legacy.respectNullabilityInTextDatasetConversion", "true") + .config(DISABLE_UI) .enableHiveSupport() .getOrCreate(); diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/ScanTestBase.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/ScanTestBase.java index 6647a1b483e0..c368c4a815fe 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/ScanTestBase.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/ScanTestBase.java @@ -37,6 +37,7 @@ import org.apache.iceberg.TableProperties; import org.apache.iceberg.hadoop.HadoopTables; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.data.AvroDataTestBase; import org.apache.iceberg.spark.data.RandomData; import org.apache.iceberg.spark.data.TestHelpers; @@ -62,6 +63,7 @@ public static void startSpark() { SparkSession.builder() .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) .master("local[2]") + .config(TestBase.DISABLE_UI) .getOrCreate(); ScanTestBase.sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); } diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java index 308b1bd2c646..cfc38ed66fac 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java @@ -58,6 +58,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.spark.SparkReadOptions; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.data.GenericsHelpers; import 
org.apache.iceberg.transforms.Transforms; import org.apache.iceberg.types.Types; @@ -125,6 +126,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); // define UDFs used by partition tests diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestForwardCompatibility.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestForwardCompatibility.java index c03f7b94eca9..dcd9c2897e08 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestForwardCompatibility.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestForwardCompatibility.java @@ -46,6 +46,7 @@ import org.apache.iceberg.io.OutputFile; import org.apache.iceberg.parquet.Parquet; import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.data.RandomData; import org.apache.iceberg.spark.data.TestHelpers; import org.apache.iceberg.types.Types; @@ -99,6 +100,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); } diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSpark.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSpark.java index f4f57157e479..a637b975fe2b 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSpark.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSpark.java @@ -28,6 +28,7 @@ import java.sql.Timestamp; import java.util.List; import org.apache.iceberg.spark.IcebergSpark; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.transforms.Transforms; import 
org.apache.iceberg.types.Types; import org.apache.spark.sql.Row; @@ -51,6 +52,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); } diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionPruning.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionPruning.java index becb9dcb4aca..cf3097ebdb30 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionPruning.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionPruning.java @@ -59,6 +59,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Sets; import org.apache.iceberg.spark.SparkReadOptions; import org.apache.iceberg.spark.SparkSchemaUtil; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.transforms.Transforms; import org.apache.iceberg.types.Types; import org.apache.spark.api.java.JavaRDD; @@ -118,6 +119,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); TestPartitionPruning.sparkContext = JavaSparkContext.fromSparkContext(spark.sparkContext()); diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionValues.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionValues.java index b0ad930487b1..82575a720236 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionValues.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionValues.java @@ -46,6 +46,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.spark.SparkReadOptions; import org.apache.iceberg.spark.SparkWriteOptions; 
+import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.data.RandomData; import org.apache.iceberg.spark.data.TestHelpers; import org.apache.iceberg.types.Types; @@ -112,6 +113,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); } diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSnapshotSelection.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSnapshotSelection.java index 11865db7fce5..fe754f4a02ba 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSnapshotSelection.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSnapshotSelection.java @@ -42,6 +42,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.spark.SparkReadOptions; import org.apache.iceberg.spark.SparkSchemaUtil; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.types.Types; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; @@ -91,6 +92,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); } diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataFile.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataFile.java index 3c00835da382..f56333649261 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataFile.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataFile.java @@ -59,6 +59,7 @@ import org.apache.iceberg.spark.SparkDataFile; import org.apache.iceberg.spark.SparkDeleteFile; import org.apache.iceberg.spark.SparkSchemaUtil; +import 
org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.data.RandomData; import org.apache.iceberg.types.Conversions; import org.apache.iceberg.types.Types; @@ -125,6 +126,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); TestSparkDataFile.sparkContext = JavaSparkContext.fromSparkContext(spark.sparkContext()); } diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java index 1957f258e1ed..439c4443b990 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java @@ -51,6 +51,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.spark.SparkWriteOptions; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.types.Types; import org.apache.iceberg.util.SnapshotUtil; import org.apache.spark.sql.Dataset; @@ -99,6 +100,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); } diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReadProjection.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReadProjection.java index 8ccea303d0c1..de6a5e59029c 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReadProjection.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReadProjection.java @@ -50,6 +50,7 @@ import 
org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.spark.SparkReadOptions; import org.apache.iceberg.spark.SparkValueConverter; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.types.Type; import org.apache.iceberg.types.TypeUtil; import org.apache.iceberg.types.Types; @@ -88,6 +89,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); ImmutableMap config = ImmutableMap.of( diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderDeletes.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderDeletes.java index eff032743e3b..33b5a1d6e600 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderDeletes.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderDeletes.java @@ -71,6 +71,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Sets; import org.apache.iceberg.spark.SparkSchemaUtil; import org.apache.iceberg.spark.SparkStructLike; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.data.RandomData; import org.apache.iceberg.spark.data.SparkParquetWriters; import org.apache.iceberg.spark.source.metrics.NumDeletes; @@ -132,6 +133,7 @@ public static void startMetastoreAndSpark() { .config("spark.ui.liveUpdate.period", 0) .config(SQLConf.PARTITION_OVERWRITE_MODE().key(), "dynamic") .config("spark.hadoop." 
+ METASTOREURIS.varname, hiveConf.get(METASTOREURIS.varname)) + .config(TestBase.DISABLE_UI) .enableHiveSupport() .getOrCreate(); diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderWithBloomFilter.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderWithBloomFilter.java index d22ecb02d483..cb2f866fab10 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderWithBloomFilter.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderWithBloomFilter.java @@ -64,6 +64,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.spark.SparkValueConverter; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.types.Types; import org.apache.iceberg.util.PropertyUtil; import org.apache.spark.sql.Dataset; @@ -182,6 +183,7 @@ public static void startMetastoreAndSpark() { SparkSession.builder() .master("local[2]") .config("spark.hadoop." 
+ METASTOREURIS.varname, hiveConf.get(METASTOREURIS.varname)) + .config(TestBase.DISABLE_UI) .enableHiveSupport() .getOrCreate(); diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java index dc4fc7e187fb..a974b58a9714 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java @@ -34,6 +34,7 @@ import org.apache.iceberg.Table; import org.apache.iceberg.hadoop.HadoopTables; import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.types.Types; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoder; @@ -69,6 +70,7 @@ public static void startSpark() { .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) .config("spark.sql.shuffle.partitions", 4) + .config(TestBase.DISABLE_UI) .getOrCreate(); } diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestWriteMetricsConfig.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestWriteMetricsConfig.java index c3fac70dd3fc..45ff9184566b 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestWriteMetricsConfig.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestWriteMetricsConfig.java @@ -41,6 +41,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.spark.SparkWriteOptions; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.data.RandomData; import org.apache.iceberg.types.Types; import org.apache.iceberg.util.ByteBuffers; @@ -84,6 +85,7 @@ public static void startSpark() { 
SparkSession.builder() .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); TestWriteMetricsConfig.sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); } diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/sql/TestAggregatePushDown.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/sql/TestAggregatePushDown.java index 5ce56b4feca7..946456fe2be8 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/sql/TestAggregatePushDown.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/sql/TestAggregatePushDown.java @@ -63,6 +63,7 @@ public static void startMetastoreAndSpark() { SparkSession.builder() .master("local[2]") .config("spark.sql.iceberg.aggregate_pushdown", "true") + .config(TestBase.DISABLE_UI) .enableHiveSupport() .getOrCreate(); diff --git a/spark/v3.5/build.gradle b/spark/v3.5/build.gradle index 18fca51be251..a69b78e5ad8e 100644 --- a/spark/v3.5/build.gradle +++ b/spark/v3.5/build.gradle @@ -105,14 +105,9 @@ project(":iceberg-spark:iceberg-spark-${sparkMajorVersion}_${scalaVersion}") { testImplementation project(path: ':iceberg-api', configuration: 'testArtifacts') testImplementation project(path: ':iceberg-core', configuration: 'testArtifacts') testImplementation project(path: ':iceberg-data', configuration: 'testArtifacts') - testImplementation (project(path: ':iceberg-open-api', configuration: 'testFixturesRuntimeElements')) { - transitive = false - } - testImplementation libs.sqlite.jdbc + testImplementation (project(path: ':iceberg-open-api', configuration: 'testFixturesRuntimeElements')) testImplementation libs.awaitility testImplementation(testFixtures(project(':iceberg-parquet'))) - // runtime dependencies for running REST Catalog based integration test - testRuntimeOnly libs.jetty.servlet } test { @@ -173,13 +168,7 @@ 
project(":iceberg-spark:iceberg-spark-extensions-${sparkMajorVersion}_${scalaVer testImplementation project(path: ':iceberg-data', configuration: 'testArtifacts') testImplementation project(path: ':iceberg-hive-metastore', configuration: 'testArtifacts') testImplementation project(path: ":iceberg-spark:iceberg-spark-${sparkMajorVersion}_${scalaVersion}", configuration: 'testArtifacts') - testImplementation (project(path: ':iceberg-open-api', configuration: 'testFixturesRuntimeElements')) { - transitive = false - } - // runtime dependencies for running REST Catalog based integration test - testRuntimeOnly libs.jetty.servlet - testRuntimeOnly libs.sqlite.jdbc - + testImplementation (project(path: ':iceberg-open-api', configuration: 'testFixturesRuntimeElements')) testImplementation libs.avro.avro testImplementation libs.parquet.hadoop testImplementation libs.awaitility @@ -271,11 +260,7 @@ project(":iceberg-spark:iceberg-spark-runtime-${sparkMajorVersion}_${scalaVersio integrationRuntimeOnly project(':iceberg-hive-metastore') // runtime dependencies for running REST Catalog based integration test integrationRuntimeOnly project(path: ':iceberg-core', configuration: 'testArtifacts') - integrationRuntimeOnly (project(path: ':iceberg-open-api', configuration: 'testFixturesRuntimeElements')) { - transitive = false - } - integrationRuntimeOnly libs.jetty.servlet - integrationRuntimeOnly libs.sqlite.jdbc + integrationRuntimeOnly (project(path: ':iceberg-open-api', configuration: 'testFixturesRuntimeElements')) // Not allowed on our classpath, only the runtime jar is allowed integrationCompileOnly project(":iceberg-spark:iceberg-spark-extensions-${sparkMajorVersion}_${scalaVersion}") diff --git a/spark/v3.5/spark-extensions/src/jmh/java/org/apache/iceberg/DeleteFileIndexBenchmark.java b/spark/v3.5/spark-extensions/src/jmh/java/org/apache/iceberg/DeleteFileIndexBenchmark.java index 8b0b05911f66..242ef7439a39 100644 --- 
a/spark/v3.5/spark-extensions/src/jmh/java/org/apache/iceberg/DeleteFileIndexBenchmark.java +++ b/spark/v3.5/spark-extensions/src/jmh/java/org/apache/iceberg/DeleteFileIndexBenchmark.java @@ -31,6 +31,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.spark.Spark3Util; import org.apache.iceberg.spark.SparkSessionCatalog; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions; import org.apache.iceberg.util.ThreadPools; import org.apache.spark.sql.SparkSession; @@ -205,7 +206,7 @@ private void initDataAndDVs() { private void setupSpark() { this.spark = SparkSession.builder() - .config("spark.ui.enabled", false) + .config(TestBase.DISABLE_UI) .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .config("spark.sql.extensions", IcebergSparkSessionExtensions.class.getName()) .config("spark.sql.catalog.spark_catalog", SparkSessionCatalog.class.getName()) diff --git a/spark/v3.5/spark-extensions/src/jmh/java/org/apache/iceberg/spark/MergeCardinalityCheckBenchmark.java b/spark/v3.5/spark-extensions/src/jmh/java/org/apache/iceberg/spark/MergeCardinalityCheckBenchmark.java index d7f285288004..03e0410c0adc 100644 --- a/spark/v3.5/spark-extensions/src/jmh/java/org/apache/iceberg/spark/MergeCardinalityCheckBenchmark.java +++ b/spark/v3.5/spark-extensions/src/jmh/java/org/apache/iceberg/spark/MergeCardinalityCheckBenchmark.java @@ -155,7 +155,7 @@ private void runBenchmark(RowLevelOperationMode mode, double updatePercentage) { private void setupSpark() { this.spark = SparkSession.builder() - .config("spark.ui.enabled", false) + .config(TestBase.DISABLE_UI) .config("spark.sql.extensions", IcebergSparkSessionExtensions.class.getName()) .config("spark.sql.catalog.spark_catalog", SparkSessionCatalog.class.getName()) .config("spark.sql.catalog.spark_catalog.type", "hadoop") diff --git 
a/spark/v3.5/spark-extensions/src/jmh/java/org/apache/iceberg/spark/PlanningBenchmark.java b/spark/v3.5/spark-extensions/src/jmh/java/org/apache/iceberg/spark/PlanningBenchmark.java index 1d51350487c4..5cd8143f17bf 100644 --- a/spark/v3.5/spark-extensions/src/jmh/java/org/apache/iceberg/spark/PlanningBenchmark.java +++ b/spark/v3.5/spark-extensions/src/jmh/java/org/apache/iceberg/spark/PlanningBenchmark.java @@ -215,7 +215,7 @@ public void localPlanningViaDistributedScanWithoutFilterWithStats(Blackhole blac private void setupSpark() { this.spark = SparkSession.builder() - .config("spark.ui.enabled", false) + .config(TestBase.DISABLE_UI) .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .config("spark.driver.maxResultSize", "8G") .config("spark.sql.extensions", IcebergSparkSessionExtensions.class.getName()) diff --git a/spark/v3.5/spark-extensions/src/jmh/java/org/apache/iceberg/spark/TaskGroupPlanningBenchmark.java b/spark/v3.5/spark-extensions/src/jmh/java/org/apache/iceberg/spark/TaskGroupPlanningBenchmark.java index ad78205ce98c..a77c130ee17a 100644 --- a/spark/v3.5/spark-extensions/src/jmh/java/org/apache/iceberg/spark/TaskGroupPlanningBenchmark.java +++ b/spark/v3.5/spark-extensions/src/jmh/java/org/apache/iceberg/spark/TaskGroupPlanningBenchmark.java @@ -199,7 +199,7 @@ private void initDataAndDeletes() { private void setupSpark() { this.spark = SparkSession.builder() - .config("spark.ui.enabled", false) + .config(TestBase.DISABLE_UI) .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .config("spark.sql.extensions", IcebergSparkSessionExtensions.class.getName()) .config("spark.sql.catalog.spark_catalog", SparkSessionCatalog.class.getName()) diff --git a/spark/v3.5/spark-extensions/src/jmh/java/org/apache/iceberg/spark/UpdateProjectionBenchmark.java b/spark/v3.5/spark-extensions/src/jmh/java/org/apache/iceberg/spark/UpdateProjectionBenchmark.java index d917eae5eb0f..caa23625fc44 100644 --- 
a/spark/v3.5/spark-extensions/src/jmh/java/org/apache/iceberg/spark/UpdateProjectionBenchmark.java +++ b/spark/v3.5/spark-extensions/src/jmh/java/org/apache/iceberg/spark/UpdateProjectionBenchmark.java @@ -138,7 +138,7 @@ private void runBenchmark(RowLevelOperationMode mode, double updatePercentage) { private void setupSpark() { this.spark = SparkSession.builder() - .config("spark.ui.enabled", false) + .config(TestBase.DISABLE_UI) .config("spark.sql.extensions", IcebergSparkSessionExtensions.class.getName()) .config("spark.sql.catalog.spark_catalog", SparkSessionCatalog.class.getName()) .config("spark.sql.catalog.spark_catalog.type", "hadoop") diff --git a/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/TestExtendedParser.java b/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/TestExtendedParser.java index bfcb5af235d3..ef4f0090292c 100644 --- a/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/TestExtendedParser.java +++ b/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/TestExtendedParser.java @@ -49,7 +49,12 @@ public class TestExtendedParser { @BeforeAll public static void before() { - spark = SparkSession.builder().master("local").appName("TestExtendedParser").getOrCreate(); + spark = + SparkSession.builder() + .master("local") + .appName("TestExtendedParser") + .config(TestBase.DISABLE_UI) + .getOrCreate(); } @AfterAll diff --git a/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/ExtensionsTestBase.java b/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/ExtensionsTestBase.java index 4c1a5095916c..834640e24328 100644 --- a/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/ExtensionsTestBase.java +++ b/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/ExtensionsTestBase.java @@ -59,6 +59,7 @@ public static void startMetastoreAndSpark() { 
.config("spark.sql.legacy.respectNullabilityInTextDatasetConversion", "true") .config( SQLConf.ADAPTIVE_EXECUTION_ENABLED().key(), String.valueOf(RANDOM.nextBoolean())) + .config(TestBase.DISABLE_UI) .enableHiveSupport() .getOrCreate(); diff --git a/spark/v3.5/spark/src/jmh/java/org/apache/iceberg/spark/action/DeleteOrphanFilesBenchmark.java b/spark/v3.5/spark/src/jmh/java/org/apache/iceberg/spark/action/DeleteOrphanFilesBenchmark.java index 64edb1002e99..47fe46558d7e 100644 --- a/spark/v3.5/spark/src/jmh/java/org/apache/iceberg/spark/action/DeleteOrphanFilesBenchmark.java +++ b/spark/v3.5/spark/src/jmh/java/org/apache/iceberg/spark/action/DeleteOrphanFilesBenchmark.java @@ -37,6 +37,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.spark.Spark3Util; import org.apache.iceberg.spark.SparkSessionCatalog; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.actions.SparkActions; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; @@ -179,6 +180,7 @@ private void setupSpark() { .config("spark.sql.catalog.spark_catalog", SparkSessionCatalog.class.getName()) .config("spark.sql.catalog.spark_catalog.type", "hadoop") .config("spark.sql.catalog.spark_catalog.warehouse", catalogWarehouse()) + .config(TestBase.DISABLE_UI) .master("local"); spark = builder.getOrCreate(); } diff --git a/spark/v3.5/spark/src/jmh/java/org/apache/iceberg/spark/action/IcebergSortCompactionBenchmark.java b/spark/v3.5/spark/src/jmh/java/org/apache/iceberg/spark/action/IcebergSortCompactionBenchmark.java index 4978961be641..683f6bb46d05 100644 --- a/spark/v3.5/spark/src/jmh/java/org/apache/iceberg/spark/action/IcebergSortCompactionBenchmark.java +++ b/spark/v3.5/spark/src/jmh/java/org/apache/iceberg/spark/action/IcebergSortCompactionBenchmark.java @@ -41,6 +41,7 @@ import org.apache.iceberg.spark.Spark3Util; import org.apache.iceberg.spark.SparkSchemaUtil; import org.apache.iceberg.spark.SparkSessionCatalog; 
+import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.actions.SparkActions; import org.apache.iceberg.types.Types; import org.apache.spark.sql.Dataset; @@ -394,6 +395,7 @@ protected void setupSpark() { "spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog") .config("spark.sql.catalog.spark_catalog.type", "hadoop") .config("spark.sql.catalog.spark_catalog.warehouse", getCatalogWarehouse()) + .config(TestBase.DISABLE_UI) .master("local[*]"); spark = builder.getOrCreate(); Configuration sparkHadoopConf = spark.sessionState().newHadoopConf(); diff --git a/spark/v3.5/spark/src/jmh/java/org/apache/iceberg/spark/source/DVReaderBenchmark.java b/spark/v3.5/spark/src/jmh/java/org/apache/iceberg/spark/source/DVReaderBenchmark.java index c6794e43c636..3f242ce228ca 100644 --- a/spark/v3.5/spark/src/jmh/java/org/apache/iceberg/spark/source/DVReaderBenchmark.java +++ b/spark/v3.5/spark/src/jmh/java/org/apache/iceberg/spark/source/DVReaderBenchmark.java @@ -49,6 +49,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Sets; import org.apache.iceberg.spark.Spark3Util; import org.apache.iceberg.spark.SparkSessionCatalog; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.util.ContentFileUtil; import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.catalyst.InternalRow; @@ -234,7 +235,7 @@ private String generateDataFilePath() { private void setupSpark() { this.spark = SparkSession.builder() - .config("spark.ui.enabled", false) + .config(TestBase.DISABLE_UI) .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .config("spark.sql.catalog.spark_catalog", SparkSessionCatalog.class.getName()) .config("spark.sql.catalog.spark_catalog.type", "hadoop") diff --git a/spark/v3.5/spark/src/jmh/java/org/apache/iceberg/spark/source/DVWriterBenchmark.java b/spark/v3.5/spark/src/jmh/java/org/apache/iceberg/spark/source/DVWriterBenchmark.java index ac74fb5a109c..db5789724056 100644 --- 
a/spark/v3.5/spark/src/jmh/java/org/apache/iceberg/spark/source/DVWriterBenchmark.java +++ b/spark/v3.5/spark/src/jmh/java/org/apache/iceberg/spark/source/DVWriterBenchmark.java @@ -43,6 +43,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Sets; import org.apache.iceberg.spark.Spark3Util; import org.apache.iceberg.spark.SparkSessionCatalog; +import org.apache.iceberg.spark.TestBase; import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.catalyst.analysis.NoSuchTableException; @@ -218,7 +219,7 @@ private String generateDataFilePath() { private void setupSpark() { this.spark = SparkSession.builder() - .config("spark.ui.enabled", false) + .config(TestBase.DISABLE_UI) .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .config("spark.sql.catalog.spark_catalog", SparkSessionCatalog.class.getName()) .config("spark.sql.catalog.spark_catalog.type", "hadoop") diff --git a/spark/v3.5/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceBenchmark.java b/spark/v3.5/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceBenchmark.java index 68c537e34a4a..debe37866ff7 100644 --- a/spark/v3.5/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceBenchmark.java +++ b/spark/v3.5/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceBenchmark.java @@ -30,6 +30,7 @@ import org.apache.iceberg.UpdateProperties; import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.spark.SparkSchemaUtil; +import org.apache.iceberg.spark.TestBase; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.SaveMode; @@ -94,7 +95,7 @@ protected void cleanupFiles() throws IOException { } protected void setupSpark(boolean enableDictionaryEncoding) { - SparkSession.Builder builder = SparkSession.builder().config("spark.ui.enabled", false); + SparkSession.Builder builder = 
SparkSession.builder().config(TestBase.DISABLE_UI); if (!enableDictionaryEncoding) { builder .config("parquet.dictionary.page.size", "1") diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/SparkDistributedDataScanTestBase.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/SparkDistributedDataScanTestBase.java index 404ba7284606..9b08d6f7ab1e 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/SparkDistributedDataScanTestBase.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/SparkDistributedDataScanTestBase.java @@ -25,6 +25,7 @@ import java.util.List; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.spark.SparkReadConf; +import org.apache.iceberg.spark.TestBase; import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.internal.SQLConf; import org.junit.jupiter.api.BeforeEach; @@ -90,6 +91,7 @@ protected static SparkSession initSpark(String serializer) { .master("local[2]") .config("spark.serializer", serializer) .config(SQLConf.SHUFFLE_PARTITIONS().key(), "4") + .config(TestBase.DISABLE_UI) .getOrCreate(); } } diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanDeletes.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanDeletes.java index 659507e4c5e3..e28603c0b43a 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanDeletes.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanDeletes.java @@ -25,6 +25,7 @@ import java.util.List; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.spark.SparkReadConf; +import org.apache.iceberg.spark.TestBase; import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.internal.SQLConf; import org.junit.jupiter.api.AfterAll; @@ -73,6 +74,7 @@ public static void startSpark() { .master("local[2]") .config("spark.serializer", 
"org.apache.spark.serializer.KryoSerializer") .config(SQLConf.SHUFFLE_PARTITIONS().key(), "4") + .config(TestBase.DISABLE_UI) .getOrCreate(); } diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanFilterFiles.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanFilterFiles.java index a218f965ea65..2967f0e22cec 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanFilterFiles.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanFilterFiles.java @@ -23,6 +23,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.spark.SparkReadConf; +import org.apache.iceberg.spark.TestBase; import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.internal.SQLConf; import org.junit.jupiter.api.AfterAll; @@ -61,6 +62,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") + .config(TestBase.DISABLE_UI) .config(SQLConf.SHUFFLE_PARTITIONS().key(), "4") .getOrCreate(); } diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanReporting.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanReporting.java index 2665d7ba8d3b..4f789d2c5ae9 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanReporting.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanReporting.java @@ -25,6 +25,7 @@ import java.util.List; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.spark.SparkReadConf; +import org.apache.iceberg.spark.TestBase; import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.internal.SQLConf; import org.junit.jupiter.api.AfterAll; @@ -63,6 +64,7 @@ public static void startSpark() { 
.master("local[2]") .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .config(SQLConf.SHUFFLE_PARTITIONS().key(), "4") + .config(TestBase.DISABLE_UI) .getOrCreate(); } diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/DummyMetricsServlet.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/DummyMetricsServlet.java new file mode 100644 index 000000000000..ee1f29e56fb3 --- /dev/null +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/DummyMetricsServlet.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.spark; + +import com.codahale.metrics.MetricRegistry; +import java.util.Properties; +import org.apache.spark.SparkConf; +import org.apache.spark.metrics.sink.MetricsServlet; +import org.sparkproject.jetty.servlet.ServletContextHandler; + +/** + * A dummy implementation of {@link MetricsServlet} that does not start a server or report metrics. + * This is used in tests to avoid conflicts with Spark's jetty dependencies. + */ +public class DummyMetricsServlet extends MetricsServlet { + + /** + * Constructor required by Spark's reflection-based instantiation. 
+ * + * @param properties Metrics properties + * @param registry Metric registry + */ + public DummyMetricsServlet(Properties properties, MetricRegistry registry) { + super(properties, registry); + } + + @Override + public ServletContextHandler[] getHandlers(SparkConf conf) { + return new ServletContextHandler[] {}; + } + + @Override + public void start() { + // No-op for tests + } + + @Override + public void stop() { + // No-op for tests + } + + @Override + public void report() { + // No-op for tests + } +} diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/TestBase.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/TestBase.java index daf4e29ac075..5e7e1a1f6193 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/TestBase.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/TestBase.java @@ -65,6 +65,13 @@ public abstract class TestBase extends SparkTestHelperBase { protected static SparkSession spark = null; protected static JavaSparkContext sparkContext = null; protected static HiveCatalog catalog = null; + // disable Spark UI and use dummy servlet to avoid dependency conflicts with Spark's Jetty version + public static final Map DISABLE_UI = + ImmutableMap.of( + "spark.ui.enabled", + "false", + "spark.metrics.conf.*.sink.servlet.class", + "org.apache.iceberg.spark.DummyMetricsServlet"); @BeforeAll public static void startMetastoreAndSpark() { @@ -79,6 +86,7 @@ public static void startMetastoreAndSpark() { .config(SQLConf.PARTITION_OVERWRITE_MODE().key(), "dynamic") .config("spark.hadoop." 
+ METASTOREURIS.varname, hiveConf.get(METASTOREURIS.varname)) .config("spark.sql.legacy.respectNullabilityInTextDatasetConversion", "true") + .config(DISABLE_UI) .enableHiveSupport() .getOrCreate(); diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetDictionaryEncodedVectorizedReads.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetDictionaryEncodedVectorizedReads.java index 284fa0b0552f..b61ecfa2f442 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetDictionaryEncodedVectorizedReads.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetDictionaryEncodedVectorizedReads.java @@ -43,6 +43,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Iterables; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.data.TestHelpers; import org.apache.iceberg.spark.data.vectorized.VectorizedSparkParquetReaders; import org.apache.iceberg.types.Types; @@ -65,6 +66,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); } diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/ScanTestBase.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/ScanTestBase.java index 1c5905744a75..39ea25ae6f54 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/ScanTestBase.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/ScanTestBase.java @@ -37,6 +37,7 @@ import org.apache.iceberg.TableProperties; import org.apache.iceberg.hadoop.HadoopTables; import 
org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.data.AvroDataTestBase; import org.apache.iceberg.spark.data.RandomData; import org.apache.iceberg.spark.data.TestHelpers; @@ -62,6 +63,7 @@ public static void startSpark() { SparkSession.builder() .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) .master("local[2]") + .config(TestBase.DISABLE_UI) .getOrCreate(); ScanTestBase.sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); } diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java index 61d6501a6847..26c2b6ab70cb 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java @@ -54,6 +54,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.spark.SparkReadOptions; import org.apache.iceberg.spark.SparkWriteOptions; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.data.GenericsHelpers; import org.apache.iceberg.types.Types; import org.apache.spark.sql.Dataset; @@ -116,6 +117,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); } diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestForwardCompatibility.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestForwardCompatibility.java index 153564f7d129..0ba3f0d35fd7 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestForwardCompatibility.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestForwardCompatibility.java 
@@ -46,6 +46,7 @@ import org.apache.iceberg.io.OutputFile; import org.apache.iceberg.parquet.Parquet; import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.data.RandomData; import org.apache.iceberg.spark.data.TestHelpers; import org.apache.iceberg.types.Types; @@ -98,6 +99,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); } diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSpark.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSpark.java index f4f57157e479..a637b975fe2b 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSpark.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSpark.java @@ -28,6 +28,7 @@ import java.sql.Timestamp; import java.util.List; import org.apache.iceberg.spark.IcebergSpark; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.transforms.Transforms; import org.apache.iceberg.types.Types; import org.apache.spark.sql.Row; @@ -51,6 +52,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); } diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionPruning.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionPruning.java index becb9dcb4aca..cf3097ebdb30 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionPruning.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionPruning.java @@ -59,6 +59,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Sets; 
import org.apache.iceberg.spark.SparkReadOptions; import org.apache.iceberg.spark.SparkSchemaUtil; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.transforms.Transforms; import org.apache.iceberg.types.Types; import org.apache.spark.api.java.JavaRDD; @@ -118,6 +119,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); TestPartitionPruning.sparkContext = JavaSparkContext.fromSparkContext(spark.sparkContext()); diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionValues.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionValues.java index 0b6ab2052b66..9b5b22a73f36 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionValues.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionValues.java @@ -46,6 +46,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.spark.SparkReadOptions; import org.apache.iceberg.spark.SparkWriteOptions; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.data.RandomData; import org.apache.iceberg.spark.data.TestHelpers; import org.apache.iceberg.types.Types; @@ -112,6 +113,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); } diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSnapshotSelection.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSnapshotSelection.java index 11865db7fce5..fe754f4a02ba 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSnapshotSelection.java +++ 
b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSnapshotSelection.java @@ -42,6 +42,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.spark.SparkReadOptions; import org.apache.iceberg.spark.SparkSchemaUtil; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.types.Types; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; @@ -91,6 +92,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); } diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataFile.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataFile.java index 600b3eab1d68..2122f2579e4a 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataFile.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataFile.java @@ -59,6 +59,7 @@ import org.apache.iceberg.spark.SparkDataFile; import org.apache.iceberg.spark.SparkDeleteFile; import org.apache.iceberg.spark.SparkSchemaUtil; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.data.RandomData; import org.apache.iceberg.types.Conversions; import org.apache.iceberg.types.Types; @@ -125,6 +126,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); TestSparkDataFile.sparkContext = JavaSparkContext.fromSparkContext(spark.sparkContext()); } diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java index 81cb54cceabc..70f3b986d23b 100644 --- 
a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java @@ -55,6 +55,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.spark.SparkWriteOptions; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.types.Types; import org.apache.iceberg.util.SnapshotUtil; import org.apache.spark.sql.Dataset; @@ -103,6 +104,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); } diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReadProjection.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReadProjection.java index 8ccea303d0c1..de6a5e59029c 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReadProjection.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReadProjection.java @@ -50,6 +50,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.spark.SparkReadOptions; import org.apache.iceberg.spark.SparkValueConverter; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.types.Type; import org.apache.iceberg.types.TypeUtil; import org.apache.iceberg.types.Types; @@ -88,6 +89,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); ImmutableMap config = ImmutableMap.of( diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderDeletes.java 
b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderDeletes.java index 7dd4c6f7cf76..2fb6933ce933 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderDeletes.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderDeletes.java @@ -75,6 +75,7 @@ import org.apache.iceberg.spark.ParquetBatchReadConf; import org.apache.iceberg.spark.SparkSchemaUtil; import org.apache.iceberg.spark.SparkStructLike; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.data.RandomData; import org.apache.iceberg.spark.data.SparkParquetWriters; import org.apache.iceberg.spark.source.metrics.NumDeletes; @@ -137,6 +138,7 @@ public static void startMetastoreAndSpark() { .config("spark.ui.liveUpdate.period", 0) .config(SQLConf.PARTITION_OVERWRITE_MODE().key(), "dynamic") .config("spark.hadoop." + METASTOREURIS.varname, hiveConf.get(METASTOREURIS.varname)) + .config(TestBase.DISABLE_UI) .enableHiveSupport() .getOrCreate(); diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderWithBloomFilter.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderWithBloomFilter.java index d22ecb02d483..cb2f866fab10 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderWithBloomFilter.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderWithBloomFilter.java @@ -64,6 +64,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.spark.SparkValueConverter; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.types.Types; import org.apache.iceberg.util.PropertyUtil; import org.apache.spark.sql.Dataset; @@ -182,6 +183,7 @@ public static void startMetastoreAndSpark() { SparkSession.builder() .master("local[2]") .config("spark.hadoop." 
+ METASTOREURIS.varname, hiveConf.get(METASTOREURIS.varname)) + .config(TestBase.DISABLE_UI) .enableHiveSupport() .getOrCreate(); diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java index 2a178a74137c..ab760010535b 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java @@ -38,6 +38,7 @@ import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.types.Types; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoder; @@ -73,6 +74,7 @@ public static void startSpark() { .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) .config("spark.sql.shuffle.partitions", 4) + .config(TestBase.DISABLE_UI) .getOrCreate(); } diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestWriteMetricsConfig.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestWriteMetricsConfig.java index c3fac70dd3fc..45ff9184566b 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestWriteMetricsConfig.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestWriteMetricsConfig.java @@ -41,6 +41,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.spark.SparkWriteOptions; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.data.RandomData; import org.apache.iceberg.types.Types; import org.apache.iceberg.util.ByteBuffers; @@ -84,6 
+85,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); TestWriteMetricsConfig.sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); } diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/sql/TestAggregatePushDown.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/sql/TestAggregatePushDown.java index 5ce56b4feca7..946456fe2be8 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/sql/TestAggregatePushDown.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/sql/TestAggregatePushDown.java @@ -63,6 +63,7 @@ public static void startMetastoreAndSpark() { SparkSession.builder() .master("local[2]") .config("spark.sql.iceberg.aggregate_pushdown", "true") + .config(TestBase.DISABLE_UI) .enableHiveSupport() .getOrCreate(); diff --git a/spark/v4.0/build.gradle b/spark/v4.0/build.gradle index 62111e104e26..ba2e0fd4bae2 100644 --- a/spark/v4.0/build.gradle +++ b/spark/v4.0/build.gradle @@ -105,14 +105,9 @@ project(":iceberg-spark:iceberg-spark-${sparkMajorVersion}_${scalaVersion}") { testImplementation project(path: ':iceberg-api', configuration: 'testArtifacts') testImplementation project(path: ':iceberg-core', configuration: 'testArtifacts') testImplementation project(path: ':iceberg-data', configuration: 'testArtifacts') - testImplementation (project(path: ':iceberg-open-api', configuration: 'testFixturesRuntimeElements')) { - transitive = false - } - testImplementation libs.sqlite.jdbc + testImplementation (project(path: ':iceberg-open-api', configuration: 'testFixturesRuntimeElements')) testImplementation libs.awaitility testImplementation(testFixtures(project(':iceberg-parquet'))) - // runtime dependencies for running REST Catalog based integration test - testRuntimeOnly libs.jetty.servlet } test { @@ -173,13 +168,7 @@ 
project(":iceberg-spark:iceberg-spark-extensions-${sparkMajorVersion}_${scalaVer testImplementation project(path: ':iceberg-data', configuration: 'testArtifacts') testImplementation project(path: ':iceberg-hive-metastore', configuration: 'testArtifacts') testImplementation project(path: ":iceberg-spark:iceberg-spark-${sparkMajorVersion}_${scalaVersion}", configuration: 'testArtifacts') - testImplementation (project(path: ':iceberg-open-api', configuration: 'testFixturesRuntimeElements')) { - transitive = false - } - // runtime dependencies for running REST Catalog based integration test - testRuntimeOnly libs.jetty.servlet - testRuntimeOnly libs.sqlite.jdbc - + testImplementation (project(path: ':iceberg-open-api', configuration: 'testFixturesRuntimeElements')) testImplementation libs.avro.avro testImplementation libs.parquet.hadoop testImplementation libs.awaitility @@ -271,11 +260,7 @@ project(":iceberg-spark:iceberg-spark-runtime-${sparkMajorVersion}_${scalaVersio integrationRuntimeOnly project(':iceberg-hive-metastore') // runtime dependencies for running REST Catalog based integration test integrationRuntimeOnly project(path: ':iceberg-core', configuration: 'testArtifacts') - integrationRuntimeOnly (project(path: ':iceberg-open-api', configuration: 'testFixturesRuntimeElements')) { - transitive = false - } - integrationRuntimeOnly libs.jetty.servlet - integrationRuntimeOnly libs.sqlite.jdbc + integrationRuntimeOnly (project(path: ':iceberg-open-api', configuration: 'testFixturesRuntimeElements')) // Not allowed on our classpath, only the runtime jar is allowed integrationCompileOnly project(":iceberg-spark:iceberg-spark-extensions-${sparkMajorVersion}_${scalaVersion}") diff --git a/spark/v4.0/spark-extensions/src/jmh/java/org/apache/iceberg/DeleteFileIndexBenchmark.java b/spark/v4.0/spark-extensions/src/jmh/java/org/apache/iceberg/DeleteFileIndexBenchmark.java index 9375ca3a4f46..5287ccd514ab 100644 --- 
a/spark/v4.0/spark-extensions/src/jmh/java/org/apache/iceberg/DeleteFileIndexBenchmark.java +++ b/spark/v4.0/spark-extensions/src/jmh/java/org/apache/iceberg/DeleteFileIndexBenchmark.java @@ -31,6 +31,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.spark.Spark3Util; import org.apache.iceberg.spark.SparkSessionCatalog; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions; import org.apache.iceberg.util.ThreadPools; import org.apache.spark.sql.SparkSession; @@ -205,7 +206,7 @@ private void initDataAndDVs() { private void setupSpark() { this.spark = SparkSession.builder() - .config("spark.ui.enabled", false) + .config(TestBase.DISABLE_UI) .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .config("spark.sql.extensions", IcebergSparkSessionExtensions.class.getName()) .config("spark.sql.catalog.spark_catalog", SparkSessionCatalog.class.getName()) diff --git a/spark/v4.0/spark-extensions/src/jmh/java/org/apache/iceberg/spark/MergeCardinalityCheckBenchmark.java b/spark/v4.0/spark-extensions/src/jmh/java/org/apache/iceberg/spark/MergeCardinalityCheckBenchmark.java index 963daa2c364c..ea31b98f1ac9 100644 --- a/spark/v4.0/spark-extensions/src/jmh/java/org/apache/iceberg/spark/MergeCardinalityCheckBenchmark.java +++ b/spark/v4.0/spark-extensions/src/jmh/java/org/apache/iceberg/spark/MergeCardinalityCheckBenchmark.java @@ -155,7 +155,7 @@ private void runBenchmark(RowLevelOperationMode mode, double updatePercentage) { private void setupSpark() { this.spark = SparkSession.builder() - .config("spark.ui.enabled", false) + .config(TestBase.DISABLE_UI) .config("spark.sql.extensions", IcebergSparkSessionExtensions.class.getName()) .config("spark.sql.catalog.spark_catalog", SparkSessionCatalog.class.getName()) .config("spark.sql.catalog.spark_catalog.type", "hadoop") diff --git 
a/spark/v4.0/spark-extensions/src/jmh/java/org/apache/iceberg/spark/PlanningBenchmark.java b/spark/v4.0/spark-extensions/src/jmh/java/org/apache/iceberg/spark/PlanningBenchmark.java index 34d9d70e6ccb..f9558240f8cb 100644 --- a/spark/v4.0/spark-extensions/src/jmh/java/org/apache/iceberg/spark/PlanningBenchmark.java +++ b/spark/v4.0/spark-extensions/src/jmh/java/org/apache/iceberg/spark/PlanningBenchmark.java @@ -215,7 +215,7 @@ public void localPlanningViaDistributedScanWithoutFilterWithStats(Blackhole blac private void setupSpark() { this.spark = SparkSession.builder() - .config("spark.ui.enabled", false) + .config(TestBase.DISABLE_UI) .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .config("spark.driver.maxResultSize", "8G") .config("spark.sql.extensions", IcebergSparkSessionExtensions.class.getName()) diff --git a/spark/v4.0/spark-extensions/src/jmh/java/org/apache/iceberg/spark/TaskGroupPlanningBenchmark.java b/spark/v4.0/spark-extensions/src/jmh/java/org/apache/iceberg/spark/TaskGroupPlanningBenchmark.java index 7c2def237874..e9c563b9b0ef 100644 --- a/spark/v4.0/spark-extensions/src/jmh/java/org/apache/iceberg/spark/TaskGroupPlanningBenchmark.java +++ b/spark/v4.0/spark-extensions/src/jmh/java/org/apache/iceberg/spark/TaskGroupPlanningBenchmark.java @@ -199,7 +199,7 @@ private void initDataAndDeletes() { private void setupSpark() { this.spark = SparkSession.builder() - .config("spark.ui.enabled", false) + .config(TestBase.DISABLE_UI) .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .config("spark.sql.extensions", IcebergSparkSessionExtensions.class.getName()) .config("spark.sql.catalog.spark_catalog", SparkSessionCatalog.class.getName()) diff --git a/spark/v4.0/spark-extensions/src/jmh/java/org/apache/iceberg/spark/UpdateProjectionBenchmark.java b/spark/v4.0/spark-extensions/src/jmh/java/org/apache/iceberg/spark/UpdateProjectionBenchmark.java index d917eae5eb0f..caa23625fc44 100644 --- 
a/spark/v4.0/spark-extensions/src/jmh/java/org/apache/iceberg/spark/UpdateProjectionBenchmark.java +++ b/spark/v4.0/spark-extensions/src/jmh/java/org/apache/iceberg/spark/UpdateProjectionBenchmark.java @@ -138,7 +138,7 @@ private void runBenchmark(RowLevelOperationMode mode, double updatePercentage) { private void setupSpark() { this.spark = SparkSession.builder() - .config("spark.ui.enabled", false) + .config(TestBase.DISABLE_UI) .config("spark.sql.extensions", IcebergSparkSessionExtensions.class.getName()) .config("spark.sql.catalog.spark_catalog", SparkSessionCatalog.class.getName()) .config("spark.sql.catalog.spark_catalog.type", "hadoop") diff --git a/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/TestExtendedParser.java b/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/TestExtendedParser.java index bfcb5af235d3..ef4f0090292c 100644 --- a/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/TestExtendedParser.java +++ b/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/TestExtendedParser.java @@ -49,7 +49,12 @@ public class TestExtendedParser { @BeforeAll public static void before() { - spark = SparkSession.builder().master("local").appName("TestExtendedParser").getOrCreate(); + spark = + SparkSession.builder() + .master("local") + .appName("TestExtendedParser") + .config(TestBase.DISABLE_UI) + .getOrCreate(); } @AfterAll diff --git a/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/ExtensionsTestBase.java b/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/ExtensionsTestBase.java index 796c47b545cc..f23a5d9db3ad 100644 --- a/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/ExtensionsTestBase.java +++ b/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/ExtensionsTestBase.java @@ -59,6 +59,7 @@ public static void startMetastoreAndSpark() { 
.config("spark.sql.legacy.respectNullabilityInTextDatasetConversion", "true") .config( SQLConf.ADAPTIVE_EXECUTION_ENABLED().key(), String.valueOf(RANDOM.nextBoolean())) + .config(TestBase.DISABLE_UI) .enableHiveSupport() .getOrCreate(); diff --git a/spark/v4.0/spark/src/jmh/java/org/apache/iceberg/spark/action/DeleteOrphanFilesBenchmark.java b/spark/v4.0/spark/src/jmh/java/org/apache/iceberg/spark/action/DeleteOrphanFilesBenchmark.java index e1d9ac18dac1..ad4c0f3e67e4 100644 --- a/spark/v4.0/spark/src/jmh/java/org/apache/iceberg/spark/action/DeleteOrphanFilesBenchmark.java +++ b/spark/v4.0/spark/src/jmh/java/org/apache/iceberg/spark/action/DeleteOrphanFilesBenchmark.java @@ -37,6 +37,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.spark.Spark3Util; import org.apache.iceberg.spark.SparkSessionCatalog; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.actions.SparkActions; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; @@ -179,6 +180,7 @@ private void setupSpark() { .config("spark.sql.catalog.spark_catalog", SparkSessionCatalog.class.getName()) .config("spark.sql.catalog.spark_catalog.type", "hadoop") .config("spark.sql.catalog.spark_catalog.warehouse", catalogWarehouse()) + .config(TestBase.DISABLE_UI) .master("local"); spark = builder.getOrCreate(); } diff --git a/spark/v4.0/spark/src/jmh/java/org/apache/iceberg/spark/action/IcebergSortCompactionBenchmark.java b/spark/v4.0/spark/src/jmh/java/org/apache/iceberg/spark/action/IcebergSortCompactionBenchmark.java index 4978961be641..683f6bb46d05 100644 --- a/spark/v4.0/spark/src/jmh/java/org/apache/iceberg/spark/action/IcebergSortCompactionBenchmark.java +++ b/spark/v4.0/spark/src/jmh/java/org/apache/iceberg/spark/action/IcebergSortCompactionBenchmark.java @@ -41,6 +41,7 @@ import org.apache.iceberg.spark.Spark3Util; import org.apache.iceberg.spark.SparkSchemaUtil; import org.apache.iceberg.spark.SparkSessionCatalog; 
+import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.actions.SparkActions; import org.apache.iceberg.types.Types; import org.apache.spark.sql.Dataset; @@ -394,6 +395,7 @@ protected void setupSpark() { "spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog") .config("spark.sql.catalog.spark_catalog.type", "hadoop") .config("spark.sql.catalog.spark_catalog.warehouse", getCatalogWarehouse()) + .config(TestBase.DISABLE_UI) .master("local[*]"); spark = builder.getOrCreate(); Configuration sparkHadoopConf = spark.sessionState().newHadoopConf(); diff --git a/spark/v4.0/spark/src/jmh/java/org/apache/iceberg/spark/source/DVReaderBenchmark.java b/spark/v4.0/spark/src/jmh/java/org/apache/iceberg/spark/source/DVReaderBenchmark.java index c6794e43c636..3f242ce228ca 100644 --- a/spark/v4.0/spark/src/jmh/java/org/apache/iceberg/spark/source/DVReaderBenchmark.java +++ b/spark/v4.0/spark/src/jmh/java/org/apache/iceberg/spark/source/DVReaderBenchmark.java @@ -49,6 +49,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Sets; import org.apache.iceberg.spark.Spark3Util; import org.apache.iceberg.spark.SparkSessionCatalog; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.util.ContentFileUtil; import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.catalyst.InternalRow; @@ -234,7 +235,7 @@ private String generateDataFilePath() { private void setupSpark() { this.spark = SparkSession.builder() - .config("spark.ui.enabled", false) + .config(TestBase.DISABLE_UI) .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .config("spark.sql.catalog.spark_catalog", SparkSessionCatalog.class.getName()) .config("spark.sql.catalog.spark_catalog.type", "hadoop") diff --git a/spark/v4.0/spark/src/jmh/java/org/apache/iceberg/spark/source/DVWriterBenchmark.java b/spark/v4.0/spark/src/jmh/java/org/apache/iceberg/spark/source/DVWriterBenchmark.java index ac74fb5a109c..db5789724056 100644 --- 
a/spark/v4.0/spark/src/jmh/java/org/apache/iceberg/spark/source/DVWriterBenchmark.java +++ b/spark/v4.0/spark/src/jmh/java/org/apache/iceberg/spark/source/DVWriterBenchmark.java @@ -43,6 +43,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Sets; import org.apache.iceberg.spark.Spark3Util; import org.apache.iceberg.spark.SparkSessionCatalog; +import org.apache.iceberg.spark.TestBase; import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.catalyst.analysis.NoSuchTableException; @@ -218,7 +219,7 @@ private String generateDataFilePath() { private void setupSpark() { this.spark = SparkSession.builder() - .config("spark.ui.enabled", false) + .config(TestBase.DISABLE_UI) .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .config("spark.sql.catalog.spark_catalog", SparkSessionCatalog.class.getName()) .config("spark.sql.catalog.spark_catalog.type", "hadoop") diff --git a/spark/v4.0/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceBenchmark.java b/spark/v4.0/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceBenchmark.java index 68c537e34a4a..debe37866ff7 100644 --- a/spark/v4.0/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceBenchmark.java +++ b/spark/v4.0/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceBenchmark.java @@ -30,6 +30,7 @@ import org.apache.iceberg.UpdateProperties; import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.spark.SparkSchemaUtil; +import org.apache.iceberg.spark.TestBase; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.SaveMode; @@ -94,7 +95,7 @@ protected void cleanupFiles() throws IOException { } protected void setupSpark(boolean enableDictionaryEncoding) { - SparkSession.Builder builder = SparkSession.builder().config("spark.ui.enabled", false); + SparkSession.Builder builder = 
SparkSession.builder().config(TestBase.DISABLE_UI); if (!enableDictionaryEncoding) { builder .config("parquet.dictionary.page.size", "1") diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/SparkDistributedDataScanTestBase.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/SparkDistributedDataScanTestBase.java index 404ba7284606..9b08d6f7ab1e 100644 --- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/SparkDistributedDataScanTestBase.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/SparkDistributedDataScanTestBase.java @@ -25,6 +25,7 @@ import java.util.List; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.spark.SparkReadConf; +import org.apache.iceberg.spark.TestBase; import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.internal.SQLConf; import org.junit.jupiter.api.BeforeEach; @@ -90,6 +91,7 @@ protected static SparkSession initSpark(String serializer) { .master("local[2]") .config("spark.serializer", serializer) .config(SQLConf.SHUFFLE_PARTITIONS().key(), "4") + .config(TestBase.DISABLE_UI) .getOrCreate(); } } diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanDeletes.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanDeletes.java index 659507e4c5e3..e28603c0b43a 100644 --- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanDeletes.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanDeletes.java @@ -25,6 +25,7 @@ import java.util.List; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.spark.SparkReadConf; +import org.apache.iceberg.spark.TestBase; import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.internal.SQLConf; import org.junit.jupiter.api.AfterAll; @@ -73,6 +74,7 @@ public static void startSpark() { .master("local[2]") .config("spark.serializer", 
"org.apache.spark.serializer.KryoSerializer") .config(SQLConf.SHUFFLE_PARTITIONS().key(), "4") + .config(TestBase.DISABLE_UI) .getOrCreate(); } diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanFilterFiles.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanFilterFiles.java index a218f965ea65..eae640528f9e 100644 --- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanFilterFiles.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanFilterFiles.java @@ -23,6 +23,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.spark.SparkReadConf; +import org.apache.iceberg.spark.TestBase; import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.internal.SQLConf; import org.junit.jupiter.api.AfterAll; @@ -62,6 +63,7 @@ public static void startSpark() { .master("local[2]") .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .config(SQLConf.SHUFFLE_PARTITIONS().key(), "4") + .config(TestBase.DISABLE_UI) .getOrCreate(); } diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanReporting.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanReporting.java index 2665d7ba8d3b..4f789d2c5ae9 100644 --- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanReporting.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanReporting.java @@ -25,6 +25,7 @@ import java.util.List; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.spark.SparkReadConf; +import org.apache.iceberg.spark.TestBase; import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.internal.SQLConf; import org.junit.jupiter.api.AfterAll; @@ -63,6 +64,7 @@ public static void startSpark() { .master("local[2]") 
.config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .config(SQLConf.SHUFFLE_PARTITIONS().key(), "4") + .config(TestBase.DISABLE_UI) .getOrCreate(); } diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/DummyMetricsServlet.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/DummyMetricsServlet.java new file mode 100644 index 000000000000..ee1f29e56fb3 --- /dev/null +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/DummyMetricsServlet.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.spark; + +import com.codahale.metrics.MetricRegistry; +import java.util.Properties; +import org.apache.spark.SparkConf; +import org.apache.spark.metrics.sink.MetricsServlet; +import org.sparkproject.jetty.servlet.ServletContextHandler; + +/** + * A dummy implementation of {@link MetricsServlet} that does not start a server or report metrics. + * This is used in tests to avoid conflicts with Spark's jetty dependencies. + */ +public class DummyMetricsServlet extends MetricsServlet { + + /** + * Constructor required by Spark's reflection-based instantiation. 
+ * + * @param properties Metrics properties + * @param registry Metric registry + */ + public DummyMetricsServlet(Properties properties, MetricRegistry registry) { + super(properties, registry); + } + + @Override + public ServletContextHandler[] getHandlers(SparkConf conf) { + return new ServletContextHandler[] {}; + } + + @Override + public void start() { + // No-op for tests + } + + @Override + public void stop() { + // No-op for tests + } + + @Override + public void report() { + // No-op for tests + } +} diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/TestBase.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/TestBase.java index daf4e29ac075..5e7e1a1f6193 100644 --- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/TestBase.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/TestBase.java @@ -65,6 +65,13 @@ public abstract class TestBase extends SparkTestHelperBase { protected static SparkSession spark = null; protected static JavaSparkContext sparkContext = null; protected static HiveCatalog catalog = null; + // disable Spark UI and use dummy servlet to avoid dependency conflicts with Spark's Jetty version + public static final Map DISABLE_UI = + ImmutableMap.of( + "spark.ui.enabled", + "false", + "spark.metrics.conf.*.sink.servlet.class", + "org.apache.iceberg.spark.DummyMetricsServlet"); @BeforeAll public static void startMetastoreAndSpark() { @@ -79,6 +86,7 @@ public static void startMetastoreAndSpark() { .config(SQLConf.PARTITION_OVERWRITE_MODE().key(), "dynamic") .config("spark.hadoop." 
+ METASTOREURIS.varname, hiveConf.get(METASTOREURIS.varname)) .config("spark.sql.legacy.respectNullabilityInTextDatasetConversion", "true") + .config(DISABLE_UI) .enableHiveSupport() .getOrCreate(); diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetDictionaryEncodedVectorizedReads.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetDictionaryEncodedVectorizedReads.java index 284fa0b0552f..b61ecfa2f442 100644 --- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetDictionaryEncodedVectorizedReads.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetDictionaryEncodedVectorizedReads.java @@ -43,6 +43,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Iterables; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.data.TestHelpers; import org.apache.iceberg.spark.data.vectorized.VectorizedSparkParquetReaders; import org.apache.iceberg.types.Types; @@ -65,6 +66,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); } diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/ScanTestBase.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/ScanTestBase.java index 33c842d94be1..da9cd639218f 100644 --- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/ScanTestBase.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/ScanTestBase.java @@ -38,6 +38,7 @@ import org.apache.iceberg.data.Record; import org.apache.iceberg.hadoop.HadoopTables; import 
org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.data.AvroDataTestBase; import org.apache.iceberg.spark.data.GenericsHelpers; import org.apache.iceberg.types.TypeUtil; @@ -62,6 +63,7 @@ public static void startSpark() { SparkSession.builder() .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) .master("local[2]") + .config(TestBase.DISABLE_UI) .getOrCreate(); ScanTestBase.sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); } diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java index 61d6501a6847..26c2b6ab70cb 100644 --- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java @@ -54,6 +54,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.spark.SparkReadOptions; import org.apache.iceberg.spark.SparkWriteOptions; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.data.GenericsHelpers; import org.apache.iceberg.types.Types; import org.apache.spark.sql.Dataset; @@ -116,6 +117,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); } diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestForwardCompatibility.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestForwardCompatibility.java index c4e0d26c1c31..d7d8756a43b4 100644 --- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestForwardCompatibility.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestForwardCompatibility.java @@ 
-46,6 +46,7 @@ import org.apache.iceberg.io.OutputFile; import org.apache.iceberg.parquet.Parquet; import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.data.RandomData; import org.apache.iceberg.spark.data.TestHelpers; import org.apache.iceberg.types.Types; @@ -98,6 +99,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); } diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSpark.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSpark.java index f4f57157e479..a637b975fe2b 100644 --- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSpark.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSpark.java @@ -28,6 +28,7 @@ import java.sql.Timestamp; import java.util.List; import org.apache.iceberg.spark.IcebergSpark; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.transforms.Transforms; import org.apache.iceberg.types.Types; import org.apache.spark.sql.Row; @@ -51,6 +52,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); } diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionPruning.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionPruning.java index 7e2b140550e5..8098db81f999 100644 --- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionPruning.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionPruning.java @@ -60,6 +60,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Sets; import 
org.apache.iceberg.spark.SparkReadOptions; import org.apache.iceberg.spark.SparkSchemaUtil; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.transforms.Transforms; import org.apache.iceberg.types.Types; import org.apache.spark.api.java.JavaRDD; @@ -119,6 +120,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); TestPartitionPruning.sparkContext = JavaSparkContext.fromSparkContext(spark.sparkContext()); diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionValues.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionValues.java index 0b6ab2052b66..9b5b22a73f36 100644 --- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionValues.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionValues.java @@ -46,6 +46,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.spark.SparkReadOptions; import org.apache.iceberg.spark.SparkWriteOptions; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.data.RandomData; import org.apache.iceberg.spark.data.TestHelpers; import org.apache.iceberg.types.Types; @@ -112,6 +113,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); } diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSnapshotSelection.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSnapshotSelection.java index 11865db7fce5..fe754f4a02ba 100644 --- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSnapshotSelection.java +++ 
b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSnapshotSelection.java @@ -42,6 +42,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.spark.SparkReadOptions; import org.apache.iceberg.spark.SparkSchemaUtil; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.types.Types; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; @@ -91,6 +92,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); } diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataFile.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataFile.java index ef87e8f296ee..d719ca6751a0 100644 --- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataFile.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataFile.java @@ -60,6 +60,7 @@ import org.apache.iceberg.spark.SparkDataFile; import org.apache.iceberg.spark.SparkDeleteFile; import org.apache.iceberg.spark.SparkSchemaUtil; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.data.RandomData; import org.apache.iceberg.types.Conversions; import org.apache.iceberg.types.Types; @@ -126,6 +127,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); TestSparkDataFile.sparkContext = JavaSparkContext.fromSparkContext(spark.sparkContext()); } diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java index 81cb54cceabc..70f3b986d23b 100644 --- 
a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java @@ -55,6 +55,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.spark.SparkWriteOptions; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.types.Types; import org.apache.iceberg.util.SnapshotUtil; import org.apache.spark.sql.Dataset; @@ -103,6 +104,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); } diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReadProjection.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReadProjection.java index 8ccea303d0c1..de6a5e59029c 100644 --- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReadProjection.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReadProjection.java @@ -50,6 +50,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.spark.SparkReadOptions; import org.apache.iceberg.spark.SparkValueConverter; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.types.Type; import org.apache.iceberg.types.TypeUtil; import org.apache.iceberg.types.Types; @@ -88,6 +89,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); ImmutableMap config = ImmutableMap.of( diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderDeletes.java 
b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderDeletes.java index 7dd4c6f7cf76..2fb6933ce933 100644 --- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderDeletes.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderDeletes.java @@ -75,6 +75,7 @@ import org.apache.iceberg.spark.ParquetBatchReadConf; import org.apache.iceberg.spark.SparkSchemaUtil; import org.apache.iceberg.spark.SparkStructLike; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.data.RandomData; import org.apache.iceberg.spark.data.SparkParquetWriters; import org.apache.iceberg.spark.source.metrics.NumDeletes; @@ -137,6 +138,7 @@ public static void startMetastoreAndSpark() { .config("spark.ui.liveUpdate.period", 0) .config(SQLConf.PARTITION_OVERWRITE_MODE().key(), "dynamic") .config("spark.hadoop." + METASTOREURIS.varname, hiveConf.get(METASTOREURIS.varname)) + .config(TestBase.DISABLE_UI) .enableHiveSupport() .getOrCreate(); diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderWithBloomFilter.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderWithBloomFilter.java index d22ecb02d483..cb2f866fab10 100644 --- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderWithBloomFilter.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderWithBloomFilter.java @@ -64,6 +64,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.spark.SparkValueConverter; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.types.Types; import org.apache.iceberg.util.PropertyUtil; import org.apache.spark.sql.Dataset; @@ -182,6 +183,7 @@ public static void startMetastoreAndSpark() { SparkSession.builder() .master("local[2]") .config("spark.hadoop." 
+ METASTOREURIS.varname, hiveConf.get(METASTOREURIS.varname)) + .config(TestBase.DISABLE_UI) .enableHiveSupport() .getOrCreate(); diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java index 2a178a74137c..ab760010535b 100644 --- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java @@ -38,6 +38,7 @@ import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.types.Types; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoder; @@ -73,6 +74,7 @@ public static void startSpark() { .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) .config("spark.sql.shuffle.partitions", 4) + .config(TestBase.DISABLE_UI) .getOrCreate(); } diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestWriteMetricsConfig.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestWriteMetricsConfig.java index e2b5d8920e9f..ab2479d61058 100644 --- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestWriteMetricsConfig.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestWriteMetricsConfig.java @@ -42,6 +42,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.spark.SparkWriteOptions; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.data.RandomData; import org.apache.iceberg.types.Types; import org.apache.iceberg.util.ByteBuffers; @@ -85,6 
+86,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); TestWriteMetricsConfig.sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); } diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestAggregatePushDown.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestAggregatePushDown.java index ce0a0f26a096..e1d2b19f890c 100644 --- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestAggregatePushDown.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestAggregatePushDown.java @@ -63,6 +63,7 @@ public static void startMetastoreAndSpark() { SparkSession.builder() .master("local[2]") .config("spark.sql.iceberg.aggregate_pushdown", "true") + .config(TestBase.DISABLE_UI) .enableHiveSupport() .getOrCreate(); diff --git a/spark/v4.1/build.gradle b/spark/v4.1/build.gradle index 355a85ab81a9..02e4323e709e 100644 --- a/spark/v4.1/build.gradle +++ b/spark/v4.1/build.gradle @@ -105,14 +105,9 @@ project(":iceberg-spark:iceberg-spark-${sparkMajorVersion}_${scalaVersion}") { testImplementation project(path: ':iceberg-api', configuration: 'testArtifacts') testImplementation project(path: ':iceberg-core', configuration: 'testArtifacts') testImplementation project(path: ':iceberg-data', configuration: 'testArtifacts') - testImplementation (project(path: ':iceberg-open-api', configuration: 'testFixturesRuntimeElements')) { - transitive = false - } - testImplementation libs.sqlite.jdbc + testImplementation (project(path: ':iceberg-open-api', configuration: 'testFixturesRuntimeElements')) testImplementation libs.awaitility testImplementation(testFixtures(project(':iceberg-parquet'))) - // runtime dependencies for running REST Catalog based integration test - testRuntimeOnly libs.jetty.servlet } test { @@ -173,13 +168,7 @@ 
project(":iceberg-spark:iceberg-spark-extensions-${sparkMajorVersion}_${scalaVer testImplementation project(path: ':iceberg-data', configuration: 'testArtifacts') testImplementation project(path: ':iceberg-hive-metastore', configuration: 'testArtifacts') testImplementation project(path: ":iceberg-spark:iceberg-spark-${sparkMajorVersion}_${scalaVersion}", configuration: 'testArtifacts') - testImplementation (project(path: ':iceberg-open-api', configuration: 'testFixturesRuntimeElements')) { - transitive = false - } - // runtime dependencies for running REST Catalog based integration test - testRuntimeOnly libs.jetty.servlet - testRuntimeOnly libs.sqlite.jdbc - + testImplementation (project(path: ':iceberg-open-api', configuration: 'testFixturesRuntimeElements')) testImplementation libs.avro.avro testImplementation libs.parquet.hadoop testImplementation libs.awaitility @@ -271,11 +260,7 @@ project(":iceberg-spark:iceberg-spark-runtime-${sparkMajorVersion}_${scalaVersio integrationRuntimeOnly project(':iceberg-hive-metastore') // runtime dependencies for running REST Catalog based integration test integrationRuntimeOnly project(path: ':iceberg-core', configuration: 'testArtifacts') - integrationRuntimeOnly (project(path: ':iceberg-open-api', configuration: 'testFixturesRuntimeElements')) { - transitive = false - } - integrationRuntimeOnly libs.jetty.servlet - integrationRuntimeOnly libs.sqlite.jdbc + integrationRuntimeOnly (project(path: ':iceberg-open-api', configuration: 'testFixturesRuntimeElements')) // Not allowed on our classpath, only the runtime jar is allowed integrationCompileOnly project(":iceberg-spark:iceberg-spark-extensions-${sparkMajorVersion}_${scalaVersion}") diff --git a/spark/v4.1/spark-extensions/src/jmh/java/org/apache/iceberg/DeleteFileIndexBenchmark.java b/spark/v4.1/spark-extensions/src/jmh/java/org/apache/iceberg/DeleteFileIndexBenchmark.java index a8b226ea1e37..a468a1cc8717 100644 --- 
a/spark/v4.1/spark-extensions/src/jmh/java/org/apache/iceberg/DeleteFileIndexBenchmark.java +++ b/spark/v4.1/spark-extensions/src/jmh/java/org/apache/iceberg/DeleteFileIndexBenchmark.java @@ -31,6 +31,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.spark.Spark3Util; import org.apache.iceberg.spark.SparkSessionCatalog; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions; import org.apache.iceberg.util.ThreadPools; import org.apache.spark.sql.SparkSession; @@ -205,7 +206,7 @@ private void initDataAndDVs() { private void setupSpark() { this.spark = SparkSession.builder() - .config("spark.ui.enabled", false) + .config(TestBase.DISABLE_UI) .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .config("spark.sql.extensions", IcebergSparkSessionExtensions.class.getName()) .config("spark.sql.catalog.spark_catalog", SparkSessionCatalog.class.getName()) diff --git a/spark/v4.1/spark-extensions/src/jmh/java/org/apache/iceberg/spark/MergeCardinalityCheckBenchmark.java b/spark/v4.1/spark-extensions/src/jmh/java/org/apache/iceberg/spark/MergeCardinalityCheckBenchmark.java index eeea81634596..bc34bf33e35e 100644 --- a/spark/v4.1/spark-extensions/src/jmh/java/org/apache/iceberg/spark/MergeCardinalityCheckBenchmark.java +++ b/spark/v4.1/spark-extensions/src/jmh/java/org/apache/iceberg/spark/MergeCardinalityCheckBenchmark.java @@ -155,7 +155,7 @@ private void runBenchmark(RowLevelOperationMode mode, double updatePercentage) { private void setupSpark() { this.spark = SparkSession.builder() - .config("spark.ui.enabled", false) + .config(TestBase.DISABLE_UI) .config("spark.sql.extensions", IcebergSparkSessionExtensions.class.getName()) .config("spark.sql.catalog.spark_catalog", SparkSessionCatalog.class.getName()) .config("spark.sql.catalog.spark_catalog.type", "hadoop") diff --git 
a/spark/v4.1/spark-extensions/src/jmh/java/org/apache/iceberg/spark/PlanningBenchmark.java b/spark/v4.1/spark-extensions/src/jmh/java/org/apache/iceberg/spark/PlanningBenchmark.java index c50a3fd406d7..0df55de933cf 100644 --- a/spark/v4.1/spark-extensions/src/jmh/java/org/apache/iceberg/spark/PlanningBenchmark.java +++ b/spark/v4.1/spark-extensions/src/jmh/java/org/apache/iceberg/spark/PlanningBenchmark.java @@ -215,7 +215,7 @@ public void localPlanningViaDistributedScanWithoutFilterWithStats(Blackhole blac private void setupSpark() { this.spark = SparkSession.builder() - .config("spark.ui.enabled", false) + .config(TestBase.DISABLE_UI) .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .config("spark.driver.maxResultSize", "8G") .config("spark.sql.extensions", IcebergSparkSessionExtensions.class.getName()) diff --git a/spark/v4.1/spark-extensions/src/jmh/java/org/apache/iceberg/spark/TaskGroupPlanningBenchmark.java b/spark/v4.1/spark-extensions/src/jmh/java/org/apache/iceberg/spark/TaskGroupPlanningBenchmark.java index 8a8097834ef8..fd3eab4d9df6 100644 --- a/spark/v4.1/spark-extensions/src/jmh/java/org/apache/iceberg/spark/TaskGroupPlanningBenchmark.java +++ b/spark/v4.1/spark-extensions/src/jmh/java/org/apache/iceberg/spark/TaskGroupPlanningBenchmark.java @@ -198,7 +198,7 @@ private void initDataAndDeletes() { private void setupSpark() { this.spark = SparkSession.builder() - .config("spark.ui.enabled", false) + .config(TestBase.DISABLE_UI) .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .config("spark.sql.extensions", IcebergSparkSessionExtensions.class.getName()) .config("spark.sql.catalog.spark_catalog", SparkSessionCatalog.class.getName()) diff --git a/spark/v4.1/spark-extensions/src/jmh/java/org/apache/iceberg/spark/UpdateProjectionBenchmark.java b/spark/v4.1/spark-extensions/src/jmh/java/org/apache/iceberg/spark/UpdateProjectionBenchmark.java index d917eae5eb0f..caa23625fc44 100644 --- 
a/spark/v4.1/spark-extensions/src/jmh/java/org/apache/iceberg/spark/UpdateProjectionBenchmark.java +++ b/spark/v4.1/spark-extensions/src/jmh/java/org/apache/iceberg/spark/UpdateProjectionBenchmark.java @@ -138,7 +138,7 @@ private void runBenchmark(RowLevelOperationMode mode, double updatePercentage) { private void setupSpark() { this.spark = SparkSession.builder() - .config("spark.ui.enabled", false) + .config(TestBase.DISABLE_UI) .config("spark.sql.extensions", IcebergSparkSessionExtensions.class.getName()) .config("spark.sql.catalog.spark_catalog", SparkSessionCatalog.class.getName()) .config("spark.sql.catalog.spark_catalog.type", "hadoop") diff --git a/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/TestExtendedParser.java b/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/TestExtendedParser.java index bfcb5af235d3..ef4f0090292c 100644 --- a/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/TestExtendedParser.java +++ b/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/TestExtendedParser.java @@ -49,7 +49,12 @@ public class TestExtendedParser { @BeforeAll public static void before() { - spark = SparkSession.builder().master("local").appName("TestExtendedParser").getOrCreate(); + spark = + SparkSession.builder() + .master("local") + .appName("TestExtendedParser") + .config(TestBase.DISABLE_UI) + .getOrCreate(); } @AfterAll diff --git a/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/ExtensionsTestBase.java b/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/ExtensionsTestBase.java index 6de4e0d6461e..f766fbb79aff 100644 --- a/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/ExtensionsTestBase.java +++ b/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/ExtensionsTestBase.java @@ -71,6 +71,7 @@ public static void startMetastoreAndSpark() { 
.config("spark.sql.legacy.respectNullabilityInTextDatasetConversion", "true") .config( SQLConf.ADAPTIVE_EXECUTION_ENABLED().key(), String.valueOf(RANDOM.nextBoolean())) + .config(TestBase.DISABLE_UI) .enableHiveSupport() .getOrCreate(); diff --git a/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestPartitionedWritesToWapBranch.java b/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestPartitionedWritesToWapBranch.java index 1db18f3a857d..af065451ab69 100644 --- a/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestPartitionedWritesToWapBranch.java +++ b/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestPartitionedWritesToWapBranch.java @@ -69,6 +69,7 @@ public static void startMetastoreAndSpark() { .config("spark.sql.shuffle.partitions", "4") .config("spark.sql.hive.metastorePartitionPruningFallbackOnException", "true") .config("spark.sql.legacy.respectNullabilityInTextDatasetConversion", "true") + .config(TestBase.DISABLE_UI) .enableHiveSupport() .getOrCreate(); diff --git a/spark/v4.1/spark/src/jmh/java/org/apache/iceberg/spark/action/DeleteOrphanFilesBenchmark.java b/spark/v4.1/spark/src/jmh/java/org/apache/iceberg/spark/action/DeleteOrphanFilesBenchmark.java index 231bb7c619f4..3fd84553f033 100644 --- a/spark/v4.1/spark/src/jmh/java/org/apache/iceberg/spark/action/DeleteOrphanFilesBenchmark.java +++ b/spark/v4.1/spark/src/jmh/java/org/apache/iceberg/spark/action/DeleteOrphanFilesBenchmark.java @@ -37,6 +37,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.spark.Spark3Util; import org.apache.iceberg.spark.SparkSessionCatalog; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.actions.SparkActions; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; @@ -179,6 +180,7 @@ private void setupSpark() { .config("spark.sql.catalog.spark_catalog", 
SparkSessionCatalog.class.getName()) .config("spark.sql.catalog.spark_catalog.type", "hadoop") .config("spark.sql.catalog.spark_catalog.warehouse", catalogWarehouse()) + .config(TestBase.DISABLE_UI) .master("local"); spark = builder.getOrCreate(); } diff --git a/spark/v4.1/spark/src/jmh/java/org/apache/iceberg/spark/action/IcebergSortCompactionBenchmark.java b/spark/v4.1/spark/src/jmh/java/org/apache/iceberg/spark/action/IcebergSortCompactionBenchmark.java index 4978961be641..683f6bb46d05 100644 --- a/spark/v4.1/spark/src/jmh/java/org/apache/iceberg/spark/action/IcebergSortCompactionBenchmark.java +++ b/spark/v4.1/spark/src/jmh/java/org/apache/iceberg/spark/action/IcebergSortCompactionBenchmark.java @@ -41,6 +41,7 @@ import org.apache.iceberg.spark.Spark3Util; import org.apache.iceberg.spark.SparkSchemaUtil; import org.apache.iceberg.spark.SparkSessionCatalog; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.actions.SparkActions; import org.apache.iceberg.types.Types; import org.apache.spark.sql.Dataset; @@ -394,6 +395,7 @@ protected void setupSpark() { "spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog") .config("spark.sql.catalog.spark_catalog.type", "hadoop") .config("spark.sql.catalog.spark_catalog.warehouse", getCatalogWarehouse()) + .config(TestBase.DISABLE_UI) .master("local[*]"); spark = builder.getOrCreate(); Configuration sparkHadoopConf = spark.sessionState().newHadoopConf(); diff --git a/spark/v4.1/spark/src/jmh/java/org/apache/iceberg/spark/source/DVReaderBenchmark.java b/spark/v4.1/spark/src/jmh/java/org/apache/iceberg/spark/source/DVReaderBenchmark.java index c6794e43c636..3f242ce228ca 100644 --- a/spark/v4.1/spark/src/jmh/java/org/apache/iceberg/spark/source/DVReaderBenchmark.java +++ b/spark/v4.1/spark/src/jmh/java/org/apache/iceberg/spark/source/DVReaderBenchmark.java @@ -49,6 +49,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Sets; import 
org.apache.iceberg.spark.Spark3Util; import org.apache.iceberg.spark.SparkSessionCatalog; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.util.ContentFileUtil; import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.catalyst.InternalRow; @@ -234,7 +235,7 @@ private String generateDataFilePath() { private void setupSpark() { this.spark = SparkSession.builder() - .config("spark.ui.enabled", false) + .config(TestBase.DISABLE_UI) .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .config("spark.sql.catalog.spark_catalog", SparkSessionCatalog.class.getName()) .config("spark.sql.catalog.spark_catalog.type", "hadoop") diff --git a/spark/v4.1/spark/src/jmh/java/org/apache/iceberg/spark/source/DVWriterBenchmark.java b/spark/v4.1/spark/src/jmh/java/org/apache/iceberg/spark/source/DVWriterBenchmark.java index ac74fb5a109c..db5789724056 100644 --- a/spark/v4.1/spark/src/jmh/java/org/apache/iceberg/spark/source/DVWriterBenchmark.java +++ b/spark/v4.1/spark/src/jmh/java/org/apache/iceberg/spark/source/DVWriterBenchmark.java @@ -43,6 +43,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Sets; import org.apache.iceberg.spark.Spark3Util; import org.apache.iceberg.spark.SparkSessionCatalog; +import org.apache.iceberg.spark.TestBase; import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.catalyst.analysis.NoSuchTableException; @@ -218,7 +219,7 @@ private String generateDataFilePath() { private void setupSpark() { this.spark = SparkSession.builder() - .config("spark.ui.enabled", false) + .config(TestBase.DISABLE_UI) .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .config("spark.sql.catalog.spark_catalog", SparkSessionCatalog.class.getName()) .config("spark.sql.catalog.spark_catalog.type", "hadoop") diff --git a/spark/v4.1/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceBenchmark.java 
b/spark/v4.1/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceBenchmark.java index 68c537e34a4a..debe37866ff7 100644 --- a/spark/v4.1/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceBenchmark.java +++ b/spark/v4.1/spark/src/jmh/java/org/apache/iceberg/spark/source/IcebergSourceBenchmark.java @@ -30,6 +30,7 @@ import org.apache.iceberg.UpdateProperties; import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.spark.SparkSchemaUtil; +import org.apache.iceberg.spark.TestBase; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.SaveMode; @@ -94,7 +95,7 @@ protected void cleanupFiles() throws IOException { } protected void setupSpark(boolean enableDictionaryEncoding) { - SparkSession.Builder builder = SparkSession.builder().config("spark.ui.enabled", false); + SparkSession.Builder builder = SparkSession.builder().config(TestBase.DISABLE_UI); if (!enableDictionaryEncoding) { builder .config("parquet.dictionary.page.size", "1") diff --git a/spark/v4.1/spark/src/test/java/org/apache/iceberg/SparkDistributedDataScanTestBase.java b/spark/v4.1/spark/src/test/java/org/apache/iceberg/SparkDistributedDataScanTestBase.java index aa4f3dc72416..d1c724425c9f 100644 --- a/spark/v4.1/spark/src/test/java/org/apache/iceberg/SparkDistributedDataScanTestBase.java +++ b/spark/v4.1/spark/src/test/java/org/apache/iceberg/SparkDistributedDataScanTestBase.java @@ -24,6 +24,7 @@ import java.util.Arrays; import java.util.List; import org.apache.iceberg.spark.SparkReadConf; +import org.apache.iceberg.spark.TestBase; import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.internal.SQLConf; import org.junit.jupiter.api.BeforeEach; @@ -89,6 +90,7 @@ protected static SparkSession initSpark(String serializer) { .master("local[2]") .config("spark.serializer", serializer) .config(SQLConf.SHUFFLE_PARTITIONS().key(), "4") + .config(TestBase.DISABLE_UI) .getOrCreate(); } } diff 
--git a/spark/v4.1/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanDeletes.java b/spark/v4.1/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanDeletes.java index 6ffaede5b069..a21c6a08ec3b 100644 --- a/spark/v4.1/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanDeletes.java +++ b/spark/v4.1/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanDeletes.java @@ -24,6 +24,7 @@ import java.util.Arrays; import java.util.List; import org.apache.iceberg.spark.SparkReadConf; +import org.apache.iceberg.spark.TestBase; import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.internal.SQLConf; import org.junit.jupiter.api.AfterAll; @@ -72,6 +73,7 @@ public static void startSpark() { .master("local[2]") .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .config(SQLConf.SHUFFLE_PARTITIONS().key(), "4") + .config(TestBase.DISABLE_UI) .getOrCreate(); } diff --git a/spark/v4.1/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanFilterFiles.java b/spark/v4.1/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanFilterFiles.java index 1e680ace292f..5edf4828229a 100644 --- a/spark/v4.1/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanFilterFiles.java +++ b/spark/v4.1/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanFilterFiles.java @@ -22,6 +22,7 @@ import static org.apache.iceberg.PlanningMode.LOCAL; import org.apache.iceberg.spark.SparkReadConf; +import org.apache.iceberg.spark.TestBase; import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.internal.SQLConf; import org.junit.jupiter.api.AfterAll; @@ -61,6 +62,7 @@ public static void startSpark() { .master("local[2]") .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .config(SQLConf.SHUFFLE_PARTITIONS().key(), "4") + .config(TestBase.DISABLE_UI) .getOrCreate(); } diff --git 
a/spark/v4.1/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanReporting.java b/spark/v4.1/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanReporting.java index 9b736004de57..e6f3c75475d8 100644 --- a/spark/v4.1/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanReporting.java +++ b/spark/v4.1/spark/src/test/java/org/apache/iceberg/TestSparkDistributedDataScanReporting.java @@ -24,6 +24,7 @@ import java.util.Arrays; import java.util.List; import org.apache.iceberg.spark.SparkReadConf; +import org.apache.iceberg.spark.TestBase; import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.internal.SQLConf; import org.junit.jupiter.api.AfterAll; @@ -62,6 +63,7 @@ public static void startSpark() { .master("local[2]") .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .config(SQLConf.SHUFFLE_PARTITIONS().key(), "4") + .config(TestBase.DISABLE_UI) .getOrCreate(); } diff --git a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/DummyMetricsServlet.java b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/DummyMetricsServlet.java new file mode 100644 index 000000000000..ee1f29e56fb3 --- /dev/null +++ b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/DummyMetricsServlet.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.spark; + +import com.codahale.metrics.MetricRegistry; +import java.util.Properties; +import org.apache.spark.SparkConf; +import org.apache.spark.metrics.sink.MetricsServlet; +import org.sparkproject.jetty.servlet.ServletContextHandler; + +/** + * A dummy implementation of {@link MetricsServlet} that does not start a server or report metrics. + * This is used in tests to avoid conflicts with Spark's jetty dependencies. + */ +public class DummyMetricsServlet extends MetricsServlet { + + /** + * Constructor required by Spark's reflection-based instantiation. 
+ * + * @param properties Metrics properties + * @param registry Metric registry + */ + public DummyMetricsServlet(Properties properties, MetricRegistry registry) { + super(properties, registry); + } + + @Override + public ServletContextHandler[] getHandlers(SparkConf conf) { + return new ServletContextHandler[] {}; + } + + @Override + public void start() { + // No-op for tests + } + + @Override + public void stop() { + // No-op for tests + } + + @Override + public void report() { + // No-op for tests + } +} diff --git a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/TestBase.java b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/TestBase.java index daf4e29ac075..507d7b313b42 100644 --- a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/TestBase.java +++ b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/TestBase.java @@ -65,6 +65,13 @@ public abstract class TestBase extends SparkTestHelperBase { protected static SparkSession spark = null; protected static JavaSparkContext sparkContext = null; protected static HiveCatalog catalog = null; + // disable Spark UI and use dummy servlet to avoid dependency conflicts with Spark's Jetty version + public static final Map DISABLE_UI = + ImmutableMap.of( + "spark.ui.enabled", + "false", + "spark.metrics.conf.*.sink.servlet.class", + "org.apache.iceberg.spark.DummyMetricsServlet"); @BeforeAll public static void startMetastoreAndSpark() { @@ -79,6 +86,8 @@ public static void startMetastoreAndSpark() { .config(SQLConf.PARTITION_OVERWRITE_MODE().key(), "dynamic") .config("spark.hadoop." 
+ METASTOREURIS.varname, hiveConf.get(METASTOREURIS.varname)) .config("spark.sql.legacy.respectNullabilityInTextDatasetConversion", "true") + .config("spark.ui.enabled", "false") + .config(DISABLE_UI) .enableHiveSupport() .getOrCreate(); diff --git a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetDictionaryEncodedVectorizedReads.java b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetDictionaryEncodedVectorizedReads.java index 284fa0b0552f..b61ecfa2f442 100644 --- a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetDictionaryEncodedVectorizedReads.java +++ b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/data/vectorized/parquet/TestParquetDictionaryEncodedVectorizedReads.java @@ -43,6 +43,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Iterables; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.data.TestHelpers; import org.apache.iceberg.spark.data.vectorized.VectorizedSparkParquetReaders; import org.apache.iceberg.types.Types; @@ -65,6 +66,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); } diff --git a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/ScanTestBase.java b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/ScanTestBase.java index 33c842d94be1..da9cd639218f 100644 --- a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/ScanTestBase.java +++ b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/ScanTestBase.java @@ -38,6 +38,7 @@ import org.apache.iceberg.data.Record; import org.apache.iceberg.hadoop.HadoopTables; 
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.data.AvroDataTestBase; import org.apache.iceberg.spark.data.GenericsHelpers; import org.apache.iceberg.types.TypeUtil; @@ -62,6 +63,7 @@ public static void startSpark() { SparkSession.builder() .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) .master("local[2]") + .config(TestBase.DISABLE_UI) .getOrCreate(); ScanTestBase.sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); } diff --git a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java index 16988d1e28e9..24fecf4eb2ca 100644 --- a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java +++ b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestFilteredScan.java @@ -54,6 +54,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.spark.SparkReadOptions; import org.apache.iceberg.spark.SparkWriteOptions; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.data.GenericsHelpers; import org.apache.iceberg.types.Types; import org.apache.spark.sql.Dataset; @@ -116,6 +117,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); } diff --git a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestForwardCompatibility.java b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestForwardCompatibility.java index 0eb546bc5661..d0103ff46ead 100644 --- a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestForwardCompatibility.java +++ 
b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestForwardCompatibility.java @@ -46,6 +46,7 @@ import org.apache.iceberg.io.OutputFile; import org.apache.iceberg.parquet.Parquet; import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.data.RandomData; import org.apache.iceberg.spark.data.TestHelpers; import org.apache.iceberg.types.Types; @@ -98,6 +99,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); } diff --git a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSpark.java b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSpark.java index f4f57157e479..a637b975fe2b 100644 --- a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSpark.java +++ b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestIcebergSpark.java @@ -28,6 +28,7 @@ import java.sql.Timestamp; import java.util.List; import org.apache.iceberg.spark.IcebergSpark; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.transforms.Transforms; import org.apache.iceberg.types.Types; import org.apache.spark.sql.Row; @@ -51,6 +52,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); } diff --git a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionPruning.java b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionPruning.java index 7e2b140550e5..8098db81f999 100644 --- a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionPruning.java +++ 
b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionPruning.java @@ -60,6 +60,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Sets; import org.apache.iceberg.spark.SparkReadOptions; import org.apache.iceberg.spark.SparkSchemaUtil; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.transforms.Transforms; import org.apache.iceberg.types.Types; import org.apache.spark.api.java.JavaRDD; @@ -119,6 +120,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); TestPartitionPruning.sparkContext = JavaSparkContext.fromSparkContext(spark.sparkContext()); diff --git a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionValues.java b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionValues.java index 0b6ab2052b66..9b5b22a73f36 100644 --- a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionValues.java +++ b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestPartitionValues.java @@ -46,6 +46,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.spark.SparkReadOptions; import org.apache.iceberg.spark.SparkWriteOptions; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.data.RandomData; import org.apache.iceberg.spark.data.TestHelpers; import org.apache.iceberg.types.Types; @@ -112,6 +113,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); } diff --git a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSnapshotSelection.java b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSnapshotSelection.java index 
416f57e5142a..3004e8fa5c5e 100644 --- a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSnapshotSelection.java +++ b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSnapshotSelection.java @@ -44,6 +44,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.spark.SparkReadOptions; import org.apache.iceberg.spark.SparkSchemaUtil; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.types.Types; import org.apache.spark.sql.AnalysisException; import org.apache.spark.sql.Dataset; @@ -97,6 +98,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); } diff --git a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataFile.java b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataFile.java index ef87e8f296ee..d719ca6751a0 100644 --- a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataFile.java +++ b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataFile.java @@ -60,6 +60,7 @@ import org.apache.iceberg.spark.SparkDataFile; import org.apache.iceberg.spark.SparkDeleteFile; import org.apache.iceberg.spark.SparkSchemaUtil; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.data.RandomData; import org.apache.iceberg.types.Conversions; import org.apache.iceberg.types.Types; @@ -126,6 +127,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); TestSparkDataFile.sparkContext = JavaSparkContext.fromSparkContext(spark.sparkContext()); } diff --git a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java 
b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java index d69bb31fd444..c2f5afef0e8c 100644 --- a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java +++ b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkDataWrite.java @@ -55,6 +55,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.spark.SparkWriteOptions; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.types.Types; import org.apache.iceberg.util.SnapshotUtil; import org.apache.spark.sql.Dataset; @@ -103,6 +104,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); } diff --git a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReadProjection.java b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReadProjection.java index 8ccea303d0c1..de6a5e59029c 100644 --- a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReadProjection.java +++ b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReadProjection.java @@ -50,6 +50,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.spark.SparkReadOptions; import org.apache.iceberg.spark.SparkValueConverter; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.types.Type; import org.apache.iceberg.types.TypeUtil; import org.apache.iceberg.types.Types; @@ -88,6 +89,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); ImmutableMap config = ImmutableMap.of( diff --git 
a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderDeletes.java b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderDeletes.java index a5bf39a5a653..0d619305716e 100644 --- a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderDeletes.java +++ b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderDeletes.java @@ -75,6 +75,7 @@ import org.apache.iceberg.spark.ParquetBatchReadConf; import org.apache.iceberg.spark.SparkSchemaUtil; import org.apache.iceberg.spark.SparkStructLike; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.data.RandomData; import org.apache.iceberg.spark.data.SparkParquetWriters; import org.apache.iceberg.spark.source.metrics.NumDeletes; @@ -137,6 +138,7 @@ public static void startMetastoreAndSpark() { .config("spark.ui.liveUpdate.period", 0) .config(SQLConf.PARTITION_OVERWRITE_MODE().key(), "dynamic") .config("spark.hadoop." + METASTOREURIS.varname, hiveConf.get(METASTOREURIS.varname)) + .config(TestBase.DISABLE_UI) .enableHiveSupport() .getOrCreate(); diff --git a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderWithBloomFilter.java b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderWithBloomFilter.java index d22ecb02d483..cb2f866fab10 100644 --- a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderWithBloomFilter.java +++ b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderWithBloomFilter.java @@ -64,6 +64,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.spark.SparkValueConverter; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.types.Types; import org.apache.iceberg.util.PropertyUtil; import org.apache.spark.sql.Dataset; @@ -182,6 +183,7 @@ public static void 
startMetastoreAndSpark() { SparkSession.builder() .master("local[2]") .config("spark.hadoop." + METASTOREURIS.varname, hiveConf.get(METASTOREURIS.varname)) + .config(TestBase.DISABLE_UI) .enableHiveSupport() .getOrCreate(); diff --git a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java index 8d191cf30b14..5e900ea0bad4 100644 --- a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java +++ b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreaming.java @@ -38,6 +38,7 @@ import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.types.Types; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoder; @@ -72,6 +73,7 @@ public static void startSpark() { .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) .config("spark.sql.shuffle.partitions", 4) + .config(TestBase.DISABLE_UI) .getOrCreate(); } diff --git a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestWriteMetricsConfig.java b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestWriteMetricsConfig.java index e2b5d8920e9f..ab2479d61058 100644 --- a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestWriteMetricsConfig.java +++ b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestWriteMetricsConfig.java @@ -42,6 +42,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.spark.SparkWriteOptions; +import org.apache.iceberg.spark.TestBase; import org.apache.iceberg.spark.data.RandomData; 
import org.apache.iceberg.types.Types; import org.apache.iceberg.util.ByteBuffers; @@ -85,6 +86,7 @@ public static void startSpark() { SparkSession.builder() .master("local[2]") .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(TestBase.DISABLE_UI) .getOrCreate(); TestWriteMetricsConfig.sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); } diff --git a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestAggregatePushDown.java b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestAggregatePushDown.java index 4baaf2d1fbb5..6eac5474afde 100644 --- a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestAggregatePushDown.java +++ b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestAggregatePushDown.java @@ -63,6 +63,7 @@ public static void startMetastoreAndSpark() { SparkSession.builder() .master("local[2]") .config("spark.sql.iceberg.aggregate_pushdown", "true") + .config(TestBase.DISABLE_UI) .enableHiveSupport() .getOrCreate(); From 9a939d68358de9dac2c6ba9b236b675ebe477490 Mon Sep 17 00:00:00 2001 From: Maksim Konstantinov Date: Fri, 3 Apr 2026 07:15:34 -0700 Subject: [PATCH 015/197] Build: bump shadow-gradle-plugin to 9.4.1 (#15835) --- build.gradle | 2 +- deploy.gradle | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/build.gradle b/build.gradle index 1ff423618378..95163d9d614c 100644 --- a/build.gradle +++ b/build.gradle @@ -26,7 +26,7 @@ buildscript { gradlePluginPortal() } dependencies { - classpath 'com.gradleup.shadow:shadow-gradle-plugin:8.3.10' + classpath 'com.gradleup.shadow:shadow-gradle-plugin:9.4.1' classpath 'com.palantir.baseline:gradle-baseline-java:6.90.0' classpath 'com.diffplug.spotless:spotless-plugin-gradle:8.4.0' classpath 'gradle.plugin.org.inferred:gradle-processors:3.7.0' diff --git a/deploy.gradle b/deploy.gradle index 740d0056273b..e1f26ec3e416 100644 --- a/deploy.gradle +++ b/deploy.gradle @@ -80,7 +80,7 @@ subprojects { if 
(tasks.matching({task -> task.name == 'shadowJar'}).isEmpty()) { from components.java } else { - project.shadow.component(it) + from components.shadow } artifact sourceJar From 775f55b50d829b0a11f4c850790c94d42b694fd0 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 4 Apr 2026 21:49:13 -0700 Subject: [PATCH 016/197] Build: Bump mkdocs-redirects from 1.2.2 to 1.2.3 (#15885) Bumps [mkdocs-redirects](https://github.com/ProperDocs/properdocs-redirects) from 1.2.2 to 1.2.3. - [Release notes](https://github.com/ProperDocs/properdocs-redirects/releases) - [Commits](https://github.com/ProperDocs/properdocs-redirects/compare/v1.2.2...v1.2.3) --- updated-dependencies: - dependency-name: mkdocs-redirects dependency-version: 1.2.3 dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- site/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/site/requirements.txt b/site/requirements.txt index f7877c7a6985..130842d75f92 100644 --- a/site/requirements.txt +++ b/site/requirements.txt @@ -20,6 +20,6 @@ mkdocs-macros-plugin==1.5.0 mkdocs-material==9.7.5 mkdocs-material-extensions==1.3.1 mkdocs-monorepo-plugin @ git+https://github.com/bitsondatadev/mkdocs-monorepo-plugin@url-fix -mkdocs-redirects==1.2.2 +mkdocs-redirects==1.2.3 mkdocs-rss-plugin==1.17.9 pymarkdownlnt==0.9.36 From 4575b6a278ea7ea2c987fed231545668b7e3a7d2 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 4 Apr 2026 21:51:51 -0700 Subject: [PATCH 017/197] Build: Bump astral-sh/setup-uv from 7.6.0 to 8.0.0 (#15888) Bumps [astral-sh/setup-uv](https://github.com/astral-sh/setup-uv) from 7.6.0 to 8.0.0. 
- [Release notes](https://github.com/astral-sh/setup-uv/releases) - [Commits](https://github.com/astral-sh/setup-uv/compare/37802adc94f370d6bfd71619e3f0bf239e1f3b78...cec208311dfd045dd5311c1add060b2062131d57) --- updated-dependencies: - dependency-name: astral-sh/setup-uv dependency-version: 8.0.0 dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/open-api.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/open-api.yml b/.github/workflows/open-api.yml index b57bbcdad177..2d58d0dcf023 100644 --- a/.github/workflows/open-api.yml +++ b/.github/workflows/open-api.yml @@ -48,7 +48,7 @@ jobs: with: persist-credentials: false - name: Install uv - uses: astral-sh/setup-uv@37802adc94f370d6bfd71619e3f0bf239e1f3b78 # v7.6.0 + uses: astral-sh/setup-uv@cec208311dfd045dd5311c1add060b2062131d57 # v8.0.0 with: enable-cache: false - name: Install dependencies From cc90c0e862e272026ac32f8d0bf6d818b1dc0248 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 4 Apr 2026 22:56:02 -0700 Subject: [PATCH 018/197] Build: Bump org.openapitools:openapi-generator-gradle-plugin (#15886) Bumps [org.openapitools:openapi-generator-gradle-plugin](https://github.com/OpenAPITools/openapi-generator) from 7.20.0 to 7.21.0. - [Release notes](https://github.com/OpenAPITools/openapi-generator/releases) - [Changelog](https://github.com/OpenAPITools/openapi-generator/blob/master/docs/release-summary.md) - [Commits](https://github.com/OpenAPITools/openapi-generator/compare/v7.20.0...v7.21.0) --- updated-dependencies: - dependency-name: org.openapitools:openapi-generator-gradle-plugin dependency-version: 7.21.0 dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- build.gradle | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.gradle b/build.gradle index 95163d9d614c..6200d53a172e 100644 --- a/build.gradle +++ b/build.gradle @@ -36,7 +36,7 @@ buildscript { classpath 'org.revapi:gradle-revapi:1.8.0' classpath 'com.gorylenko.gradle-git-properties:gradle-git-properties:2.5.7' classpath 'com.palantir.gradle.gitversion:gradle-git-version:4.3.0' - classpath 'org.openapitools:openapi-generator-gradle-plugin:7.20.0' + classpath 'org.openapitools:openapi-generator-gradle-plugin:7.21.0' } } From 85cd18d31652c4b8f80876ef7d6aed2eb6e9ec0b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 4 Apr 2026 22:56:17 -0700 Subject: [PATCH 019/197] Build: Bump com.google.cloud:libraries-bom from 26.78.0 to 26.79.0 (#15889) Bumps [com.google.cloud:libraries-bom](https://github.com/googleapis/java-cloud-bom) from 26.78.0 to 26.79.0. - [Release notes](https://github.com/googleapis/java-cloud-bom/releases) - [Commits](https://github.com/googleapis/java-cloud-bom/compare/v26.78.0...v26.79.0) --- updated-dependencies: - dependency-name: com.google.cloud:libraries-bom dependency-version: 26.79.0 dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- gradle/libs.versions.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index 3a2f277e4568..2f2fc9699c45 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -50,7 +50,7 @@ findbugs-jsr305 = "3.0.2" flink120 = { strictly = "1.20.1"} flink20 = { strictly = "2.0.0"} flink21 = { strictly = "2.1.0"} -google-libraries-bom = "26.78.0" +google-libraries-bom = "26.79.0" gcs-analytics-core = "1.2.3" guava = "33.5.0-jre" hadoop3 = "3.4.3" From c6ff39a69fbac7f6f059b2e0cca2fb0a0cfde7be Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 4 Apr 2026 22:56:30 -0700 Subject: [PATCH 020/197] Build: Bump software.amazon.awssdk:bom from 2.42.18 to 2.42.23 (#15890) Bumps software.amazon.awssdk:bom from 2.42.18 to 2.42.23. --- updated-dependencies: - dependency-name: software.amazon.awssdk:bom dependency-version: 2.42.23 dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- gradle/libs.versions.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index 2f2fc9699c45..4ac81332e7a6 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -33,7 +33,7 @@ arrow = "15.0.2" avro = "1.12.1" assertj-core = "3.27.7" awaitility = "4.3.0" -awssdk-bom = "2.42.18" +awssdk-bom = "2.42.23" azuresdk-bom = "1.3.5" awssdk-s3accessgrants = "2.4.1" bson-ver = "4.11.5" From a18ae8a85e8716bd7e1e4d99f23fbc5bcc403cdc Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 4 Apr 2026 23:34:02 -0700 Subject: [PATCH 021/197] Build: Bump jetty from 12.1.5 to 12.1.7 (#15887) Bumps `jetty` from 12.1.5 to 12.1.7. Updates `org.eclipse.jetty:jetty-server` from 12.1.5 to 12.1.7 Updates `org.eclipse.jetty.ee10:jetty-ee10-servlet` from 12.1.5 to 12.1.7 --- updated-dependencies: - dependency-name: org.eclipse.jetty:jetty-server dependency-version: 12.1.7 dependency-type: direct:production update-type: version-update:semver-patch - dependency-name: org.eclipse.jetty.ee10:jetty-ee10-servlet dependency-version: 12.1.7 dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- gradle/libs.versions.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index 4ac81332e7a6..acce91dc538c 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -65,7 +65,7 @@ jakarta-el-api = "3.0.3" jakarta-servlet-api = "6.1.0" jaxb-api = "2.3.1" jaxb-runtime = "2.3.9" -jetty = "12.1.5" +jetty = "12.1.7" junit = "5.14.3" junit-platform = "1.14.3" junit-pioneer = "2.3.0" From b25cb522ccaddacc4e7be4dcf48fc80dd7b823dd Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 4 Apr 2026 23:34:18 -0700 Subject: [PATCH 022/197] Build: Bump io.netty:netty-buffer from 4.2.10.Final to 4.2.12.Final (#15891) Bumps [io.netty:netty-buffer](https://github.com/netty/netty) from 4.2.10.Final to 4.2.12.Final. - [Release notes](https://github.com/netty/netty/releases) - [Commits](https://github.com/netty/netty/compare/netty-4.2.10.Final...netty-4.2.12.Final) --- updated-dependencies: - dependency-name: io.netty:netty-buffer dependency-version: 4.2.12.Final dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- gradle/libs.versions.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index acce91dc538c..4f04662ba60b 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -76,7 +76,7 @@ microprofile-openapi-api = "3.1.2" mockito = "4.11.0" mockserver = "5.15.0" nessie = "0.107.4" -netty-buffer = "4.2.10.Final" +netty-buffer = "4.2.12.Final" object-client-bundle = "3.3.2" orc = "1.9.8" parquet = "1.17.0" From ec2de913bfea2e1799aa2b72a9a9796418a8d839 Mon Sep 17 00:00:00 2001 From: Jiajia Li Date: Mon, 6 Apr 2026 04:56:13 +0800 Subject: [PATCH 023/197] AWS: Add chunked encoding configuration for S3 requests (#15242) * AWS: Add chunked encoding configuration for S3 requests * add testMultipartUploadWithChunkedEncodingDisabled * update open api define * update * update default value * update case * assert file contents in testMultipartUploadWithChunkedEncoding * Remove s3.chunked-encoding-enabled config entry from REST catalog open API spec * Use IOUtil.readFully for reliable reads in TestS3MultipartUpload * ensure testIo is properly closed * retrigger CI * Change chunked encoding default to true to match AWS SDK behavior * Fix test to verify explicit disable of chunked encoding instead of duplicating default --- .../iceberg/aws/s3/TestS3MultipartUpload.java | 90 +++++++++++++++++++ .../iceberg/aws/s3/S3FileIOProperties.java | 22 +++++ .../aws/s3/TestS3FileIOProperties.java | 21 +++++ 3 files changed, 133 insertions(+) diff --git a/aws/src/integration/java/org/apache/iceberg/aws/s3/TestS3MultipartUpload.java b/aws/src/integration/java/org/apache/iceberg/aws/s3/TestS3MultipartUpload.java index cbe3051a6711..746015098a40 100644 --- a/aws/src/integration/java/org/apache/iceberg/aws/s3/TestS3MultipartUpload.java +++ 
b/aws/src/integration/java/org/apache/iceberg/aws/s3/TestS3MultipartUpload.java @@ -21,12 +21,14 @@ import static org.assertj.core.api.Assertions.assertThat; import java.io.IOException; +import java.util.Arrays; import java.util.Random; import java.util.UUID; import java.util.function.Supplier; import java.util.stream.IntStream; import org.apache.iceberg.aws.AwsClientFactories; import org.apache.iceberg.aws.AwsIntegTestUtil; +import org.apache.iceberg.io.IOUtil; import org.apache.iceberg.io.PositionOutputStream; import org.apache.iceberg.io.SeekableInputStream; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; @@ -36,6 +38,8 @@ import org.junit.jupiter.api.Test; import org.junit.jupiter.api.condition.EnabledIfEnvironmentVariable; import org.junit.jupiter.api.condition.EnabledIfEnvironmentVariables; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; import software.amazon.awssdk.services.s3.S3Client; /** Long-running tests to ensure multipart upload logic is resilient */ @@ -141,6 +145,35 @@ public void testParallelUpload() throws IOException { } } + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testMultipartUploadWithChunkedEncoding(boolean chunkedEncodingEnabled) + throws IOException { + // Create a new S3FileIO with specified chunked encoding setting + try (S3FileIO testIo = new S3FileIO(() -> s3)) { + testIo.initialize( + ImmutableMap.of( + S3FileIOProperties.MULTIPART_SIZE, + Integer.toString(S3FileIOProperties.MULTIPART_SIZE_MIN), + S3FileIOProperties.CHECKSUM_ENABLED, + "true", + S3FileIOProperties.CHUNKED_ENCODING_ENABLED, + Boolean.toString(chunkedEncodingEnabled))); + + int parts = 10; + long partSize = S3FileIOProperties.MULTIPART_SIZE_MIN; + String suffix = chunkedEncodingEnabled ? 
"-chunked-enabled" : "-chunked-disabled"; + + String intObjectUri = objectUri + suffix + "-int"; + writeDistinctPartsWithInts(testIo, intObjectUri, parts, partSize); + verifyDistinctPartsWithInts(testIo, intObjectUri, parts, partSize); + + String bytesObjectUri = objectUri + suffix + "-bytes"; + writeDistinctPartsWithBytes(testIo, bytesObjectUri, parts, partSize); + verifyDistinctPartsWithBytes(testIo, bytesObjectUri, parts, partSize); + } + } + private void writeInts(String fileUri, int parts, Supplier writer) { writeInts(fileUri, parts, S3FileIOProperties.MULTIPART_SIZE_MIN, writer); } @@ -177,4 +210,61 @@ private void writeBytes(String fileUri, int parts, Supplier writer) { throw new RuntimeException(e); } } + + private void writeDistinctPartsWithInts(S3FileIO fileIO, String fileUri, int parts, long partSize) + throws IOException { + try (PositionOutputStream outputStream = fileIO.newOutputFile(fileUri).create()) { + for (int part = 0; part < parts; part++) { + int partByte = part + 1; + for (long j = 0; j < partSize; j++) { + outputStream.write(partByte); + } + } + } + + assertThat(fileIO.newInputFile(fileUri).getLength()).isEqualTo(parts * partSize); + } + + private void verifyDistinctPartsWithInts( + S3FileIO fileIO, String fileUri, int parts, long partSize) throws IOException { + try (SeekableInputStream inputStream = fileIO.newInputFile(fileUri).newStream()) { + byte[] readBuffer = new byte[(int) partSize]; + for (int part = 0; part < parts; part++) { + byte expectedByte = (byte) (part + 1); + IOUtil.readFully(inputStream, readBuffer, 0, (int) partSize); + for (int i = 0; i < (int) partSize; i++) { + assertThat(readBuffer[i]).as("part %d, offset %d", part, i).isEqualTo(expectedByte); + } + } + assertThat(inputStream.read()).as("expected end of stream").isEqualTo(-1); + } + } + + private void writeDistinctPartsWithBytes( + S3FileIO fileIO, String fileUri, int parts, long partSize) throws IOException { + try (PositionOutputStream outputStream = 
fileIO.newOutputFile(fileUri).create()) { + for (int part = 0; part < parts; part++) { + byte[] partBytes = new byte[(int) partSize]; + Arrays.fill(partBytes, (byte) (part + 1)); + outputStream.write(partBytes); + } + } + + assertThat(fileIO.newInputFile(fileUri).getLength()).isEqualTo(parts * partSize); + } + + private void verifyDistinctPartsWithBytes( + S3FileIO fileIO, String fileUri, int parts, long partSize) throws IOException { + try (SeekableInputStream inputStream = fileIO.newInputFile(fileUri).newStream()) { + byte[] readBuffer = new byte[(int) partSize]; + for (int part = 0; part < parts; part++) { + byte expectedByte = (byte) (part + 1); + IOUtil.readFully(inputStream, readBuffer, 0, (int) partSize); + for (int i = 0; i < (int) partSize; i++) { + assertThat(readBuffer[i]).as("part %d, offset %d", part, i).isEqualTo(expectedByte); + } + } + assertThat(inputStream.read()).as("expected end of stream").isEqualTo(-1); + } + } } diff --git a/aws/src/main/java/org/apache/iceberg/aws/s3/S3FileIOProperties.java b/aws/src/main/java/org/apache/iceberg/aws/s3/S3FileIOProperties.java index ad5181fd2798..922010d61d27 100644 --- a/aws/src/main/java/org/apache/iceberg/aws/s3/S3FileIOProperties.java +++ b/aws/src/main/java/org/apache/iceberg/aws/s3/S3FileIOProperties.java @@ -295,6 +295,18 @@ public class S3FileIOProperties implements Serializable { public static final boolean REMOTE_SIGNING_ENABLED_DEFAULT = false; + /** + * Enables or disables chunked encoding for S3 requests. + * + *

This feature is enabled by default to match the AWS SDK default behavior. + * + *

For more details see: + * https://sdk.amazonaws.com/java/api/latest/software/amazon/awssdk/services/s3/S3Configuration.html#chunkedEncodingEnabled() + */ + public static final String CHUNKED_ENCODING_ENABLED = "s3.chunked-encoding-enabled"; + + public static final boolean CHUNKED_ENCODING_ENABLED_DEFAULT = true; + /** Configure the batch size used when deleting multiple files from a given S3 bucket */ public static final String DELETE_BATCH_SIZE = "s3.delete.batch-size"; @@ -509,6 +521,7 @@ public class S3FileIOProperties implements Serializable { private String stagingDirectory; private ObjectCannedACL acl; private boolean isChecksumEnabled; + private boolean isChunkedEncodingEnabled; private final Set writeTags; private boolean isWriteTableTagEnabled; private boolean isWriteNamespaceTagEnabled; @@ -551,6 +564,7 @@ public S3FileIOProperties() { this.deleteBatchSize = DELETE_BATCH_SIZE_DEFAULT; this.stagingDirectory = System.getProperty("java.io.tmpdir"); this.isChecksumEnabled = CHECKSUM_ENABLED_DEFAULT; + this.isChunkedEncodingEnabled = CHUNKED_ENCODING_ENABLED_DEFAULT; this.writeTags = Sets.newHashSet(); this.isWriteTableTagEnabled = WRITE_TABLE_TAG_ENABLED_DEFAULT; this.isWriteNamespaceTagEnabled = WRITE_NAMESPACE_TAG_ENABLED_DEFAULT; @@ -641,6 +655,9 @@ public S3FileIOProperties(Map properties) { "Cannot support S3 CannedACL " + aclType); this.isChecksumEnabled = PropertyUtil.propertyAsBoolean(properties, CHECKSUM_ENABLED, CHECKSUM_ENABLED_DEFAULT); + this.isChunkedEncodingEnabled = + PropertyUtil.propertyAsBoolean( + properties, CHUNKED_ENCODING_ENABLED, CHUNKED_ENCODING_ENABLED_DEFAULT); this.deleteBatchSize = PropertyUtil.propertyAsInt(properties, DELETE_BATCH_SIZE, DELETE_BATCH_SIZE_DEFAULT); Preconditions.checkArgument( @@ -808,6 +825,10 @@ public boolean isChecksumEnabled() { return this.isChecksumEnabled; } + public boolean isChunkedEncodingEnabled() { + return this.isChunkedEncodingEnabled; + } + public boolean isRemoteSigningEnabled() { return 
this.isRemoteSigningEnabled; } @@ -994,6 +1015,7 @@ public void applyServiceConfigurations(T builder) { .pathStyleAccessEnabled(isPathStyleAccess) .useArnRegionEnabled(isUseArnRegionEnabled) .accelerateModeEnabled(isAccelerationEnabled) + .chunkedEncodingEnabled(isChunkedEncodingEnabled) .build()); } diff --git a/aws/src/test/java/org/apache/iceberg/aws/s3/TestS3FileIOProperties.java b/aws/src/test/java/org/apache/iceberg/aws/s3/TestS3FileIOProperties.java index 1666de1f1d08..953f73d45d4a 100644 --- a/aws/src/test/java/org/apache/iceberg/aws/s3/TestS3FileIOProperties.java +++ b/aws/src/test/java/org/apache/iceberg/aws/s3/TestS3FileIOProperties.java @@ -566,4 +566,25 @@ public void testApplyRetryConfiguration() { RetryPolicy retryPolicy = builder.overrideConfiguration().retryPolicy().get(); assertThat(retryPolicy.numRetries()).as("retries was not set").isEqualTo(999); } + + @Test + public void testChunkedEncodingEnabledDefaultValue() { + Map properties = Maps.newHashMap(); + S3FileIOProperties s3FileIOProperties = new S3FileIOProperties(properties); + + assertThat(s3FileIOProperties.isChunkedEncodingEnabled()) + .as("chunked encoding should be enabled by default") + .isTrue(); + } + + @Test + public void testChunkedEncodingDisabled() { + Map properties = Maps.newHashMap(); + properties.put(S3FileIOProperties.CHUNKED_ENCODING_ENABLED, "false"); + S3FileIOProperties s3FileIOProperties = new S3FileIOProperties(properties); + + assertThat(s3FileIOProperties.isChunkedEncodingEnabled()) + .as("chunked encoding should be disabled when explicitly set to false") + .isFalse(); + } } From e1a2713d18ae62ae91512ebbde052d137c942861 Mon Sep 17 00:00:00 2001 From: Rahul Shivu Mahadev <51690557+rahulsmahadev@users.noreply.github.com> Date: Tue, 7 Apr 2026 08:46:45 -0700 Subject: [PATCH 024/197] Core : Make REST scan planning poll timeout configurable (#15863) * Make MAX_WAIT_TIME_MS configurable for RESTTableScan * fix style * fix checkstyle: add hasMessage check to 
assertThatThrownBy Co-authored-by: Isaac * Address Amogh's comments * address comments --- .../iceberg/rest/RESTCatalogProperties.java | 5 + .../apache/iceberg/rest/RESTTableScan.java | 84 +++++++---- .../rest/RemotePlanTimeoutException.java | 26 ++++ .../iceberg/rest/TestRESTScanPlanning.java | 131 ++++++++++++++++++ 4 files changed, 216 insertions(+), 30 deletions(-) create mode 100644 core/src/main/java/org/apache/iceberg/rest/RemotePlanTimeoutException.java diff --git a/core/src/main/java/org/apache/iceberg/rest/RESTCatalogProperties.java b/core/src/main/java/org/apache/iceberg/rest/RESTCatalogProperties.java index cda81b1d0d65..c79bf2477228 100644 --- a/core/src/main/java/org/apache/iceberg/rest/RESTCatalogProperties.java +++ b/core/src/main/java/org/apache/iceberg/rest/RESTCatalogProperties.java @@ -49,6 +49,11 @@ private RESTCatalogProperties() {} public static final String REST_SCAN_PLAN_ID = "rest-scan-plan-id"; + public static final String REST_SCAN_PLANNING_POLL_TIMEOUT_MS = + "rest-scan-planning.poll-timeout-ms"; + public static final long REST_SCAN_PLANNING_POLL_TIMEOUT_MS_DEFAULT = + TimeUnit.MINUTES.toMillis(5); + // Properties that control the behaviour of the table cache used for freshness-aware table // loading. 
public static final String TABLE_CACHE_EXPIRE_AFTER_WRITE_MS = diff --git a/core/src/main/java/org/apache/iceberg/rest/RESTTableScan.java b/core/src/main/java/org/apache/iceberg/rest/RESTTableScan.java index 74fe9ebd7d4e..2a39bf1105d8 100644 --- a/core/src/main/java/org/apache/iceberg/rest/RESTTableScan.java +++ b/core/src/main/java/org/apache/iceberg/rest/RESTTableScan.java @@ -22,6 +22,7 @@ import com.github.benmanes.caffeine.cache.Caffeine; import com.github.benmanes.caffeine.cache.RemovalListener; import java.util.List; +import java.util.Locale; import java.util.Map; import java.util.Set; import java.util.concurrent.atomic.AtomicReference; @@ -50,6 +51,7 @@ import org.apache.iceberg.rest.responses.FetchPlanningResultResponse; import org.apache.iceberg.rest.responses.PlanTableScanResponse; import org.apache.iceberg.types.TypeUtil; +import org.apache.iceberg.util.PropertyUtil; import org.apache.iceberg.util.Tasks; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -59,7 +61,6 @@ class RESTTableScan extends DataTableScan { private static final long MIN_SLEEP_MS = 1000; // Initial delay private static final long MAX_SLEEP_MS = 60 * 1000; // Max backoff delay (1 minute) private static final int MAX_RETRIES = 10; // Max number of poll retries - private static final long MAX_WAIT_TIME_MS = 5 * 60 * 1000; // Total maximum duration (5 minutes) private static final double SCALE_FACTOR = 2.0; // Exponential scale factor private static final String DEFAULT_FILE_IO_IMPL = "org.apache.iceberg.io.ResolvingFileIO"; private static final Cache FILEIO_TRACKER = @@ -249,38 +250,61 @@ private FileIO scanFileIO(List storageCredentials) { } private CloseableIterable fetchPlanningResult() { + long maxWaitTimeMs = + PropertyUtil.propertyAsLong( + catalogProperties, + RESTCatalogProperties.REST_SCAN_PLANNING_POLL_TIMEOUT_MS, + RESTCatalogProperties.REST_SCAN_PLANNING_POLL_TIMEOUT_MS_DEFAULT); + Preconditions.checkArgument( + maxWaitTimeMs > 0, + "Invalid value for %s: %s (must 
be positive)", + RESTCatalogProperties.REST_SCAN_PLANNING_POLL_TIMEOUT_MS, + maxWaitTimeMs); + AtomicReference result = new AtomicReference<>(); - Tasks.foreach(planId) - .exponentialBackoff(MIN_SLEEP_MS, MAX_SLEEP_MS, MAX_WAIT_TIME_MS, SCALE_FACTOR) - .retry(MAX_RETRIES) - .onlyRetryOn(NotCompleteException.class) - .onFailure( - (id, err) -> { - LOG.warn("Planning failed for plan ID: {}", id, err); - cleanupPlanResources(); - }) - .throwFailureWhenFinished() - .run( - id -> { - FetchPlanningResultResponse response = - client.get( - resourcePaths.plan(tableIdentifier, id), - headers, - FetchPlanningResultResponse.class, - headers, - ErrorHandlers.planErrorHandler(), - parserContext); + try { + Tasks.foreach(planId) + .exponentialBackoff(MIN_SLEEP_MS, MAX_SLEEP_MS, maxWaitTimeMs, SCALE_FACTOR) + .retry(MAX_RETRIES) + .onlyRetryOn(NotCompleteException.class) + .onFailure( + (id, err) -> { + LOG.warn("Planning failed for plan ID: {}", id, err); + cleanupPlanResources(); + }) + .throwFailureWhenFinished() + .run( + id -> { + FetchPlanningResultResponse response = + client.get( + resourcePaths.plan(tableIdentifier, id), + headers, + FetchPlanningResultResponse.class, + headers, + ErrorHandlers.planErrorHandler(), + parserContext); - if (response.planStatus() == PlanStatus.SUBMITTED) { - throw new NotCompleteException(); - } else if (response.planStatus() != PlanStatus.COMPLETED) { - throw new IllegalStateException( - String.format( - "Invalid planStatus: %s for planId: %s", response.planStatus(), id)); - } + if (response.planStatus() == PlanStatus.SUBMITTED) { + throw new NotCompleteException(); + } else if (response.planStatus() != PlanStatus.COMPLETED) { + throw new IllegalStateException( + String.format( + "Invalid planStatus: %s for planId: %s", response.planStatus(), id)); + } - result.set(response); - }); + result.set(response); + }); + } catch (NotCompleteException e) { + throw new RemotePlanTimeoutException( + String.format( + Locale.ROOT, + "Remote scan 
planning for planId: %s did not complete within configured limits" + + " (timeout=%d ms, maxRetries=%d)", + planId, + maxWaitTimeMs, + MAX_RETRIES), + e); + } FetchPlanningResultResponse response = result.get(); diff --git a/core/src/main/java/org/apache/iceberg/rest/RemotePlanTimeoutException.java b/core/src/main/java/org/apache/iceberg/rest/RemotePlanTimeoutException.java new file mode 100644 index 000000000000..e0f01aadd612 --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/rest/RemotePlanTimeoutException.java @@ -0,0 +1,26 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.rest; + +/** Thrown when server-side scan planning does not complete before the client deadline. 
*/ +class RemotePlanTimeoutException extends RuntimeException { + RemotePlanTimeoutException(String message, Throwable cause) { + super(message, cause); + } +} diff --git a/core/src/test/java/org/apache/iceberg/rest/TestRESTScanPlanning.java b/core/src/test/java/org/apache/iceberg/rest/TestRESTScanPlanning.java index a7fbe43463ac..6fc67727bf23 100644 --- a/core/src/test/java/org/apache/iceberg/rest/TestRESTScanPlanning.java +++ b/core/src/test/java/org/apache/iceberg/rest/TestRESTScanPlanning.java @@ -1153,6 +1153,137 @@ public void serverSupportsPlanningButNotCancellation() throws IOException { assertThat(cancelled).isFalse(); } + @Test + public void asyncPlanningRespectsConfigurablePollTimeout() { + // Create an adapter that always returns SUBMITTED (never completes) + List endpoints = + endpointsWithPlanning( + Endpoint.V1_SUBMIT_TABLE_SCAN_PLAN, + Endpoint.V1_FETCH_TABLE_SCAN_PLAN, + Endpoint.V1_CANCEL_TABLE_SCAN_PLAN, + Endpoint.V1_FETCH_TABLE_SCAN_PLAN_TASKS); + + RESTCatalogAdapter adapter = + Mockito.spy( + new RESTCatalogAdapter(backendCatalog) { + @Override + public T execute( + HTTPRequest request, + Class responseType, + Consumer errorHandler, + Consumer> responseHeaders, + ParserContext parserContext) { + if (ResourcePaths.config().equals(request.path())) { + return castResponse( + responseType, ConfigResponse.builder().withEndpoints(endpoints).build()); + } + T response = + super.execute( + request, responseType, errorHandler, responseHeaders, parserContext); + if (response instanceof LoadTableResponse) { + return castResponse( + responseType, + withPlanningMode( + (LoadTableResponse) response, + RESTCatalogProperties.ScanPlanningMode.SERVER.modeName())); + } + + // Override fetch responses to always return SUBMITTED so the poll never completes + if (response instanceof FetchPlanningResultResponse) { + return castResponse( + responseType, + FetchPlanningResultResponse.builder() + .withPlanStatus(PlanStatus.SUBMITTED) + .build()); + } + + return 
response; + } + }); + + adapter.setPlanningBehavior(TestPlanningBehavior.builder().asynchronous().build()); + + RESTCatalog catalog = + new RESTCatalog(SessionCatalog.SessionContext.createEmpty(), (config) -> adapter); + catalog.initialize( + "test-poll-timeout", + ImmutableMap.of( + CatalogProperties.FILE_IO_IMPL, + "org.apache.iceberg.inmemory.InMemoryFileIO", + RESTCatalogProperties.SCAN_PLANNING_MODE, + RESTCatalogProperties.ScanPlanningMode.SERVER.modeName(), + RESTCatalogProperties.REST_SCAN_PLANNING_POLL_TIMEOUT_MS, + "1")); + + RESTTable table = restTableFor(catalog, "poll_timeout_test"); + setParserContext(table); + RESTTableScan scan = restTableScanFor(table); + + // With a 1ms timeout and a server that never completes, planFiles should fail + assertThatThrownBy(scan::planFiles) + .isInstanceOf(RemotePlanTimeoutException.class) + .hasMessageContaining("did not complete within configured limits"); + } + + @Test + public void asyncPlanningSucceedsWithCustomTimeout() { + List endpoints = + endpointsWithPlanning( + Endpoint.V1_SUBMIT_TABLE_SCAN_PLAN, + Endpoint.V1_FETCH_TABLE_SCAN_PLAN, + Endpoint.V1_CANCEL_TABLE_SCAN_PLAN, + Endpoint.V1_FETCH_TABLE_SCAN_PLAN_TASKS); + + CatalogWithAdapter catalogWithAdapter = + catalogWithEndpoints(endpoints, TestPlanningBehavior.builder().asynchronous().build()); + + catalogWithAdapter.catalog.initialize( + "test-custom-timeout", + ImmutableMap.of( + CatalogProperties.FILE_IO_IMPL, + "org.apache.iceberg.inmemory.InMemoryFileIO", + RESTCatalogProperties.SCAN_PLANNING_MODE, + RESTCatalogProperties.ScanPlanningMode.SERVER.modeName(), + RESTCatalogProperties.REST_SCAN_PLANNING_POLL_TIMEOUT_MS, + "30000")); + + RESTTable table = restTableFor(catalogWithAdapter.catalog, "custom_timeout_success"); + setParserContext(table); + assertThat(table.newScan().planFiles()).hasSize(1); + } + + @Test + public void asyncPlanningRejectsInvalidTimeout() { + List endpoints = + endpointsWithPlanning( + Endpoint.V1_SUBMIT_TABLE_SCAN_PLAN, + 
Endpoint.V1_FETCH_TABLE_SCAN_PLAN, + Endpoint.V1_CANCEL_TABLE_SCAN_PLAN, + Endpoint.V1_FETCH_TABLE_SCAN_PLAN_TASKS); + + CatalogWithAdapter catalogWithAdapter = + catalogWithEndpoints(endpoints, TestPlanningBehavior.builder().asynchronous().build()); + + // re-initialize with an invalid timeout + catalogWithAdapter.catalog.initialize( + "test-invalid-timeout", + ImmutableMap.of( + CatalogProperties.FILE_IO_IMPL, + "org.apache.iceberg.inmemory.InMemoryFileIO", + RESTCatalogProperties.SCAN_PLANNING_MODE, + RESTCatalogProperties.ScanPlanningMode.SERVER.modeName(), + RESTCatalogProperties.REST_SCAN_PLANNING_POLL_TIMEOUT_MS, + "-1")); + + RESTTable table = restTableFor(catalogWithAdapter.catalog, "invalid_timeout_test"); + setParserContext(table); + RESTTableScan scan = restTableScanFor(table); + + assertThatThrownBy(scan::planFiles) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("must be positive"); + } + @ParameterizedTest @EnumSource(PlanningMode.class) void fileIOForRemotePlanningIsPropagated( From 66870849f2fa1b40f512971a9ad86bf912e7649d Mon Sep 17 00:00:00 2001 From: Ryan Blue Date: Fri, 10 Apr 2026 10:07:38 -0700 Subject: [PATCH 025/197] Spark 4.1: Add runtime-deps.txt. 
(#15860) --- spark/v4.1/spark-runtime/runtime-deps.txt | 52 +++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 spark/v4.1/spark-runtime/runtime-deps.txt diff --git a/spark/v4.1/spark-runtime/runtime-deps.txt b/spark/v4.1/spark-runtime/runtime-deps.txt new file mode 100644 index 000000000000..a662942f9cd1 --- /dev/null +++ b/spark/v4.1/spark-runtime/runtime-deps.txt @@ -0,0 +1,52 @@ +com.aliyun:credentials-java:0.3.12 +com.aliyun:tea:1.4.1 +com.fasterxml.jackson.core:jackson-annotations:2.21 +com.fasterxml.jackson.core:jackson-core:2.15.2 +com.fasterxml.jackson.core:jackson-databind:2.15.2 +com.fasterxml.jackson.datatype:jackson-datatype-jsr310:2.21.2 +com.github.ben-manes.caffeine:caffeine:2.9.3 +com.google.code.gson:gson:2.11.0 +com.google.errorprone:error_prone_annotations:2.27.0 +com.google.flatbuffers:flatbuffers-java:23.5.26 +com.squareup.okhttp3:okhttp:4.12.0 +com.squareup.okio:okio-jvm:3.6.0 +com.sun.xml.bind:jaxb-core:2.3.0 +com.sun.xml.bind:jaxb-impl:2.3.0 +dev.failsafe:failsafe:3.3.2 +io.airlift:aircompressor:2.0.3 +io.netty:netty-buffer:4.2.10.Final +io.netty:netty-common:4.2.10.Final +org.apache.arrow:arrow-format:15.0.2 +org.apache.arrow:arrow-memory-core:15.0.2 +org.apache.arrow:arrow-memory-netty:15.0.2 +org.apache.arrow:arrow-vector:15.0.2 +org.apache.avro:avro:1.12.1 +org.apache.datasketches:datasketches-java:6.2.0 +org.apache.datasketches:datasketches-memory:3.0.2 +org.apache.httpcomponents.client5:httpclient5:5.6 +org.apache.httpcomponents.core5:httpcore5-h2:5.4 +org.apache.httpcomponents.core5:httpcore5:5.4 +org.apache.orc:orc-core:1.9.8 +org.apache.orc:orc-shims:1.9.8 +org.apache.parquet:parquet-avro:1.17.0 +org.apache.parquet:parquet-column:1.17.0 +org.apache.parquet:parquet-common:1.17.0 +org.apache.parquet:parquet-encoding:1.17.0 +org.apache.parquet:parquet-format-structures:1.17.0 +org.apache.parquet:parquet-hadoop:1.17.0 +org.apache.parquet:parquet-jackson:1.17.0 +org.apache.parquet:parquet-variant:1.17.0 
+org.checkerframework:checker-qual:3.19.0 +org.eclipse.collections:eclipse-collections-api:11.1.0 +org.eclipse.collections:eclipse-collections:11.1.0 +org.eclipse.microprofile.openapi:microprofile-openapi-api:4.1.1 +org.jacoco:org.jacoco.agent:0.8.8 +org.jetbrains.kotlin:kotlin-stdlib-common:1.9.10 +org.jetbrains.kotlin:kotlin-stdlib-jdk7:1.9.10 +org.jetbrains.kotlin:kotlin-stdlib-jdk8:1.9.10 +org.jetbrains.kotlin:kotlin-stdlib:1.9.10 +org.locationtech.jts:jts-core:1.20.0 +org.projectnessie.nessie:nessie-client:0.107.4 +org.projectnessie.nessie:nessie-model:0.107.4 +org.roaringbitmap:RoaringBitmap:1.6.13 +org.threeten:threeten-extra:1.7.1 From 9586899fab3cc15fc0d0a5c9060b98ff632974e6 Mon Sep 17 00:00:00 2001 From: Wing Yew Poon Date: Fri, 10 Apr 2026 13:45:54 -0700 Subject: [PATCH 026/197] Update documentation on Spark migrate procedure (#15874) ... in light of https://github.com/apache/iceberg/pull/15429. --- docs/docs/spark-procedures.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/docs/spark-procedures.md b/docs/docs/spark-procedures.md index 45e55a9365a5..8e594caa12d4 100644 --- a/docs/docs/spark-procedures.md +++ b/docs/docs/spark-procedures.md @@ -637,6 +637,7 @@ Replace a table with an Iceberg table, loaded with the source's data files. Table schema, partitioning, properties, and location will be copied from the source table. Migrate will fail if any table partition uses an unsupported format. Supported formats are Avro, Parquet, and ORC. +Migrate will also fail if the table is bucketed, as the bucketing will not be preserved. Existing data files are added to the Iceberg table's metadata and can be read using a name-to-id mapping created from the original table schema. To leave the original table intact while testing, use [`snapshot`](#snapshot) to create new temporary table that shares source data files and schema. 
From 1b27a582d385d4b3341e8246f5327af237c41052 Mon Sep 17 00:00:00 2001 From: jackylee Date: Sat, 11 Apr 2026 04:52:39 +0800 Subject: [PATCH 027/197] Docs: Add Hive Metastore schema validation warnings for schema evolution with Hive catalog (#15814) * Docs: Add Hive Metastore schema validation warnings for DROP COLUMN and REORDER When using a Hive catalog, ALTER TABLE DROP COLUMN (non-last column) and ALTER COLUMN REORDER fail because the Hive Metastore validates schema changes by comparing column types positionally. Dropping a middle column shifts subsequent columns, causing HMS to reject the change as an incompatible type change via MetaStoreUtils#throwExceptionIfIncompatibleColTypeChange. Add warning admonitions to spark-ddl.md (DROP COLUMN and REORDER sections) and flink-ddl.md (Hive catalog section) documenting the limitation, workaround (hive.metastore.disallow.incompatible.col.type.changes=false), and trade-off (Hive engine can no longer read the table). * Docs: Clarify HMS workaround for embedded vs remote deployment * Docs: add more warning for spark-ddl.md * Apply suggestions from code review Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Apply suggestions from code review Co-authored-by: Manu Zhang --------- Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: Manu Zhang --- docs/docs/flink-ddl.md | 17 +++++++++++++++++ docs/docs/spark-ddl.md | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+) diff --git a/docs/docs/flink-ddl.md b/docs/docs/flink-ddl.md index 756256f0df4f..0a9b26712235 100644 --- a/docs/docs/flink-ddl.md +++ b/docs/docs/flink-ddl.md @@ -45,6 +45,23 @@ The following properties can be set if using the Hive catalog: * `hive-conf-dir`: Path to a directory containing a `hive-site.xml` configuration file which will be used to provide custom Hive configuration values. 
The value of `hive.metastore.warehouse.dir` from `/hive-site.xml` (or hive configure file from classpath) will be overwritten with the `warehouse` value if setting both `hive-conf-dir` and `warehouse` when creating iceberg catalog. * `hadoop-conf-dir`: Path to a directory containing `core-site.xml` and `hdfs-site.xml` configuration files which will be used to provide custom Hadoop configuration values. +!!! warning "Hive Catalog Limitation" + The Hive Metastore (HMS) validates schema changes by comparing column types **positionally** + (`hive.metastore.disallow.incompatible.col.type.changes`, default `true`). When using a Hive catalog, + schema evolution operations that change column positions — such as dropping a non-last column or + reordering columns — may fail regardless of which engine performs the change (Spark, Flink Java API, etc.). + + To work around this, disable the HMS schema compatibility check by setting + `hive.metastore.disallow.incompatible.col.type.changes=false`: + + - **Remote HMS:** Set this property in the HMS server's `hive-site.xml`. + - **Embedded HMS:** Add the equivalent property to the Hive catalog configuration. + + **Trade-off:** After disabling this check, the Hive engine may no longer be able to read the table + correctly due to the schema mismatch in the Hive Metastore. Iceberg-aware engines (Spark, Flink, + Trino, etc.) will continue to work correctly, as they read schema from Iceberg metadata rather + than the Hive Metastore. 
+ #### Hadoop catalog Iceberg also supports a directory-based catalog in HDFS that can be configured using `'catalog-type'='hadoop'`: diff --git a/docs/docs/spark-ddl.md b/docs/docs/spark-ddl.md index 4d227c2db4f9..9fa6c0e7d3c7 100644 --- a/docs/docs/spark-ddl.md +++ b/docs/docs/spark-ddl.md @@ -173,6 +173,27 @@ Iceberg has full `ALTER TABLE` support in Spark 3, including: In addition, [SQL extensions](spark-configuration.md#sql-extensions) can be used to add support for partition evolution and setting a table's write order +!!! warning "Hive Catalog Limitation" + The Hive Metastore (HMS) validates schema changes by comparing column types **positionally** + (`hive.metastore.disallow.incompatible.col.type.changes`, default `true`). Any schema evolution + operation that shifts column positions will fail when using a Hive catalog. Affected operations + include: + + - `ADD COLUMN` with `FIRST` or `AFTER` clauses + - `ALTER COLUMN` with `FIRST` or `AFTER` clauses (reordering) + - `DROP COLUMN` on a non-last column + + To work around this, disable the HMS schema compatibility check by setting + `hive.metastore.disallow.incompatible.col.type.changes=false`: + + - **Remote HMS:** Set this property in the HMS server's `hive-site.xml`. + - **Embedded HMS:** Pass `--conf spark.hadoop.hive.metastore.disallow.incompatible.col.type.changes=false` when starting Spark. + + **Trade-off:** After disabling this check, the Hive engine may no longer be able to read the table + correctly due to the schema mismatch in the Hive Metastore. Iceberg-aware engines (Spark, Flink, + Trino, etc.) will continue to work correctly, as they read schema from Iceberg metadata rather + than HMS. + ### `ALTER TABLE ... RENAME TO` ```sql @@ -262,6 +283,11 @@ ALTER TABLE prod.db.sample ADD COLUMN nested.new_column bigint FIRST; ``` +!!! warning "Hive Catalog Limitation" + When using a Hive catalog, adding a column with `FIRST` or `AFTER` may fail due to HMS positional + schema validation. 
See the warning above for details + and workaround. + ### `ALTER TABLE ... RENAME COLUMN` Iceberg allows any field to be renamed. To rename a field, use `RENAME COLUMN`: @@ -305,6 +331,10 @@ ALTER TABLE prod.db.sample ALTER COLUMN col FIRST; ALTER TABLE prod.db.sample ALTER COLUMN nested.col AFTER other_col; ``` +!!! warning "Hive Catalog Limitation" + When using a Hive catalog, reordering columns may fail due to HMS positional schema validation. + See the Hive Catalog Limitation note above for details and workaround. + Nullability for a non-nullable column can be changed using `DROP NOT NULL`: ```sql @@ -326,6 +356,11 @@ ALTER TABLE prod.db.sample DROP COLUMN id; ALTER TABLE prod.db.sample DROP COLUMN point.z; ``` +!!! warning "Hive Catalog Limitation" + When using a Hive catalog, dropping a non-last column may fail due to HMS positional schema + validation. See the earlier Hive Catalog Limitation warning above for details and + workaround. + ## `ALTER TABLE` SQL extensions These commands are available in Spark 3 when using Iceberg [SQL extensions](spark-configuration.md#sql-extensions). From 4c215926a0f42e7d682339193606fa5775ce07b0 Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Fri, 10 Apr 2026 16:35:33 -0700 Subject: [PATCH 028/197] Build: Fix zizmor and Spark 4.1 runtime-deps CI failures (#15937) Fix zizmor ref-version-mismatch audit failure caused by the rolling v7 tag moving to v7.0.1 while workflows pinned the v7.0.0 hash. Regenerate Spark 4.1 runtime-deps.txt to reflect dependency changes from recent dependabot bumps. 
Made-with: Cursor Co-authored-by: Neelesh Salian --- .github/workflows/api-binary-compatibility.yml | 2 +- .github/workflows/delta-conversion-ci.yml | 4 ++-- .github/workflows/flink-ci.yml | 2 +- .github/workflows/hive-ci.yml | 2 +- .github/workflows/java-ci.yml | 2 +- .github/workflows/jmh-benchmarks.yml | 2 +- .github/workflows/kafka-connect-ci.yml | 2 +- .github/workflows/recurring-jmh-benchmarks.yml | 2 +- .github/workflows/spark-ci.yml | 2 +- spark/v4.1/spark-runtime/runtime-deps.txt | 18 +++--------------- 10 files changed, 13 insertions(+), 25 deletions(-) diff --git a/.github/workflows/api-binary-compatibility.yml b/.github/workflows/api-binary-compatibility.yml index da04904fb769..d91ba210ab61 100644 --- a/.github/workflows/api-binary-compatibility.yml +++ b/.github/workflows/api-binary-compatibility.yml @@ -63,7 +63,7 @@ jobs: - run: | echo "Using the old version tag, as per git describe, of $(git describe)"; - run: ./gradlew revapi --rerun-tasks - - uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7 + - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 if: failure() with: name: test logs diff --git a/.github/workflows/delta-conversion-ci.yml b/.github/workflows/delta-conversion-ci.yml index 4c576e88a896..aac791ab8b1e 100644 --- a/.github/workflows/delta-conversion-ci.yml +++ b/.github/workflows/delta-conversion-ci.yml @@ -90,7 +90,7 @@ jobs: - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 # zizmor: ignore[cache-poisoning] -- cache writes are restricted to the default branch by setup-gradle - run: echo -e "$(ip addr show eth0 | grep "inet\b" | awk '{print $2}' | cut -d/ -f1)\t$(hostname -f) $(hostname -s)" | sudo tee -a /etc/hosts - run: ./gradlew -DsparkVersions=3.5 -DscalaVersion=2.12 -DkafkaVersions= -DflinkVersions= :iceberg-delta-lake:check -Pquick=true -x javadoc - - uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7 + - uses: 
actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 if: failure() with: name: test logs @@ -116,7 +116,7 @@ jobs: - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 # zizmor: ignore[cache-poisoning] -- cache writes are restricted to the default branch by setup-gradle - run: echo -e "$(ip addr show eth0 | grep "inet\b" | awk '{print $2}' | cut -d/ -f1)\t$(hostname -f) $(hostname -s)" | sudo tee -a /etc/hosts - run: ./gradlew -DsparkVersions=3.5 -DscalaVersion=2.13 -DkafkaVersions= -DflinkVersions= :iceberg-delta-lake:check -Pquick=true -x javadoc - - uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7 + - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 if: failure() with: name: test logs diff --git a/.github/workflows/flink-ci.yml b/.github/workflows/flink-ci.yml index 8f49b1c6242f..3f346b21846e 100644 --- a/.github/workflows/flink-ci.yml +++ b/.github/workflows/flink-ci.yml @@ -94,7 +94,7 @@ jobs: - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 # zizmor: ignore[cache-poisoning] -- cache writes are restricted to the default branch by setup-gradle - run: echo -e "$(ip addr show eth0 | grep "inet\b" | awk '{print $2}' | cut -d/ -f1)\t$(hostname -f) $(hostname -s)" | sudo tee -a /etc/hosts - run: ./gradlew -DsparkVersions= -DkafkaVersions= -DflinkVersions=${{ matrix.flink }} :iceberg-flink:iceberg-flink-${{ matrix.flink }}:check :iceberg-flink:iceberg-flink-runtime-${{ matrix.flink }}:check -Pquick=true -x javadoc -DtestParallelism=auto - - uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7 + - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 if: failure() with: name: test logs diff --git a/.github/workflows/hive-ci.yml b/.github/workflows/hive-ci.yml index 8effb1d9e9a0..3f9adf2dee9f 100644 --- a/.github/workflows/hive-ci.yml +++ b/.github/workflows/hive-ci.yml @@ -91,7 
+91,7 @@ jobs: - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 # zizmor: ignore[cache-poisoning] -- cache writes are restricted to the default branch by setup-gradle - run: echo -e "$(ip addr show eth0 | grep "inet\b" | awk '{print $2}' | cut -d/ -f1)\t$(hostname -f) $(hostname -s)" | sudo tee -a /etc/hosts - run: ./gradlew -DsparkVersions= -DflinkVersions= -DkafkaVersions= -Pquick=true :iceberg-mr:check -x javadoc - - uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7 + - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 if: failure() with: name: test logs diff --git a/.github/workflows/java-ci.yml b/.github/workflows/java-ci.yml index e8ac497ab04a..5cc8b198a150 100644 --- a/.github/workflows/java-ci.yml +++ b/.github/workflows/java-ci.yml @@ -86,7 +86,7 @@ jobs: - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 # zizmor: ignore[cache-poisoning] -- cache writes are restricted to the default branch by setup-gradle - run: echo -e "$(ip addr show eth0 | grep "inet\b" | awk '{print $2}' | cut -d/ -f1)\t$(hostname -f) $(hostname -s)" | sudo tee -a /etc/hosts - run: ./gradlew check -DsparkVersions= -DflinkVersions= -DkafkaVersions= -Pquick=true -x javadoc - - uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7 + - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 if: failure() with: name: test logs diff --git a/.github/workflows/jmh-benchmarks.yml b/.github/workflows/jmh-benchmarks.yml index 354bb1e106f4..9b01352d9aaa 100644 --- a/.github/workflows/jmh-benchmarks.yml +++ b/.github/workflows/jmh-benchmarks.yml @@ -113,7 +113,7 @@ jobs: BENCHMARK: ${{ matrix.benchmark }} run: ./gradlew -DsparkVersions=${SPARK_VERSION} -DscalaVersion=${SCALA_VERSION} :iceberg-spark:iceberg-spark-${SPARK_VERSION}_${SCALA_VERSION}:jmh -PjmhIncludeRegex=${BENCHMARK} -PjmhOutputPath=benchmark/${BENCHMARK}.txt - - uses: 
actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7 + - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 if: ${{ always() }} with: name: benchmark-${{ matrix.benchmark }} diff --git a/.github/workflows/kafka-connect-ci.yml b/.github/workflows/kafka-connect-ci.yml index fc86b77bcefc..c0490fd6981c 100644 --- a/.github/workflows/kafka-connect-ci.yml +++ b/.github/workflows/kafka-connect-ci.yml @@ -97,7 +97,7 @@ jobs: :iceberg-kafka-connect:iceberg-kafka-connect:check \ :iceberg-kafka-connect:iceberg-kafka-connect-runtime:check \ -Pquick=true -x javadoc - - uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7 + - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 if: failure() with: name: test logs diff --git a/.github/workflows/recurring-jmh-benchmarks.yml b/.github/workflows/recurring-jmh-benchmarks.yml index da918a6972b4..4ca900746289 100644 --- a/.github/workflows/recurring-jmh-benchmarks.yml +++ b/.github/workflows/recurring-jmh-benchmarks.yml @@ -64,7 +64,7 @@ jobs: - name: Run Benchmark run: ./gradlew -DsparkVersions=${{ matrix.spark }} -DscalaVersion=${{ matrix.scala }} :iceberg-spark:iceberg-spark-${{ matrix.spark }}_${{ matrix.scala }}:jmh -PjmhIncludeRegex=${{ matrix.benchmark }} -PjmhOutputPath=benchmark/${{ matrix.benchmark }}.txt -PjmhJsonOutputPath=benchmark/${{ matrix.benchmark }}.json - - uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7 + - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 if: ${{ always() }} with: name: benchmark-${{ matrix.benchmark }} diff --git a/.github/workflows/spark-ci.yml b/.github/workflows/spark-ci.yml index d346e238a6c4..fd55efba97c5 100644 --- a/.github/workflows/spark-ci.yml +++ b/.github/workflows/spark-ci.yml @@ -109,7 +109,7 @@ jobs: :iceberg-spark:iceberg-spark-extensions-${{ matrix.spark }}_${{ matrix.scala }}:check \ :iceberg-spark:iceberg-spark-runtime-${{ 
matrix.spark }}_${{ matrix.scala }}:check \ -Pquick=true -x javadoc - - uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7 + - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 if: failure() with: name: test logs diff --git a/spark/v4.1/spark-runtime/runtime-deps.txt b/spark/v4.1/spark-runtime/runtime-deps.txt index a662942f9cd1..e275e24372af 100644 --- a/spark/v4.1/spark-runtime/runtime-deps.txt +++ b/spark/v4.1/spark-runtime/runtime-deps.txt @@ -1,21 +1,14 @@ -com.aliyun:credentials-java:0.3.12 -com.aliyun:tea:1.4.1 com.fasterxml.jackson.core:jackson-annotations:2.21 com.fasterxml.jackson.core:jackson-core:2.15.2 com.fasterxml.jackson.core:jackson-databind:2.15.2 com.fasterxml.jackson.datatype:jackson-datatype-jsr310:2.21.2 com.github.ben-manes.caffeine:caffeine:2.9.3 -com.google.code.gson:gson:2.11.0 -com.google.errorprone:error_prone_annotations:2.27.0 +com.google.errorprone:error_prone_annotations:2.10.0 com.google.flatbuffers:flatbuffers-java:23.5.26 -com.squareup.okhttp3:okhttp:4.12.0 -com.squareup.okio:okio-jvm:3.6.0 -com.sun.xml.bind:jaxb-core:2.3.0 -com.sun.xml.bind:jaxb-impl:2.3.0 dev.failsafe:failsafe:3.3.2 io.airlift:aircompressor:2.0.3 -io.netty:netty-buffer:4.2.10.Final -io.netty:netty-common:4.2.10.Final +io.netty:netty-buffer:4.2.12.Final +io.netty:netty-common:4.2.12.Final org.apache.arrow:arrow-format:15.0.2 org.apache.arrow:arrow-memory-core:15.0.2 org.apache.arrow:arrow-memory-netty:15.0.2 @@ -40,11 +33,6 @@ org.checkerframework:checker-qual:3.19.0 org.eclipse.collections:eclipse-collections-api:11.1.0 org.eclipse.collections:eclipse-collections:11.1.0 org.eclipse.microprofile.openapi:microprofile-openapi-api:4.1.1 -org.jacoco:org.jacoco.agent:0.8.8 -org.jetbrains.kotlin:kotlin-stdlib-common:1.9.10 -org.jetbrains.kotlin:kotlin-stdlib-jdk7:1.9.10 -org.jetbrains.kotlin:kotlin-stdlib-jdk8:1.9.10 -org.jetbrains.kotlin:kotlin-stdlib:1.9.10 org.locationtech.jts:jts-core:1.20.0 
org.projectnessie.nessie:nessie-client:0.107.4 org.projectnessie.nessie:nessie-model:0.107.4 From 9242be6ddbae41965afb5ed5f1fbf6be163e3700 Mon Sep 17 00:00:00 2001 From: Eduard Tudenhoefner Date: Sat, 11 Apr 2026 03:08:35 +0200 Subject: [PATCH 029/197] Revert "Build: bump shadow-gradle-plugin to 9.4.1 (#15835)" (#15941) This reverts commit 9a939d68358de9dac2c6ba9b236b675ebe477490. --- build.gradle | 2 +- deploy.gradle | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/build.gradle b/build.gradle index 6200d53a172e..306da2abf27a 100644 --- a/build.gradle +++ b/build.gradle @@ -26,7 +26,7 @@ buildscript { gradlePluginPortal() } dependencies { - classpath 'com.gradleup.shadow:shadow-gradle-plugin:9.4.1' + classpath 'com.gradleup.shadow:shadow-gradle-plugin:8.3.10' classpath 'com.palantir.baseline:gradle-baseline-java:6.90.0' classpath 'com.diffplug.spotless:spotless-plugin-gradle:8.4.0' classpath 'gradle.plugin.org.inferred:gradle-processors:3.7.0' diff --git a/deploy.gradle b/deploy.gradle index e1f26ec3e416..740d0056273b 100644 --- a/deploy.gradle +++ b/deploy.gradle @@ -80,7 +80,7 @@ subprojects { if (tasks.matching({task -> task.name == 'shadowJar'}).isEmpty()) { from components.java } else { - from components.shadow + project.shadow.component(it) } artifact sourceJar From 7e4aa89d9900a52620afd1456152b63b47f2223b Mon Sep 17 00:00:00 2001 From: Eduard Tudenhoefner Date: Sat, 11 Apr 2026 16:04:19 +0200 Subject: [PATCH 030/197] AWS, Core: Switch Jetty to use new Compression API for GZIP (#15043) --- .../aws/s3/signer/TestS3RestSigner.java | 8 +++++--- build.gradle | 8 ++++++-- .../iceberg/rest/TestBaseWithRESTServer.java | 19 ++++++++++++++++--- .../rest/TestFreshnessAwareLoading.java | 5 +++++ .../apache/iceberg/rest/TestRESTCatalog.java | 8 +++++--- .../iceberg/rest/TestRESTViewCatalog.java | 8 +++++--- ...RESTViewCatalogWithAssumedViewSupport.java | 8 +++++--- gradle/libs.versions.toml | 3 ++- .../iceberg/rest/RESTCatalogServer.java | 8 +++++--- 
9 files changed, 54 insertions(+), 21 deletions(-) diff --git a/aws/src/integration/java/org/apache/iceberg/aws/s3/signer/TestS3RestSigner.java b/aws/src/integration/java/org/apache/iceberg/aws/s3/signer/TestS3RestSigner.java index f09360915725..d229976d5157 100644 --- a/aws/src/integration/java/org/apache/iceberg/aws/s3/signer/TestS3RestSigner.java +++ b/aws/src/integration/java/org/apache/iceberg/aws/s3/signer/TestS3RestSigner.java @@ -37,10 +37,11 @@ import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.rest.auth.OAuth2Properties; import org.apache.iceberg.util.ThreadPools; +import org.eclipse.jetty.compression.gzip.GzipCompression; +import org.eclipse.jetty.compression.server.CompressionHandler; import org.eclipse.jetty.ee10.servlet.ServletContextHandler; import org.eclipse.jetty.ee10.servlet.ServletHolder; import org.eclipse.jetty.server.Server; -import org.eclipse.jetty.server.handler.gzip.GzipHandler; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeAll; @@ -181,7 +182,6 @@ public void before() throws Exception { CreateMultipartUploadRequest.builder().bucket(BUCKET).key("random/multipart-key").build()); } - @SuppressWarnings("removal") private static Server initHttpServer() throws Exception { S3SignerServlet.SignRequestValidator deleteObjectsWithBody = new S3SignerServlet.SignRequestValidator( @@ -195,7 +195,9 @@ private static Server initHttpServer() throws Exception { ServletContextHandler servletContext = new ServletContextHandler(ServletContextHandler.NO_SESSIONS); servletContext.addServlet(new ServletHolder(servlet), "/*"); - servletContext.setHandler(new GzipHandler()); + CompressionHandler compressionHandler = new CompressionHandler(); + compressionHandler.putCompression(new GzipCompression()); + servletContext.insertHandler(compressionHandler); Server server = new Server(new InetSocketAddress(InetAddress.getLoopbackAddress(), 0)); 
server.setHandler(servletContext); diff --git a/build.gradle b/build.gradle index 306da2abf27a..b0dede64eac8 100644 --- a/build.gradle +++ b/build.gradle @@ -396,7 +396,8 @@ project(':iceberg-core') { testImplementation libs.jetty.servlet testImplementation libs.jakarta.servlet - testImplementation libs.jetty.server + testImplementation libs.jetty.compression.server + testImplementation libs.jetty.compression.gzip testImplementation libs.mockserver.netty testImplementation libs.mockserver.client.java testImplementation libs.sqlite.jdbc @@ -545,6 +546,8 @@ project(':iceberg-aws') { testImplementation project(path: ':iceberg-core', configuration: 'testArtifacts') testImplementation libs.awaitility testImplementation libs.jetty.servlet + testImplementation libs.jetty.compression.server + testImplementation libs.jetty.compression.gzip testImplementation libs.mockito.junit.jupiter } @@ -1103,7 +1106,8 @@ project(':iceberg-open-api') { testFixturesImplementation libs.slf4j.simple testFixturesImplementation libs.jetty.servlet - testFixturesImplementation libs.jetty.server + testFixturesImplementation libs.jetty.compression.server + testFixturesImplementation libs.jetty.compression.gzip testFixturesImplementation libs.sqlite.jdbc testFixturesCompileOnly libs.apiguardian diff --git a/core/src/test/java/org/apache/iceberg/rest/TestBaseWithRESTServer.java b/core/src/test/java/org/apache/iceberg/rest/TestBaseWithRESTServer.java index c386ecf60f67..9cab8b1f240e 100644 --- a/core/src/test/java/org/apache/iceberg/rest/TestBaseWithRESTServer.java +++ b/core/src/test/java/org/apache/iceberg/rest/TestBaseWithRESTServer.java @@ -34,10 +34,11 @@ import org.apache.iceberg.inmemory.InMemoryCatalog; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.rest.responses.ErrorResponse; +import org.eclipse.jetty.compression.gzip.GzipCompression; +import org.eclipse.jetty.compression.server.CompressionHandler; import 
org.eclipse.jetty.ee10.servlet.ServletContextHandler; import org.eclipse.jetty.ee10.servlet.ServletHolder; import org.eclipse.jetty.server.Server; -import org.eclipse.jetty.server.handler.gzip.GzipHandler; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.io.TempDir; @@ -61,7 +62,15 @@ public abstract class TestBaseWithRESTServer { @TempDir private Path temp; - @SuppressWarnings("removal") + /** + * GZIP responses interfere with freshness-aware loading tests that assert on {@code ETag} and + * conditional requests. Subclasses may disable HTTP compression while keeping the default for + * other REST catalog tests. + */ + protected boolean useHttpCompression() { + return true; + } + @BeforeEach public void before() throws Exception { File warehouse = temp.toFile(); @@ -76,7 +85,11 @@ public void before() throws Exception { new ServletContextHandler(ServletContextHandler.NO_SESSIONS); servletContext.addServlet( new ServletHolder(new RESTCatalogServlet(adapterForRESTServer)), "/*"); - servletContext.setHandler(new GzipHandler()); + if (useHttpCompression()) { + CompressionHandler compressionHandler = new CompressionHandler(); + compressionHandler.putCompression(new GzipCompression()); + servletContext.insertHandler(compressionHandler); + } this.httpServer = new Server(new InetSocketAddress(InetAddress.getLoopbackAddress(), 0)); httpServer.setHandler(servletContext); diff --git a/core/src/test/java/org/apache/iceberg/rest/TestFreshnessAwareLoading.java b/core/src/test/java/org/apache/iceberg/rest/TestFreshnessAwareLoading.java index 80981df1fcb3..a4bb170d1411 100644 --- a/core/src/test/java/org/apache/iceberg/rest/TestFreshnessAwareLoading.java +++ b/core/src/test/java/org/apache/iceberg/rest/TestFreshnessAwareLoading.java @@ -67,6 +67,11 @@ import org.mockito.stubbing.Answer; public class TestFreshnessAwareLoading extends TestBaseWithRESTServer { + @Override + protected boolean useHttpCompression() { + return 
false; + } + private static final ResourcePaths RESOURCE_PATHS = ResourcePaths.forCatalogProperties( ImmutableMap.of( diff --git a/core/src/test/java/org/apache/iceberg/rest/TestRESTCatalog.java b/core/src/test/java/org/apache/iceberg/rest/TestRESTCatalog.java index f6050d133313..c1630451a33e 100644 --- a/core/src/test/java/org/apache/iceberg/rest/TestRESTCatalog.java +++ b/core/src/test/java/org/apache/iceberg/rest/TestRESTCatalog.java @@ -120,10 +120,11 @@ import org.apache.iceberg.util.Pair; import org.assertj.core.api.InstanceOfAssertFactories; import org.awaitility.Awaitility; +import org.eclipse.jetty.compression.gzip.GzipCompression; +import org.eclipse.jetty.compression.server.CompressionHandler; import org.eclipse.jetty.ee10.servlet.ServletContextHandler; import org.eclipse.jetty.ee10.servlet.ServletHolder; import org.eclipse.jetty.server.Server; -import org.eclipse.jetty.server.handler.gzip.GzipHandler; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -271,7 +272,6 @@ protected T execute( private Server httpServer; private HeaderValidatingAdapter adapterForRESTServer; - @SuppressWarnings("removal") @BeforeEach public void createCatalog() throws Exception { File warehouse = temp.toFile(); @@ -303,7 +303,9 @@ public void createCatalog() throws Exception { new ServletContextHandler(ServletContextHandler.NO_SESSIONS); servletContext.addServlet( new ServletHolder(new RESTCatalogServlet(adapterForRESTServer)), "/*"); - servletContext.setHandler(new GzipHandler()); + CompressionHandler compressionHandler = new CompressionHandler(); + compressionHandler.putCompression(new GzipCompression()); + servletContext.insertHandler(compressionHandler); this.httpServer = new Server(new InetSocketAddress(InetAddress.getLoopbackAddress(), 0)); httpServer.setHandler(servletContext); diff --git a/core/src/test/java/org/apache/iceberg/rest/TestRESTViewCatalog.java 
b/core/src/test/java/org/apache/iceberg/rest/TestRESTViewCatalog.java index f02ab2b9bbd4..24450949df5f 100644 --- a/core/src/test/java/org/apache/iceberg/rest/TestRESTViewCatalog.java +++ b/core/src/test/java/org/apache/iceberg/rest/TestRESTViewCatalog.java @@ -57,10 +57,11 @@ import org.apache.iceberg.rest.responses.LoadViewResponse; import org.apache.iceberg.view.ViewCatalogTests; import org.apache.iceberg.view.ViewMetadata; +import org.eclipse.jetty.compression.gzip.GzipCompression; +import org.eclipse.jetty.compression.server.CompressionHandler; import org.eclipse.jetty.ee10.servlet.ServletContextHandler; import org.eclipse.jetty.ee10.servlet.ServletHolder; import org.eclipse.jetty.server.Server; -import org.eclipse.jetty.server.handler.gzip.GzipHandler; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -78,7 +79,6 @@ public class TestRESTViewCatalog extends ViewCatalogTests { protected InMemoryCatalog backendCatalog; protected Server httpServer; - @SuppressWarnings("removal") @BeforeEach public void createCatalog() throws Exception { File warehouse = temp.toFile(); @@ -115,7 +115,9 @@ public T execute( new ServletContextHandler(ServletContextHandler.NO_SESSIONS); servletContext.setContextPath("/"); servletContext.addServlet(new ServletHolder(new RESTCatalogServlet(adaptor)), "/*"); - servletContext.setHandler(new GzipHandler()); + CompressionHandler compressionHandler = new CompressionHandler(); + compressionHandler.putCompression(new GzipCompression()); + servletContext.insertHandler(compressionHandler); this.httpServer = new Server(new InetSocketAddress(InetAddress.getLoopbackAddress(), 0)); httpServer.setHandler(servletContext); diff --git a/core/src/test/java/org/apache/iceberg/rest/TestRESTViewCatalogWithAssumedViewSupport.java b/core/src/test/java/org/apache/iceberg/rest/TestRESTViewCatalogWithAssumedViewSupport.java index fa999e803325..3f3d7ba77493 100644 --- 
a/core/src/test/java/org/apache/iceberg/rest/TestRESTViewCatalogWithAssumedViewSupport.java +++ b/core/src/test/java/org/apache/iceberg/rest/TestRESTViewCatalogWithAssumedViewSupport.java @@ -31,15 +31,15 @@ import org.apache.iceberg.inmemory.InMemoryCatalog; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.rest.responses.ConfigResponse; +import org.eclipse.jetty.compression.gzip.GzipCompression; +import org.eclipse.jetty.compression.server.CompressionHandler; import org.eclipse.jetty.ee10.servlet.ServletContextHandler; import org.eclipse.jetty.ee10.servlet.ServletHolder; import org.eclipse.jetty.server.Server; -import org.eclipse.jetty.server.handler.gzip.GzipHandler; import org.junit.jupiter.api.BeforeEach; public class TestRESTViewCatalogWithAssumedViewSupport extends TestRESTViewCatalog { - @SuppressWarnings("removal") @BeforeEach public void createCatalog() throws Exception { File warehouse = temp.toFile(); @@ -72,7 +72,9 @@ public T handleRequest( new ServletContextHandler(ServletContextHandler.NO_SESSIONS); servletContext.setContextPath("/"); servletContext.addServlet(new ServletHolder(new RESTCatalogServlet(adaptor)), "/*"); - servletContext.setHandler(new GzipHandler()); + CompressionHandler compressionHandler = new CompressionHandler(); + compressionHandler.putCompression(new GzipCompression()); + servletContext.insertHandler(compressionHandler); this.httpServer = new Server(new InetSocketAddress(InetAddress.getLoopbackAddress(), 0)); httpServer.setHandler(servletContext); diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index 4f04662ba60b..6387c129b120 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -202,7 +202,8 @@ flink21-test-utilsjunit = { module = "org.apache.flink:flink-test-utils-junit", guava-testlib = { module = "com.google.guava:guava-testlib", version.ref = "guava" } jakarta-el-api = { module = "jakarta.el:jakarta.el-api", version.ref = 
"jakarta-el-api" } jakarta-servlet = {module = "jakarta.servlet:jakarta.servlet-api", version.ref = "jakarta-servlet-api"} -jetty-server = { module = "org.eclipse.jetty:jetty-server", version.ref = "jetty" } +jetty-compression-server = { module = "org.eclipse.jetty.compression:jetty-compression-server", version.ref = "jetty" } +jetty-compression-gzip = { module = "org.eclipse.jetty.compression:jetty-compression-gzip", version.ref = "jetty" } jetty-servlet = { module = "org.eclipse.jetty.ee10:jetty-ee10-servlet", version.ref = "jetty" } junit-jupiter = { module = "org.junit.jupiter:junit-jupiter", version.ref = "junit" } junit-jupiter-engine = { module = "org.junit.jupiter:junit-jupiter-engine", version.ref = "junit" } diff --git a/open-api/src/testFixtures/java/org/apache/iceberg/rest/RESTCatalogServer.java b/open-api/src/testFixtures/java/org/apache/iceberg/rest/RESTCatalogServer.java index 34d8761a902b..2e4541b50b33 100644 --- a/open-api/src/testFixtures/java/org/apache/iceberg/rest/RESTCatalogServer.java +++ b/open-api/src/testFixtures/java/org/apache/iceberg/rest/RESTCatalogServer.java @@ -28,12 +28,13 @@ import org.apache.iceberg.jdbc.JdbcCatalog; import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.util.PropertyUtil; +import org.eclipse.jetty.compression.gzip.GzipCompression; +import org.eclipse.jetty.compression.server.CompressionHandler; import org.eclipse.jetty.ee10.servlet.ServletContextHandler; import org.eclipse.jetty.ee10.servlet.ServletHolder; import org.eclipse.jetty.server.Connector; import org.eclipse.jetty.server.Server; import org.eclipse.jetty.server.ServerConnector; -import org.eclipse.jetty.server.handler.gzip.GzipHandler; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -106,7 +107,6 @@ private CatalogContext initializeBackendCatalog() throws IOException { catalogProperties); } - @SuppressWarnings("removal") public void start(boolean join) throws Exception { CatalogContext catalogContext = 
initializeBackendCatalog(); @@ -116,7 +116,9 @@ public void start(boolean join) throws Exception { ServletContextHandler context = new ServletContextHandler(ServletContextHandler.NO_SESSIONS); ServletHolder servletHolder = new ServletHolder(servlet); context.addServlet(servletHolder, "/*"); - context.insertHandler(new GzipHandler()); + CompressionHandler compressionHandler = new CompressionHandler(); + compressionHandler.putCompression(new GzipCompression()); + context.insertHandler(compressionHandler); this.httpServer = new Server( From b156f3414e402aa4c0aea5eaa397f34da23e4a05 Mon Sep 17 00:00:00 2001 From: Dhruv Arya Date: Sat, 11 Apr 2026 20:06:13 -0700 Subject: [PATCH 031/197] pass dockerhub token the safely (#15940) Co-authored-by: Dhruv Arya --- .github/workflows/publish-iceberg-rest-fixture-docker.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/publish-iceberg-rest-fixture-docker.yml b/.github/workflows/publish-iceberg-rest-fixture-docker.yml index 9504ae51bcd7..8691a67e29e8 100644 --- a/.github/workflows/publish-iceberg-rest-fixture-docker.yml +++ b/.github/workflows/publish-iceberg-rest-fixture-docker.yml @@ -56,7 +56,7 @@ jobs: DOCKERHUB_USER: ${{ secrets.DOCKERHUB_USER }} DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }} run: | - docker login -u "$DOCKERHUB_USER" -p "$DOCKERHUB_TOKEN" + echo "$DOCKERHUB_TOKEN" | docker login --username "$DOCKERHUB_USER" --password-stdin - name: Set the tagged version # for tag 'apache-iceberg-1.7.1', publish image 'apache/iceberg-rest-fixture:1.7.1' if: github.event_name == 'push' && contains(github.ref, 'refs/tags/') From e4d15333279594e95869ae8636764cd5f3b51832 Mon Sep 17 00:00:00 2001 From: Eduard Tudenhoefner Date: Sun, 12 Apr 2026 07:46:31 +0200 Subject: [PATCH 032/197] API: Include size unit in avg/max value size fields (#15939) --- .../java/org/apache/iceberg/stats/FieldStatistic.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git 
a/api/src/main/java/org/apache/iceberg/stats/FieldStatistic.java b/api/src/main/java/org/apache/iceberg/stats/FieldStatistic.java index 72058e5253ab..e3fd26a927fa 100644 --- a/api/src/main/java/org/apache/iceberg/stats/FieldStatistic.java +++ b/api/src/main/java/org/apache/iceberg/stats/FieldStatistic.java @@ -29,8 +29,8 @@ public enum FieldStatistic { VALUE_COUNT(1, "value_count"), NULL_VALUE_COUNT(2, "null_value_count"), NAN_VALUE_COUNT(3, "nan_value_count"), - AVG_VALUE_SIZE(4, "avg_value_size"), - MAX_VALUE_SIZE(5, "max_value_size"), + AVG_VALUE_SIZE(4, "avg_value_size_in_bytes"), + MAX_VALUE_SIZE(5, "max_value_size_in_bytes"), LOWER_BOUND(6, "lower_bound"), UPPER_BOUND(7, "upper_bound"), EXACT_BOUNDS(8, "exact_bounds"); @@ -125,13 +125,13 @@ public static Types.StructType fieldStatsFor(Types.NestedField field, int baseFi baseFieldId + AVG_VALUE_SIZE.offset(), AVG_VALUE_SIZE.fieldName(), Types.IntegerType.get(), - "Avg value size of variable-length types (String, Binary)")); + "Avg value size in bytes of variable-length types (String, Binary)")); fields.add( optional( baseFieldId + MAX_VALUE_SIZE.offset(), MAX_VALUE_SIZE.fieldName(), Types.IntegerType.get(), - "Max value size of variable-length types (String, Binary)")); + "Max value size in bytes of variable-length types (String, Binary)")); } fields.add( From 5bfabc307a30225359083fe123966582c89879f4 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 11 Apr 2026 23:29:59 -0700 Subject: [PATCH 033/197] Build: Bump datamodel-code-generator from 0.55.0 to 0.56.0 (#15949) Bumps [datamodel-code-generator](https://github.com/koxudaxi/datamodel-code-generator) from 0.55.0 to 0.56.0. 
- [Release notes](https://github.com/koxudaxi/datamodel-code-generator/releases) - [Changelog](https://github.com/koxudaxi/datamodel-code-generator/blob/main/CHANGELOG.md) - [Commits](https://github.com/koxudaxi/datamodel-code-generator/compare/0.55.0...0.56.0) --- updated-dependencies: - dependency-name: datamodel-code-generator dependency-version: 0.56.0 dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- open-api/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/open-api/requirements.txt b/open-api/requirements.txt index 4076246c83cf..1e19f4b303a7 100644 --- a/open-api/requirements.txt +++ b/open-api/requirements.txt @@ -16,5 +16,5 @@ # under the License. openapi-spec-validator==0.8.4 -datamodel-code-generator==0.55.0 +datamodel-code-generator==0.56.0 yamllint==1.38.0 From 3a6863d2ff7493385fac5530677fb4078e366b82 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 11 Apr 2026 23:30:14 -0700 Subject: [PATCH 034/197] Build: Bump jetty from 12.1.7 to 12.1.8 (#15951) Bumps `jetty` from 12.1.7 to 12.1.8. 
Updates `org.eclipse.jetty.compression:jetty-compression-server` from 12.1.7 to 12.1.8 Updates `org.eclipse.jetty.compression:jetty-compression-gzip` from 12.1.7 to 12.1.8 Updates `org.eclipse.jetty.ee10:jetty-ee10-servlet` from 12.1.7 to 12.1.8 --- updated-dependencies: - dependency-name: org.eclipse.jetty.compression:jetty-compression-server dependency-version: 12.1.8 dependency-type: direct:production update-type: version-update:semver-patch - dependency-name: org.eclipse.jetty.compression:jetty-compression-gzip dependency-version: 12.1.8 dependency-type: direct:production update-type: version-update:semver-patch - dependency-name: org.eclipse.jetty.ee10:jetty-ee10-servlet dependency-version: 12.1.8 dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- gradle/libs.versions.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index 6387c129b120..4cd19486c831 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -65,7 +65,7 @@ jakarta-el-api = "3.0.3" jakarta-servlet-api = "6.1.0" jaxb-api = "2.3.1" jaxb-runtime = "2.3.9" -jetty = "12.1.7" +jetty = "12.1.8" junit = "5.14.3" junit-platform = "1.14.3" junit-pioneer = "2.3.0" From 72c993c7ee50b3dc4b1b056080e8f7e7d0596540 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 11 Apr 2026 23:30:31 -0700 Subject: [PATCH 035/197] Build: Bump software.amazon.awssdk:bom from 2.42.23 to 2.42.28 (#15952) Bumps software.amazon.awssdk:bom from 2.42.23 to 2.42.28. --- updated-dependencies: - dependency-name: software.amazon.awssdk:bom dependency-version: 2.42.28 dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- gradle/libs.versions.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index 4cd19486c831..776b6c88f597 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -33,7 +33,7 @@ arrow = "15.0.2" avro = "1.12.1" assertj-core = "3.27.7" awaitility = "4.3.0" -awssdk-bom = "2.42.23" +awssdk-bom = "2.42.28" azuresdk-bom = "1.3.5" awssdk-s3accessgrants = "2.4.1" bson-ver = "4.11.5" From beef60d95e94902eaaf2fac031194f6b2e038c6c Mon Sep 17 00:00:00 2001 From: Govindarajan Date: Mon, 13 Apr 2026 19:55:27 +0530 Subject: [PATCH 036/197] API: Fix TableIdentifier.toLowerCase to use Locale.ROOT for namespace levels (#15956) (#15958) --- .../java/org/apache/iceberg/catalog/TableIdentifier.java | 4 +++- .../org/apache/iceberg/catalog/TestTableIdentifier.java | 8 ++++++++ build.gradle | 1 + 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/api/src/main/java/org/apache/iceberg/catalog/TableIdentifier.java b/api/src/main/java/org/apache/iceberg/catalog/TableIdentifier.java index cbb5dc8d8fd2..9b9fbdcbb0b7 100644 --- a/api/src/main/java/org/apache/iceberg/catalog/TableIdentifier.java +++ b/api/src/main/java/org/apache/iceberg/catalog/TableIdentifier.java @@ -80,7 +80,9 @@ public String name() { public TableIdentifier toLowerCase() { String[] newLevels = - Arrays.stream(namespace().levels()).map(String::toLowerCase).toArray(String[]::new); + Arrays.stream(namespace().levels()) + .map(s -> s.toLowerCase(Locale.ROOT)) + .toArray(String[]::new); String newName = name().toLowerCase(Locale.ROOT); return TableIdentifier.of(Namespace.of(newLevels), newName); } diff --git a/api/src/test/java/org/apache/iceberg/catalog/TestTableIdentifier.java b/api/src/test/java/org/apache/iceberg/catalog/TestTableIdentifier.java index ca9569436bab..13781ccaa7f4 100644 --- 
a/api/src/test/java/org/apache/iceberg/catalog/TestTableIdentifier.java +++ b/api/src/test/java/org/apache/iceberg/catalog/TestTableIdentifier.java @@ -22,6 +22,7 @@ import static org.assertj.core.api.Assertions.assertThatThrownBy; import org.junit.jupiter.api.Test; +import org.junitpioneer.jupiter.DefaultLocale; public class TestTableIdentifier { @@ -52,6 +53,13 @@ public void testToLowerCase() { .isEqualTo(TableIdentifier.of("Catalog", "dB", "TBL").toLowerCase()); } + @Test + @DefaultLocale(language = "tr") + public void testToLowerCaseIsLocaleIndependent() { + assertThat(TableIdentifier.of("information", "db", "tbl")) + .isEqualTo(TableIdentifier.of("INFORMATION", "DB", "TBL").toLowerCase()); + } + @Test public void testInvalidTableName() { assertThatThrownBy(() -> TableIdentifier.of(Namespace.empty(), "")) diff --git a/build.gradle b/build.gradle index b0dede64eac8..0715c3f6cb9e 100644 --- a/build.gradle +++ b/build.gradle @@ -334,6 +334,7 @@ project(':iceberg-api') { testImplementation libs.avro.avro testImplementation libs.esotericsoftware.kryo testImplementation libs.awaitility + testImplementation libs.junit.pioneer } tasks.processTestResources.dependsOn rootProject.tasks.buildInfo From 0ed7f7791f5179a924f8c7772428587c1d8cbdfe Mon Sep 17 00:00:00 2001 From: genxiong7 Date: Tue, 14 Apr 2026 00:33:16 +0800 Subject: [PATCH 037/197] Flink: Fix checkArgument message for flink streaming (#15907) --- .../apache/iceberg/flink/source/ScanContext.java | 15 +++++++++------ .../iceberg/flink/source/TestScanContext.java | 9 +++++---- .../apache/iceberg/flink/source/ScanContext.java | 15 +++++++++------ .../iceberg/flink/source/TestScanContext.java | 9 +++++---- .../apache/iceberg/flink/source/ScanContext.java | 15 +++++++++------ .../iceberg/flink/source/TestScanContext.java | 9 +++++---- 6 files changed, 42 insertions(+), 30 deletions(-) diff --git a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java 
b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java index bac7c05bdfef..1c6644238c3d 100644 --- a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java +++ b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java @@ -136,23 +136,26 @@ void validate() { if (startingStrategy == StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) { Preconditions.checkArgument( startSnapshotId != null, - "Invalid starting snapshot id for SPECIFIC_START_SNAPSHOT_ID strategy: null"); + "Invalid starting snapshot id for %s strategy: null", + startingStrategy); Preconditions.checkArgument( startSnapshotTimestamp == null, - "Invalid starting snapshot timestamp for SPECIFIC_START_SNAPSHOT_ID strategy: not null"); + "Invalid starting snapshot timestamp for %s strategy: not null", + startingStrategy); } if (startingStrategy == StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) { Preconditions.checkArgument( startSnapshotTimestamp != null, - "Invalid starting snapshot timestamp for SPECIFIC_START_SNAPSHOT_TIMESTAMP strategy: null"); + "Invalid starting snapshot timestamp for %s strategy: null", + startingStrategy); Preconditions.checkArgument( startSnapshotId == null, - "Invalid starting snapshot id for SPECIFIC_START_SNAPSHOT_ID strategy: not null"); + "Invalid starting snapshot id for %s strategy: not null", + startingStrategy); } Preconditions.checkArgument( - tag == null, - String.format("Cannot scan table using ref %s configured for streaming reader", tag)); + tag == null, "Cannot scan table using ref %s configured for streaming reader", tag); Preconditions.checkArgument( snapshotId == null, "Cannot set snapshot-id option for streaming reader"); Preconditions.checkArgument( diff --git a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestScanContext.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestScanContext.java index 5dd7de545e11..09639a8a9568 
100644 --- a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestScanContext.java +++ b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestScanContext.java @@ -31,7 +31,7 @@ void testIncrementalFromSnapshotId() { .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) .build(); assertException( - context, "Invalid starting snapshot id for SPECIFIC_START_SNAPSHOT_ID strategy: null"); + context, "Invalid starting snapshot id for INCREMENTAL_FROM_SNAPSHOT_ID strategy: null"); context = ScanContext.builder() @@ -42,7 +42,7 @@ void testIncrementalFromSnapshotId() { .build(); assertException( context, - "Invalid starting snapshot timestamp for SPECIFIC_START_SNAPSHOT_ID strategy: not null"); + "Invalid starting snapshot timestamp for INCREMENTAL_FROM_SNAPSHOT_ID strategy: not null"); } @Test @@ -54,7 +54,7 @@ void testIncrementalFromSnapshotTimestamp() { .build(); assertException( context, - "Invalid starting snapshot timestamp for SPECIFIC_START_SNAPSHOT_TIMESTAMP strategy: null"); + "Invalid starting snapshot timestamp for INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP strategy: null"); context = ScanContext.builder() @@ -64,7 +64,8 @@ void testIncrementalFromSnapshotTimestamp() { .startSnapshotTimestamp(1L) .build(); assertException( - context, "Invalid starting snapshot id for SPECIFIC_START_SNAPSHOT_ID strategy: not null"); + context, + "Invalid starting snapshot id for INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP strategy: not null"); } @Test diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java index bac7c05bdfef..1c6644238c3d 100644 --- a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java @@ -136,23 +136,26 @@ void validate() { if (startingStrategy == 
StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) { Preconditions.checkArgument( startSnapshotId != null, - "Invalid starting snapshot id for SPECIFIC_START_SNAPSHOT_ID strategy: null"); + "Invalid starting snapshot id for %s strategy: null", + startingStrategy); Preconditions.checkArgument( startSnapshotTimestamp == null, - "Invalid starting snapshot timestamp for SPECIFIC_START_SNAPSHOT_ID strategy: not null"); + "Invalid starting snapshot timestamp for %s strategy: not null", + startingStrategy); } if (startingStrategy == StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) { Preconditions.checkArgument( startSnapshotTimestamp != null, - "Invalid starting snapshot timestamp for SPECIFIC_START_SNAPSHOT_TIMESTAMP strategy: null"); + "Invalid starting snapshot timestamp for %s strategy: null", + startingStrategy); Preconditions.checkArgument( startSnapshotId == null, - "Invalid starting snapshot id for SPECIFIC_START_SNAPSHOT_ID strategy: not null"); + "Invalid starting snapshot id for %s strategy: not null", + startingStrategy); } Preconditions.checkArgument( - tag == null, - String.format("Cannot scan table using ref %s configured for streaming reader", tag)); + tag == null, "Cannot scan table using ref %s configured for streaming reader", tag); Preconditions.checkArgument( snapshotId == null, "Cannot set snapshot-id option for streaming reader"); Preconditions.checkArgument( diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestScanContext.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestScanContext.java index 5dd7de545e11..09639a8a9568 100644 --- a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestScanContext.java +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestScanContext.java @@ -31,7 +31,7 @@ void testIncrementalFromSnapshotId() { .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) .build(); assertException( - context, "Invalid 
starting snapshot id for SPECIFIC_START_SNAPSHOT_ID strategy: null"); + context, "Invalid starting snapshot id for INCREMENTAL_FROM_SNAPSHOT_ID strategy: null"); context = ScanContext.builder() @@ -42,7 +42,7 @@ void testIncrementalFromSnapshotId() { .build(); assertException( context, - "Invalid starting snapshot timestamp for SPECIFIC_START_SNAPSHOT_ID strategy: not null"); + "Invalid starting snapshot timestamp for INCREMENTAL_FROM_SNAPSHOT_ID strategy: not null"); } @Test @@ -54,7 +54,7 @@ void testIncrementalFromSnapshotTimestamp() { .build(); assertException( context, - "Invalid starting snapshot timestamp for SPECIFIC_START_SNAPSHOT_TIMESTAMP strategy: null"); + "Invalid starting snapshot timestamp for INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP strategy: null"); context = ScanContext.builder() @@ -64,7 +64,8 @@ void testIncrementalFromSnapshotTimestamp() { .startSnapshotTimestamp(1L) .build(); assertException( - context, "Invalid starting snapshot id for SPECIFIC_START_SNAPSHOT_ID strategy: not null"); + context, + "Invalid starting snapshot id for INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP strategy: not null"); } @Test diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java index bac7c05bdfef..1c6644238c3d 100644 --- a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/ScanContext.java @@ -136,23 +136,26 @@ void validate() { if (startingStrategy == StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) { Preconditions.checkArgument( startSnapshotId != null, - "Invalid starting snapshot id for SPECIFIC_START_SNAPSHOT_ID strategy: null"); + "Invalid starting snapshot id for %s strategy: null", + startingStrategy); Preconditions.checkArgument( startSnapshotTimestamp == null, - "Invalid starting snapshot timestamp for SPECIFIC_START_SNAPSHOT_ID strategy: not 
null"); + "Invalid starting snapshot timestamp for %s strategy: not null", + startingStrategy); } if (startingStrategy == StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP) { Preconditions.checkArgument( startSnapshotTimestamp != null, - "Invalid starting snapshot timestamp for SPECIFIC_START_SNAPSHOT_TIMESTAMP strategy: null"); + "Invalid starting snapshot timestamp for %s strategy: null", + startingStrategy); Preconditions.checkArgument( startSnapshotId == null, - "Invalid starting snapshot id for SPECIFIC_START_SNAPSHOT_ID strategy: not null"); + "Invalid starting snapshot id for %s strategy: not null", + startingStrategy); } Preconditions.checkArgument( - tag == null, - String.format("Cannot scan table using ref %s configured for streaming reader", tag)); + tag == null, "Cannot scan table using ref %s configured for streaming reader", tag); Preconditions.checkArgument( snapshotId == null, "Cannot set snapshot-id option for streaming reader"); Preconditions.checkArgument( diff --git a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestScanContext.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestScanContext.java index 5dd7de545e11..09639a8a9568 100644 --- a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestScanContext.java +++ b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestScanContext.java @@ -31,7 +31,7 @@ void testIncrementalFromSnapshotId() { .startingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_SNAPSHOT_ID) .build(); assertException( - context, "Invalid starting snapshot id for SPECIFIC_START_SNAPSHOT_ID strategy: null"); + context, "Invalid starting snapshot id for INCREMENTAL_FROM_SNAPSHOT_ID strategy: null"); context = ScanContext.builder() @@ -42,7 +42,7 @@ void testIncrementalFromSnapshotId() { .build(); assertException( context, - "Invalid starting snapshot timestamp for SPECIFIC_START_SNAPSHOT_ID strategy: not null"); + "Invalid starting snapshot 
timestamp for INCREMENTAL_FROM_SNAPSHOT_ID strategy: not null"); } @Test @@ -54,7 +54,7 @@ void testIncrementalFromSnapshotTimestamp() { .build(); assertException( context, - "Invalid starting snapshot timestamp for SPECIFIC_START_SNAPSHOT_TIMESTAMP strategy: null"); + "Invalid starting snapshot timestamp for INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP strategy: null"); context = ScanContext.builder() @@ -64,7 +64,8 @@ void testIncrementalFromSnapshotTimestamp() { .startSnapshotTimestamp(1L) .build(); assertException( - context, "Invalid starting snapshot id for SPECIFIC_START_SNAPSHOT_ID strategy: not null"); + context, + "Invalid starting snapshot id for INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP strategy: not null"); } @Test From 56092bc8cbfa19380cdd4513fc707c6d97ef8e36 Mon Sep 17 00:00:00 2001 From: Neelesh Salian Date: Mon, 13 Apr 2026 10:44:37 -0700 Subject: [PATCH 038/197] Parquet: Fix NPE in ParquetAvroWriter when schema contains variant type (#15934) * Fix NPE in ParquetAvroWriter * Update error message check in test * PR comments --- .../iceberg/parquet/ParquetAvroWriter.java | 5 ++++ .../apache/iceberg/parquet/TestParquet.java | 24 +++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetAvroWriter.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetAvroWriter.java index 114e5fe27545..75b3a6604084 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetAvroWriter.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetAvroWriter.java @@ -103,6 +103,11 @@ public ParquetValueWriter map( ParquetValueWriters.option(valueType, valueD, valueWriter)); } + @Override + public ParquetValueWriter variant(GroupType variant) { + throw new UnsupportedOperationException("Avro writer does not support variant types"); + } + @Override public ParquetValueWriter primitive(PrimitiveType primitive) { ColumnDescriptor desc = type.getColumnDescription(currentPath()); diff --git 
a/parquet/src/test/java/org/apache/iceberg/parquet/TestParquet.java b/parquet/src/test/java/org/apache/iceberg/parquet/TestParquet.java index 58850ec7c9f4..5f1e0c83cc0f 100644 --- a/parquet/src/test/java/org/apache/iceberg/parquet/TestParquet.java +++ b/parquet/src/test/java/org/apache/iceberg/parquet/TestParquet.java @@ -30,6 +30,7 @@ import static org.apache.iceberg.types.Types.NestedField.optional; import static org.apache.iceberg.types.Types.NestedField.required; import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; import java.io.File; import java.io.IOException; @@ -57,6 +58,7 @@ import org.apache.iceberg.types.Types; import org.apache.iceberg.types.Types.IntegerType; import org.apache.iceberg.util.Pair; +import org.apache.iceberg.variants.Variant; import org.apache.parquet.avro.AvroParquetWriter; import org.apache.parquet.column.statistics.Statistics; import org.apache.parquet.hadoop.ParquetFileReader; @@ -64,7 +66,9 @@ import org.apache.parquet.hadoop.metadata.BlockMetaData; import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData; import org.apache.parquet.io.LocalOutputFile; +import org.apache.parquet.schema.LogicalTypeAnnotation; import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; @@ -314,6 +318,26 @@ public void testFooterMetricsWithNameMappingForFileWithoutIds() throws IOExcepti } } + @Test + public void testAvroWriterRejectsVariantType() { + MessageType schema = + org.apache.parquet.schema.Types.buildMessage() + .optional(PrimitiveTypeName.INT32) + .named("id") + .optionalGroup() + .as(LogicalTypeAnnotation.variantType(Variant.VARIANT_SPEC_VERSION)) + .required(PrimitiveTypeName.BINARY) + .named("metadata") + .required(PrimitiveTypeName.BINARY) + .named("value") + .named("v") + .named("table"); + + assertThatThrownBy(() -> 
ParquetAvroWriter.buildWriter(schema)) + .isInstanceOf(UnsupportedOperationException.class) + .hasMessage("Avro writer does not support variant types"); + } + private Pair generateFile( Function> createWriterFunc, int desiredRecordCount, From 3f18cd4f69a2110327161972646da226fbbecc13 Mon Sep 17 00:00:00 2001 From: kumarpritam863 <148938310+kumarpritam863@users.noreply.github.com> Date: Tue, 14 Apr 2026 02:32:10 +0530 Subject: [PATCH 039/197] Kafka Connect: Fix source offset tracking when SMTs modify the record topic (#15880) Fix source offset tracking when SMTs modify the record topic --------- Co-authored-by: Pritam Kumar Mishra --- .../connect/TestIntegrationDynamicTable.java | 53 +++++++++++++++ .../iceberg/connect/data/SinkWriter.java | 7 +- .../iceberg/connect/data/TestSinkWriter.java | 68 +++++++++++++++++++ 3 files changed, 126 insertions(+), 2 deletions(-) diff --git a/kafka-connect/kafka-connect-runtime/src/integration/java/org/apache/iceberg/connect/TestIntegrationDynamicTable.java b/kafka-connect/kafka-connect-runtime/src/integration/java/org/apache/iceberg/connect/TestIntegrationDynamicTable.java index 65bbcde9dfed..1d3d71a54152 100644 --- a/kafka-connect/kafka-connect-runtime/src/integration/java/org/apache/iceberg/connect/TestIntegrationDynamicTable.java +++ b/kafka-connect/kafka-connect-runtime/src/integration/java/org/apache/iceberg/connect/TestIntegrationDynamicTable.java @@ -20,11 +20,14 @@ import static org.assertj.core.api.Assertions.assertThat; +import java.time.Duration; import java.time.Instant; import java.util.List; import org.apache.iceberg.DataFile; import org.apache.iceberg.catalog.TableIdentifier; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.awaitility.Awaitility; +import org.junit.jupiter.api.Test; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.NullSource; import org.junit.jupiter.params.provider.ValueSource; @@ -59,6 +62,56 @@ public void 
testIcebergSink(String branch) { assertSnapshotProps(TABLE_IDENTIFIER2, branch); } + /** + * Verifies dynamic routing works when topic-rewriting SMTs (e.g. RegexRouter) change + * record.topic(). Before the fix, SinkWriter tracked offsets under the rewritten topic, causing a + * mismatch with context.assignment() and preventing proper offset commits. + */ + @Test + public void testDynamicRouteWithTopicRewritingSMT() { + String smtTable = "smttbl"; + TableIdentifier smtTableId = TableIdentifier.of(TEST_DB, smtTable); + catalog().createTable(smtTableId, TestEvent.TEST_SCHEMA); + + try { + // RegexRouter rewrites topic to "test.smttbl", then InsertField copies + // record.topic() (now "test.smttbl") into field "srcTopic", and dynamic + // routing uses "srcTopic" to pick the destination table. + KafkaConnectUtils.Config connectorConfig = + createCommonConfig(false) + .config("iceberg.tables.dynamic-enabled", true) + .config("iceberg.tables.route-field", "srcTopic") + .config("transforms", "rewriteTopic,insertTopic") + .config( + "transforms.rewriteTopic.type", "org.apache.kafka.connect.transforms.RegexRouter") + .config("transforms.rewriteTopic.regex", ".*") + .config("transforms.rewriteTopic.replacement", TEST_DB + "." 
+ smtTable) + .config( + "transforms.insertTopic.type", + "org.apache.kafka.connect.transforms.InsertField$Value") + .config("transforms.insertTopic.topic.field", "srcTopic"); + + context().connectorCatalogProperties().forEach(connectorConfig::config); + context().startConnector(connectorConfig); + + send(testTopic(), new TestEvent(1, "type1", Instant.now(), "hello"), false); + send(testTopic(), new TestEvent(2, "type2", Instant.now(), "world"), false); + flush(); + + Awaitility.await() + .atMost(Duration.ofSeconds(30)) + .pollInterval(Duration.ofSeconds(1)) + .untilAsserted(() -> assertSnapshotAdded(List.of(smtTableId))); + + List files = dataFiles(smtTableId, null); + assertThat(files).hasSizeBetween(1, 2); + assertThat(files.stream().mapToLong(DataFile::recordCount).sum()).isEqualTo(2); + assertSnapshotProps(smtTableId, null); + } finally { + catalog().dropTable(smtTableId); + } + } + @Override protected KafkaConnectUtils.Config createConfig(boolean useSchema) { return createCommonConfig(useSchema) diff --git a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/SinkWriter.java b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/SinkWriter.java index f81155e13777..48a01881935b 100644 --- a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/SinkWriter.java +++ b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/SinkWriter.java @@ -75,9 +75,12 @@ private void save(SinkRecord record) { record.timestamp() == null ? null : OffsetDateTime.ofInstant(Instant.ofEpochMilli(record.timestamp()), ZoneOffset.UTC); + // use the original topic and partition to track offsets, as SMTs may have changed + // record.topic() and record.kafkaPartition() (e.g. RegexRouter). The framework's + // context.assignment() and consumer offset management use the original values. 
sourceOffsets.put( - new TopicPartition(record.topic(), record.kafkaPartition()), - new Offset(record.kafkaOffset() + 1, timestamp)); + new TopicPartition(record.originalTopic(), record.originalKafkaPartition()), + new Offset(record.originalKafkaOffset() + 1, timestamp)); if (config.dynamicTablesEnabled()) { routeRecordDynamically(record); diff --git a/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/TestSinkWriter.java b/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/TestSinkWriter.java index 6baf72117d04..09f7a373d5f2 100644 --- a/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/TestSinkWriter.java +++ b/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/TestSinkWriter.java @@ -153,6 +153,74 @@ public void testDynamicRoute() { assertThat(writerResult.tableReference().identifier()).isEqualTo(TABLE_IDENTIFIER); } + @Test + public void testOffsetTrackedByOriginalTopicPartition() { + IcebergSinkConfig config = mock(IcebergSinkConfig.class); + when(config.tableConfig(any())).thenReturn(mock(TableSinkConfig.class)); + when(config.tables()).thenReturn(ImmutableList.of(TABLE_IDENTIFIER.toString())); + when(config.dynamicTablesEnabled()).thenReturn(true); + when(config.tablesRouteField()).thenReturn(ROUTE_FIELD); + + IcebergWriterResult writeResult = + new IcebergWriterResult( + TableIdentifier.parse(TABLE_NAME), + ImmutableList.of(mock(DataFile.class)), + ImmutableList.of(), + Types.StructType.of()); + IcebergWriter writer = mock(IcebergWriter.class); + when(writer.complete()).thenReturn(ImmutableList.of(writeResult)); + + IcebergWriterFactory writerFactory = mock(IcebergWriterFactory.class); + when(writerFactory.createWriter(any(), any(), anyBoolean())).thenReturn(writer); + + SinkWriter sinkWriter = new SinkWriter(catalog, config); + + // simulate a record that has been transformed by RegexRouter (topic changed) + String originalTopic = "orders"; + int originalPartition 
= 0; + long originalOffset = 42L; + Instant now = Instant.now().truncatedTo(ChronoUnit.MILLIS); + + SinkRecord original = + new SinkRecord( + originalTopic, + originalPartition, + null, + "key", + null, + ImmutableMap.of(ROUTE_FIELD, TABLE_IDENTIFIER.toString()), + originalOffset, + now.toEpochMilli(), + TimestampType.LOG_APPEND_TIME); + + // RegexRouter changes the topic via newRecord + String transformedTopic = "tmp.dynamic_orders"; + SinkRecord transformed = + original.newRecord( + transformedTopic, + originalPartition, + original.keySchema(), + original.key(), + original.valueSchema(), + original.value(), + original.timestamp()); + + sinkWriter.save(ImmutableList.of(transformed)); + SinkWriterResult result = sinkWriter.completeWrite(); + + // offsets must be keyed by the ORIGINAL topic, not the transformed one + Offset offset = + result.sourceOffsets().get(new TopicPartition(originalTopic, originalPartition)); + assertThat(offset).isNotNull(); + assertThat(offset.offset()).isEqualTo(originalOffset + 1); + assertThat(offset.timestamp()).isEqualTo(now.atOffset(ZoneOffset.UTC)); + + // the transformed topic key should NOT be present + Offset wrongOffset = + result.sourceOffsets().get(new TopicPartition(transformedTopic, originalPartition)); + assertThat(wrongOffset).isNull(); + } + @Test public void testDynamicNoRoute() { IcebergSinkConfig config = mock(IcebergSinkConfig.class); From 2a6f127842a8f21e8e16efed45fdd5d538af29ae Mon Sep 17 00:00:00 2001 From: Yuya Ebihara Date: Tue, 14 Apr 2026 07:11:36 +0900 Subject: [PATCH 040/197] Core: Expose MetricsConfig.from method with 3-parameter version (#15819) --- core/src/main/java/org/apache/iceberg/MetricsConfig.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/java/org/apache/iceberg/MetricsConfig.java b/core/src/main/java/org/apache/iceberg/MetricsConfig.java index 593dbc570b8a..2b55bcbeab22 100644 --- a/core/src/main/java/org/apache/iceberg/MetricsConfig.java +++ 
b/core/src/main/java/org/apache/iceberg/MetricsConfig.java @@ -223,7 +223,7 @@ public Set map( * @param order sort order columns, will be promoted to truncate(16) * @return metrics configuration */ - private static MetricsConfig from(Map props, Schema schema, SortOrder order) { + public static MetricsConfig from(Map props, Schema schema, SortOrder order) { int maxInferredDefaultColumns = maxInferredColumnDefaults(props); Map columnModes = Maps.newHashMap(); From 5d1840caed05ec8914fb8866378c36616d73debb Mon Sep 17 00:00:00 2001 From: XL Liang Date: Tue, 14 Apr 2026 09:35:39 +0800 Subject: [PATCH 041/197] Docs: Add Sail to integration and vendor (#15920) * Docs: Add Sail to integration and vendor * update link --- site/docs/vendors.md | 4 ++++ site/mkdocs-dev.yml | 1 + site/nav.yml | 1 + 3 files changed, 6 insertions(+) diff --git a/site/docs/vendors.md b/site/docs/vendors.md index 67a98a3c23a2..4260553e96cd 100644 --- a/site/docs/vendors.md +++ b/site/docs/vendors.md @@ -157,6 +157,10 @@ Redpanda is both a cloud-native and self-hosted streaming platform whose [Iceber [Ryft](https://ryft.io/) is a fully automated Iceberg management platform. Ryft helps data teams create an open, automated and cost-effective Iceberg lakehouse, by maintaining and optimizing Iceberg tables in real time, based on actual usage patterns. The Ryft engine runs compaction intelligently, adapting to different use cases like streaming, batch jobs, CDC, and more. Ryft also automates compliance, disaster recovery and data lifecycle management for Iceberg tables, to ensure your lakehouse stays secure and compliant. It directly integrates with your existing catalog, storage and query engines, allowing for a very simple deployment. +### [Sail](https://lakesail.com/) + +[Sail](https://github.com/lakehq/sail) is an open-source multimodal distributed compute framework, built in Rust, unifying batch, streaming, and AI workloads. 
For seamless adoption, Sail offers a drop-in replacement for the Spark SQL and DataFrame APIs in both single-host and distributed settings. Learn more about using Sail with Iceberg in the [Sail Iceberg guide](https://docs.lakesail.com/sail/latest/guide/sources/iceberg). + ### [SingleStore](https://singlestore.com/) SingleStore is a high‑performance, scalable, distributed SQL platform that makes real‑time analytics and transactional processing available at scale. Its native Apache Iceberg integration removes costly ETL steps and powers intelligent, millisecond‑response applications. diff --git a/site/mkdocs-dev.yml b/site/mkdocs-dev.yml index eb5b34c0b274..b4c68aacc3fc 100644 --- a/site/mkdocs-dev.yml +++ b/site/mkdocs-dev.yml @@ -71,6 +71,7 @@ nav: - Redpanda: https://docs.redpanda.com/current/manage/iceberg/about-iceberg-topics - RisingWave: integrations/risingwave.md - Ryft: https://docs.ryft.io/platform + - Sail: https://docs.lakesail.com/sail/latest/guide/sources/iceberg - Snowflake: https://docs.snowflake.com/en/user-guide/tables-iceberg - Starburst: https://docs.starburst.io/latest/connector/iceberg.html - Starrocks: https://docs.starrocks.io/en-us/latest/data_source/catalog/iceberg_catalog diff --git a/site/nav.yml b/site/nav.yml index de1770c7ad78..dd2b0dce474f 100644 --- a/site/nav.yml +++ b/site/nav.yml @@ -85,6 +85,7 @@ nav: - Redpanda: https://docs.redpanda.com/current/manage/iceberg/about-iceberg-topics - RisingWave: integrations/risingwave.md - Ryft: https://docs.ryft.io/platform + - Sail: https://docs.lakesail.com/sail/latest/guide/sources/iceberg - Snowflake: https://docs.snowflake.com/en/user-guide/tables-iceberg - Starburst: https://docs.starburst.io/latest/connector/iceberg.html - Starrocks: https://docs.starrocks.io/en-us/latest/data_source/catalog/iceberg_catalog From 8808f7761637415f933b97c386f5646f3356896f Mon Sep 17 00:00:00 2001 From: Marius Grama Date: Tue, 14 Apr 2026 09:16:23 +0200 Subject: [PATCH 042/197] ADLS: Throw NotFoundException 
for inexistent input file (#15806) Signal to the TableOperations that there is no retry needed for files which do not exist. --- .../iceberg/azure/adlsv2/TestADLSFileIO.java | 20 ++++++++++++++++ .../azure/adlsv2/TestADLSInputStream.java | 22 ++++++++++++----- .../iceberg/azure/adlsv2/ADLSInputFile.java | 2 +- .../iceberg/azure/adlsv2/ADLSInputStream.java | 24 +++++++++++++++++++ .../azure/adlsv2/TestADLSInputStream.java | 8 ++++++- 5 files changed, 68 insertions(+), 8 deletions(-) diff --git a/azure/src/integration/java/org/apache/iceberg/azure/adlsv2/TestADLSFileIO.java b/azure/src/integration/java/org/apache/iceberg/azure/adlsv2/TestADLSFileIO.java index 5e343782ab1c..621813a25574 100644 --- a/azure/src/integration/java/org/apache/iceberg/azure/adlsv2/TestADLSFileIO.java +++ b/azure/src/integration/java/org/apache/iceberg/azure/adlsv2/TestADLSFileIO.java @@ -20,6 +20,7 @@ import static org.apache.iceberg.azure.AzureProperties.ADLS_SAS_TOKEN_PREFIX; import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; import static org.mockito.ArgumentMatchers.anyBoolean; import static org.mockito.ArgumentMatchers.eq; import static org.mockito.Mockito.any; @@ -31,6 +32,7 @@ import com.azure.core.http.rest.PagedIterable; import com.azure.core.http.rest.Response; +import com.azure.storage.blob.models.BlobStorageException; import com.azure.storage.file.datalake.DataLakeFileClient; import com.azure.storage.file.datalake.DataLakeFileSystemClient; import com.azure.storage.file.datalake.DataLakeFileSystemClientBuilder; @@ -42,6 +44,7 @@ import java.util.Iterator; import org.apache.iceberg.TestHelpers; import org.apache.iceberg.azure.AzureProperties; +import org.apache.iceberg.exceptions.NotFoundException; import org.apache.iceberg.io.FileIO; import org.apache.iceberg.io.FileInfo; import org.apache.iceberg.io.InputFile; @@ -78,6 +81,23 @@ public void testFileOperations() throws IOException { 
assertThat(fileClient.exists()).isFalse(); } + @Test + public void readMissingLocation() { + String path = "path/to/file"; + String location = AZURITE_CONTAINER.location(path); + ADLSFileIO io = createFileIO(); + DataLakeFileClient fileClient = AZURITE_CONTAINER.fileClient(path); + assertThat(fileClient.exists()).isFalse(); + + InputFile inputFile = io.newInputFile(location); + + assertThatThrownBy(inputFile::newStream) + .isInstanceOf(NotFoundException.class) + .hasCauseInstanceOf(BlobStorageException.class) + .hasMessage( + "Location does not exist: abfs://container@account.dfs.core.windows.net/path/to/file"); + } + @Test public void testBulkDeleteFiles() { String path1 = "path/to/file1"; diff --git a/azure/src/integration/java/org/apache/iceberg/azure/adlsv2/TestADLSInputStream.java b/azure/src/integration/java/org/apache/iceberg/azure/adlsv2/TestADLSInputStream.java index 8464e57516ce..1edf48eaec35 100644 --- a/azure/src/integration/java/org/apache/iceberg/azure/adlsv2/TestADLSInputStream.java +++ b/azure/src/integration/java/org/apache/iceberg/azure/adlsv2/TestADLSInputStream.java @@ -39,6 +39,10 @@ public class TestADLSInputStream extends AzuriteTestBase { private final Random random = new Random(1); private final AzureProperties azureProperties = new AzureProperties(); + private String location() { + return AZURITE_CONTAINER.location(FILE_PATH); + } + private DataLakeFileClient fileClient() { return AZURITE_CONTAINER.fileClient(FILE_PATH); } @@ -55,7 +59,8 @@ public void testRead() throws Exception { setupData(data); try (SeekableInputStream in = - new ADLSInputStream(fileClient(), null, azureProperties, MetricsContext.nullMetrics())) { + new ADLSInputStream( + location(), fileClient(), null, azureProperties, MetricsContext.nullMetrics())) { int readSize = 1024; readAndCheck(in, in.getPos(), readSize, data, false); @@ -90,7 +95,8 @@ public void testReadSingle() throws Exception { setupData(data); try (SeekableInputStream in = - new 
ADLSInputStream(fileClient(), null, azureProperties, MetricsContext.nullMetrics())) { + new ADLSInputStream( + location(), fileClient(), null, azureProperties, MetricsContext.nullMetrics())) { assertThat(in.read()).isEqualTo(i0); assertThat(in.read()).isEqualTo(i1); } @@ -131,7 +137,8 @@ public void testRangeRead() throws Exception { setupData(expected); try (RangeReadable in = - new ADLSInputStream(fileClient(), null, azureProperties, MetricsContext.nullMetrics())) { + new ADLSInputStream( + location(), fileClient(), null, azureProperties, MetricsContext.nullMetrics())) { // first 1k position = 0; offset = 0; @@ -164,7 +171,8 @@ private void readAndCheckRanges( public void testClose() throws Exception { setupData(randomData(2)); SeekableInputStream closed = - new ADLSInputStream(fileClient(), null, azureProperties, MetricsContext.nullMetrics()); + new ADLSInputStream( + location(), fileClient(), null, azureProperties, MetricsContext.nullMetrics()); closed.close(); assertThatThrownBy(() -> closed.seek(0)) .isInstanceOf(IllegalStateException.class) @@ -178,7 +186,8 @@ public void testSeek() throws Exception { setupData(data); try (SeekableInputStream in = - new ADLSInputStream(fileClient(), null, azureProperties, MetricsContext.nullMetrics())) { + new ADLSInputStream( + location(), fileClient(), null, azureProperties, MetricsContext.nullMetrics())) { in.seek(data.length / 2); byte[] actual = new byte[data.length / 2]; @@ -193,7 +202,8 @@ public void testSeek() throws Exception { public void testSeekNegative() throws Exception { setupData(randomData(2)); SeekableInputStream in = - new ADLSInputStream(fileClient(), null, azureProperties, MetricsContext.nullMetrics()); + new ADLSInputStream( + location(), fileClient(), null, azureProperties, MetricsContext.nullMetrics()); assertThatThrownBy(() -> in.seek(-3)) .isInstanceOf(IllegalArgumentException.class) .hasMessage("Cannot seek: position -3 is negative"); diff --git 
a/azure/src/main/java/org/apache/iceberg/azure/adlsv2/ADLSInputFile.java b/azure/src/main/java/org/apache/iceberg/azure/adlsv2/ADLSInputFile.java index 95e57bf04d32..5b07534f1368 100644 --- a/azure/src/main/java/org/apache/iceberg/azure/adlsv2/ADLSInputFile.java +++ b/azure/src/main/java/org/apache/iceberg/azure/adlsv2/ADLSInputFile.java @@ -55,6 +55,6 @@ public long getLength() { @Override public SeekableInputStream newStream() { - return new ADLSInputStream(fileClient(), fileSize, azureProperties(), metrics()); + return new ADLSInputStream(location(), fileClient(), fileSize, azureProperties(), metrics()); } } diff --git a/azure/src/main/java/org/apache/iceberg/azure/adlsv2/ADLSInputStream.java b/azure/src/main/java/org/apache/iceberg/azure/adlsv2/ADLSInputStream.java index 55ecade4486a..b1a2d3abfa32 100644 --- a/azure/src/main/java/org/apache/iceberg/azure/adlsv2/ADLSInputStream.java +++ b/azure/src/main/java/org/apache/iceberg/azure/adlsv2/ADLSInputStream.java @@ -18,14 +18,18 @@ */ package org.apache.iceberg.azure.adlsv2; +import com.azure.storage.blob.models.BlobErrorCode; +import com.azure.storage.blob.models.BlobStorageException; import com.azure.storage.file.datalake.DataLakeFileClient; import com.azure.storage.file.datalake.models.DataLakeFileOpenInputStreamResult; +import com.azure.storage.file.datalake.models.DataLakeStorageException; import com.azure.storage.file.datalake.models.FileRange; import com.azure.storage.file.datalake.options.DataLakeFileInputStreamOptions; import java.io.IOException; import java.io.InputStream; import java.util.Arrays; import org.apache.iceberg.azure.AzureProperties; +import org.apache.iceberg.exceptions.NotFoundException; import org.apache.iceberg.io.FileIOMetricsContext; import org.apache.iceberg.io.IOUtil; import org.apache.iceberg.io.RangeReadable; @@ -46,6 +50,7 @@ class ADLSInputStream extends SeekableInputStream implements RangeReadable { private static final int SKIP_SIZE = 1024 * 1024; private final 
StackTraceElement[] createStack; + private final String location; private final DataLakeFileClient fileClient; private Long fileSize; private final AzureProperties azureProperties; @@ -59,10 +64,12 @@ class ADLSInputStream extends SeekableInputStream implements RangeReadable { private final Counter readOperations; ADLSInputStream( + String location, DataLakeFileClient fileClient, Long fileSize, AzureProperties azureProperties, MetricsContext metrics) { + this.location = location; this.fileClient = fileClient; this.fileSize = fileSize; this.azureProperties = azureProperties; @@ -184,6 +191,7 @@ private DataLakeFileOpenInputStreamResult openRange(FileRange range) { try { return fileClient.openInputStream(getInputOptions(range)); } catch (RuntimeException e) { + throwNotFoundIfNotPresent(e, location); LOG.error( "Failed to open input stream for file {}, range {}", fileClient.getFilePath(), range, e); throw e; @@ -209,4 +217,20 @@ protected void finalize() throws Throwable { LOG.warn("Unclosed input stream created by:\n\t{}", trace); } } + + private static void throwNotFoundIfNotPresent(Throwable throwable, String location) { + if (isFileNotFoundException(throwable)) { + throw new NotFoundException(throwable, "Location does not exist: %s", location); + } + } + + private static boolean isFileNotFoundException(Throwable exception) { + if (exception instanceof BlobStorageException blobStorageException) { + return BlobErrorCode.BLOB_NOT_FOUND.equals(blobStorageException.getErrorCode()); + } + if (exception instanceof DataLakeStorageException dataLakeStorageException) { + return "PathNotFound".equals(dataLakeStorageException.getErrorCode()); + } + return false; + } } diff --git a/azure/src/test/java/org/apache/iceberg/azure/adlsv2/TestADLSInputStream.java b/azure/src/test/java/org/apache/iceberg/azure/adlsv2/TestADLSInputStream.java index 058bf0372b05..e98061846a88 100644 --- a/azure/src/test/java/org/apache/iceberg/azure/adlsv2/TestADLSInputStream.java +++ 
b/azure/src/test/java/org/apache/iceberg/azure/adlsv2/TestADLSInputStream.java @@ -46,7 +46,13 @@ void before() { InternalDataLakeFileOpenInputStreamResult openInputStreamResult = new InternalDataLakeFileOpenInputStreamResult(inputStream, mock()); when(fileClient.openInputStream(any())).thenReturn(openInputStreamResult); - adlsInputStream = new ADLSInputStream(fileClient, 0L, mock(), mock()); + adlsInputStream = + new ADLSInputStream( + "abfs://container@account.dfs.core.windows.net/path/to/file", + fileClient, + 0L, + mock(), + mock()); } @Test From 87c743463b6311f2412e1addf19cf204c1b79e3d Mon Sep 17 00:00:00 2001 From: Yuya Ebihara Date: Tue, 14 Apr 2026 16:18:09 +0900 Subject: [PATCH 043/197] Build: Ban toLowerCase/toUpperCase without locale (#15960) --- .baseline/checkstyle/checkstyle.xml | 5 +++++ baseline.gradle | 2 ++ 2 files changed, 7 insertions(+) diff --git a/.baseline/checkstyle/checkstyle.xml b/.baseline/checkstyle/checkstyle.xml index 943d299b338f..f94848450a9b 100644 --- a/.baseline/checkstyle/checkstyle.xml +++ b/.baseline/checkstyle/checkstyle.xml @@ -450,6 +450,11 @@ + + + + + diff --git a/baseline.gradle b/baseline.gradle index 4efbd89eda02..6b180effbbbf 100644 --- a/baseline.gradle +++ b/baseline.gradle @@ -157,6 +157,8 @@ subprojects { '-Xep:Slf4jThrowable:ERROR', // Added because it errors out compile, but we need to figure out if we want it '-Xep:StrictUnusedVariable:OFF', + // This rule doesn't enforce the use of method references. That's handled by checkstyle. + '-Xep:StringCaseLocaleUsage:ERROR', // Enforce safe string splitting '-Xep:StringSplitter:ERROR', '-Xep:TypeParameterShadowing:OFF', From ffb095db0c5f0bf9962a89a13f0a1953c740d90a Mon Sep 17 00:00:00 2001 From: Eduard Tudenhoefner Date: Tue, 14 Apr 2026 22:05:11 +0200 Subject: [PATCH 044/197] API, Core: Move stats classes to core as package-private (#15971) This moves all stats related code into iceberg-core to avoid any potential API breakages before the spec has been finalized. 
It also moves all classes under the org.apache.iceberg package for usability/visibility in other classes v4-related classes. --- .../iceberg/{stats => }/BaseContentStats.java | 7 +++--- .../iceberg/{stats => }/BaseFieldStats.java | 4 ++-- .../org/apache/iceberg}/ContentStats.java | 5 ++-- .../org/apache/iceberg}/FieldStatistic.java | 4 ++-- .../java/org/apache/iceberg}/FieldStats.java | 5 ++-- .../java/org/apache/iceberg/MetricsUtil.java | 5 +--- .../java/org/apache/iceberg}/StatsUtil.java | 5 ++-- .../java/org/apache/iceberg/TrackedFile.java | 1 - .../iceberg/{stats => }/TestContentStats.java | 23 +++++++++---------- .../iceberg/{stats => }/TestFieldStats.java | 21 ++++++++--------- .../java/org/apache/iceberg/TestMetrics.java | 2 -- .../org/apache/iceberg}/TestStatsUtil.java | 19 ++++++++------- .../org/apache/iceberg/TestTrackedFile.java | 1 - 13 files changed, 44 insertions(+), 58 deletions(-) rename core/src/main/java/org/apache/iceberg/{stats => }/BaseContentStats.java (98%) rename core/src/main/java/org/apache/iceberg/{stats => }/BaseFieldStats.java (98%) rename {api/src/main/java/org/apache/iceberg/stats => core/src/main/java/org/apache/iceberg}/ContentStats.java (92%) rename {api/src/main/java/org/apache/iceberg/stats => core/src/main/java/org/apache/iceberg}/FieldStatistic.java (98%) rename {api/src/main/java/org/apache/iceberg/stats => core/src/main/java/org/apache/iceberg}/FieldStats.java (92%) rename {api/src/main/java/org/apache/iceberg/stats => core/src/main/java/org/apache/iceberg}/StatsUtil.java (98%) rename core/src/test/java/org/apache/iceberg/{stats => }/TestContentStats.java (95%) rename core/src/test/java/org/apache/iceberg/{stats => }/TestFieldStats.java (94%) rename {api/src/test/java/org/apache/iceberg/stats => core/src/test/java/org/apache/iceberg}/TestStatsUtil.java (95%) diff --git a/core/src/main/java/org/apache/iceberg/stats/BaseContentStats.java b/core/src/main/java/org/apache/iceberg/BaseContentStats.java similarity index 98% rename 
from core/src/main/java/org/apache/iceberg/stats/BaseContentStats.java rename to core/src/main/java/org/apache/iceberg/BaseContentStats.java index be56c411b6a7..45900b03e299 100644 --- a/core/src/main/java/org/apache/iceberg/stats/BaseContentStats.java +++ b/core/src/main/java/org/apache/iceberg/BaseContentStats.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package org.apache.iceberg.stats; +package org.apache.iceberg; import java.io.Serializable; import java.util.List; @@ -24,7 +24,6 @@ import java.util.Objects; import java.util.Set; import java.util.stream.Collectors; -import org.apache.iceberg.Schema; import org.apache.iceberg.data.GenericRecord; import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; @@ -33,14 +32,14 @@ import org.apache.iceberg.types.Type; import org.apache.iceberg.types.Types; -public class BaseContentStats implements ContentStats, Serializable { +class BaseContentStats implements ContentStats, Serializable { private final List> fieldStats; private final Map> fieldStatsById; private final Types.StructType statsStruct; /** Used by Avro reflection to instantiate this class when reading manifest files. 
*/ - public BaseContentStats(Types.StructType projection) { + BaseContentStats(Types.StructType projection) { this.statsStruct = projection; this.fieldStats = Lists.newArrayListWithCapacity(projection.fields().size()); this.fieldStatsById = Maps.newLinkedHashMapWithExpectedSize(projection.fields().size()); diff --git a/core/src/main/java/org/apache/iceberg/stats/BaseFieldStats.java b/core/src/main/java/org/apache/iceberg/BaseFieldStats.java similarity index 98% rename from core/src/main/java/org/apache/iceberg/stats/BaseFieldStats.java rename to core/src/main/java/org/apache/iceberg/BaseFieldStats.java index 470303179bf5..11da570b8faa 100644 --- a/core/src/main/java/org/apache/iceberg/stats/BaseFieldStats.java +++ b/core/src/main/java/org/apache/iceberg/BaseFieldStats.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package org.apache.iceberg.stats; +package org.apache.iceberg; import java.nio.ByteBuffer; import java.nio.CharBuffer; @@ -28,7 +28,7 @@ import org.apache.iceberg.types.Types; import org.apache.iceberg.util.ByteBuffers; -public class BaseFieldStats extends SupportsIndexProjection implements FieldStats { +class BaseFieldStats extends SupportsIndexProjection implements FieldStats { private static final int[] IDENTITY_MAPPING = identityMapping(); private final int fieldId; private final Type type; diff --git a/api/src/main/java/org/apache/iceberg/stats/ContentStats.java b/core/src/main/java/org/apache/iceberg/ContentStats.java similarity index 92% rename from api/src/main/java/org/apache/iceberg/stats/ContentStats.java rename to core/src/main/java/org/apache/iceberg/ContentStats.java index b39db2565163..623a8eb39baf 100644 --- a/api/src/main/java/org/apache/iceberg/stats/ContentStats.java +++ b/core/src/main/java/org/apache/iceberg/ContentStats.java @@ -16,13 +16,12 @@ * specific language governing permissions and limitations * under the License. 
*/ -package org.apache.iceberg.stats; +package org.apache.iceberg; import java.util.List; -import org.apache.iceberg.StructLike; import org.apache.iceberg.types.Types; -public interface ContentStats extends StructLike { +interface ContentStats extends StructLike { /** A list of all the {@link FieldStats} */ List> fieldStats(); diff --git a/api/src/main/java/org/apache/iceberg/stats/FieldStatistic.java b/core/src/main/java/org/apache/iceberg/FieldStatistic.java similarity index 98% rename from api/src/main/java/org/apache/iceberg/stats/FieldStatistic.java rename to core/src/main/java/org/apache/iceberg/FieldStatistic.java index e3fd26a927fa..85712384254c 100644 --- a/api/src/main/java/org/apache/iceberg/stats/FieldStatistic.java +++ b/core/src/main/java/org/apache/iceberg/FieldStatistic.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package org.apache.iceberg.stats; +package org.apache.iceberg; import static org.apache.iceberg.types.Types.NestedField.optional; @@ -25,7 +25,7 @@ import org.apache.iceberg.types.Type; import org.apache.iceberg.types.Types; -public enum FieldStatistic { +enum FieldStatistic { VALUE_COUNT(1, "value_count"), NULL_VALUE_COUNT(2, "null_value_count"), NAN_VALUE_COUNT(3, "nan_value_count"), diff --git a/api/src/main/java/org/apache/iceberg/stats/FieldStats.java b/core/src/main/java/org/apache/iceberg/FieldStats.java similarity index 92% rename from api/src/main/java/org/apache/iceberg/stats/FieldStats.java rename to core/src/main/java/org/apache/iceberg/FieldStats.java index 6411b479af49..e42d774c7cee 100644 --- a/api/src/main/java/org/apache/iceberg/stats/FieldStats.java +++ b/core/src/main/java/org/apache/iceberg/FieldStats.java @@ -16,12 +16,11 @@ * specific language governing permissions and limitations * under the License. 
*/ -package org.apache.iceberg.stats; +package org.apache.iceberg; -import org.apache.iceberg.StructLike; import org.apache.iceberg.types.Type; -public interface FieldStats extends StructLike { +interface FieldStats extends StructLike { /** The field ID of the statistic */ int fieldId(); diff --git a/core/src/main/java/org/apache/iceberg/MetricsUtil.java b/core/src/main/java/org/apache/iceberg/MetricsUtil.java index 944e833b31d7..72c57a8bebcf 100644 --- a/core/src/main/java/org/apache/iceberg/MetricsUtil.java +++ b/core/src/main/java/org/apache/iceberg/MetricsUtil.java @@ -34,9 +34,6 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.apache.iceberg.stats.BaseContentStats; -import org.apache.iceberg.stats.BaseFieldStats; -import org.apache.iceberg.stats.ContentStats; import org.apache.iceberg.types.Conversions; import org.apache.iceberg.types.Type; import org.apache.iceberg.types.TypeUtil; @@ -482,7 +479,7 @@ public void set(int pos, T value) { } } - public static ContentStats fromMetrics(Schema schema, Metrics metrics) { + static ContentStats fromMetrics(Schema schema, Metrics metrics) { if (null == metrics) { return null; } diff --git a/api/src/main/java/org/apache/iceberg/stats/StatsUtil.java b/core/src/main/java/org/apache/iceberg/StatsUtil.java similarity index 98% rename from api/src/main/java/org/apache/iceberg/stats/StatsUtil.java rename to core/src/main/java/org/apache/iceberg/StatsUtil.java index 2ff52f92bdda..39fef3d372d3 100644 --- a/api/src/main/java/org/apache/iceberg/stats/StatsUtil.java +++ b/core/src/main/java/org/apache/iceberg/StatsUtil.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. 
*/ -package org.apache.iceberg.stats; +package org.apache.iceberg; import static org.apache.iceberg.types.Types.NestedField.optional; @@ -25,7 +25,6 @@ import java.util.Objects; import java.util.Set; import java.util.stream.Collectors; -import org.apache.iceberg.Schema; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.relocated.com.google.common.collect.Sets; import org.apache.iceberg.types.TypeUtil; @@ -33,7 +32,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -public class StatsUtil { +class StatsUtil { private static final Logger LOG = LoggerFactory.getLogger(StatsUtil.class); // the number of reserved field IDs from the reserved field ID space as defined in // https://iceberg.apache.org/spec/#reserved-field-ids diff --git a/core/src/main/java/org/apache/iceberg/TrackedFile.java b/core/src/main/java/org/apache/iceberg/TrackedFile.java index 78bb7e5288d3..d15f9e582cb6 100644 --- a/core/src/main/java/org/apache/iceberg/TrackedFile.java +++ b/core/src/main/java/org/apache/iceberg/TrackedFile.java @@ -22,7 +22,6 @@ import java.util.Collections; import java.util.List; import java.util.Set; -import org.apache.iceberg.stats.ContentStats; import org.apache.iceberg.types.Types; /** A file tracked by a manifest. */ diff --git a/core/src/test/java/org/apache/iceberg/stats/TestContentStats.java b/core/src/test/java/org/apache/iceberg/TestContentStats.java similarity index 95% rename from core/src/test/java/org/apache/iceberg/stats/TestContentStats.java rename to core/src/test/java/org/apache/iceberg/TestContentStats.java index 6baff7dfe63e..0f06276d454b 100644 --- a/core/src/test/java/org/apache/iceberg/stats/TestContentStats.java +++ b/core/src/test/java/org/apache/iceberg/TestContentStats.java @@ -16,22 +16,21 @@ * specific language governing permissions and limitations * under the License. 
*/ -package org.apache.iceberg.stats; - -import static org.apache.iceberg.stats.FieldStatistic.AVG_VALUE_SIZE; -import static org.apache.iceberg.stats.FieldStatistic.EXACT_BOUNDS; -import static org.apache.iceberg.stats.FieldStatistic.LOWER_BOUND; -import static org.apache.iceberg.stats.FieldStatistic.MAX_VALUE_SIZE; -import static org.apache.iceberg.stats.FieldStatistic.NAN_VALUE_COUNT; -import static org.apache.iceberg.stats.FieldStatistic.NULL_VALUE_COUNT; -import static org.apache.iceberg.stats.FieldStatistic.UPPER_BOUND; -import static org.apache.iceberg.stats.FieldStatistic.VALUE_COUNT; +package org.apache.iceberg; + +import static org.apache.iceberg.FieldStatistic.AVG_VALUE_SIZE; +import static org.apache.iceberg.FieldStatistic.EXACT_BOUNDS; +import static org.apache.iceberg.FieldStatistic.LOWER_BOUND; +import static org.apache.iceberg.FieldStatistic.MAX_VALUE_SIZE; +import static org.apache.iceberg.FieldStatistic.NAN_VALUE_COUNT; +import static org.apache.iceberg.FieldStatistic.NULL_VALUE_COUNT; +import static org.apache.iceberg.FieldStatistic.UPPER_BOUND; +import static org.apache.iceberg.FieldStatistic.VALUE_COUNT; import static org.apache.iceberg.types.Types.NestedField.optional; import static org.apache.iceberg.types.Types.NestedField.required; import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatThrownBy; -import org.apache.iceberg.Schema; import org.apache.iceberg.data.GenericRecord; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; import org.apache.iceberg.types.Types; @@ -154,7 +153,7 @@ public void retrievalByPosition() { assertThatThrownBy(() -> stats.get(0, Long.class)) .isInstanceOf(IllegalArgumentException.class) .hasMessageContaining( - "Wrong class, expected java.lang.Long but was org.apache.iceberg.stats.BaseFieldStats for object:"); + "Wrong class, expected java.lang.Long but was org.apache.iceberg.BaseFieldStats for object:"); } @Test diff --git 
a/core/src/test/java/org/apache/iceberg/stats/TestFieldStats.java b/core/src/test/java/org/apache/iceberg/TestFieldStats.java similarity index 94% rename from core/src/test/java/org/apache/iceberg/stats/TestFieldStats.java rename to core/src/test/java/org/apache/iceberg/TestFieldStats.java index be5f3166940d..c703a3044fc0 100644 --- a/core/src/test/java/org/apache/iceberg/stats/TestFieldStats.java +++ b/core/src/test/java/org/apache/iceberg/TestFieldStats.java @@ -16,16 +16,16 @@ * specific language governing permissions and limitations * under the License. */ -package org.apache.iceberg.stats; - -import static org.apache.iceberg.stats.FieldStatistic.AVG_VALUE_SIZE; -import static org.apache.iceberg.stats.FieldStatistic.EXACT_BOUNDS; -import static org.apache.iceberg.stats.FieldStatistic.LOWER_BOUND; -import static org.apache.iceberg.stats.FieldStatistic.MAX_VALUE_SIZE; -import static org.apache.iceberg.stats.FieldStatistic.NAN_VALUE_COUNT; -import static org.apache.iceberg.stats.FieldStatistic.NULL_VALUE_COUNT; -import static org.apache.iceberg.stats.FieldStatistic.UPPER_BOUND; -import static org.apache.iceberg.stats.FieldStatistic.VALUE_COUNT; +package org.apache.iceberg; + +import static org.apache.iceberg.FieldStatistic.AVG_VALUE_SIZE; +import static org.apache.iceberg.FieldStatistic.EXACT_BOUNDS; +import static org.apache.iceberg.FieldStatistic.LOWER_BOUND; +import static org.apache.iceberg.FieldStatistic.MAX_VALUE_SIZE; +import static org.apache.iceberg.FieldStatistic.NAN_VALUE_COUNT; +import static org.apache.iceberg.FieldStatistic.NULL_VALUE_COUNT; +import static org.apache.iceberg.FieldStatistic.UPPER_BOUND; +import static org.apache.iceberg.FieldStatistic.VALUE_COUNT; import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatThrownBy; @@ -33,7 +33,6 @@ import java.nio.ByteBuffer; import java.nio.charset.StandardCharsets; import java.util.stream.Stream; -import org.apache.iceberg.TestHelpers; import 
org.apache.iceberg.types.Type; import org.apache.iceberg.types.Types; import org.junit.jupiter.api.Test; diff --git a/core/src/test/java/org/apache/iceberg/TestMetrics.java b/core/src/test/java/org/apache/iceberg/TestMetrics.java index d9048a5d5ed3..874bb6d74d44 100644 --- a/core/src/test/java/org/apache/iceberg/TestMetrics.java +++ b/core/src/test/java/org/apache/iceberg/TestMetrics.java @@ -41,8 +41,6 @@ import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.stats.ContentStats; -import org.apache.iceberg.stats.FieldStats; import org.apache.iceberg.types.Type; import org.apache.iceberg.types.Types; import org.apache.iceberg.types.Types.BinaryType; diff --git a/api/src/test/java/org/apache/iceberg/stats/TestStatsUtil.java b/core/src/test/java/org/apache/iceberg/TestStatsUtil.java similarity index 95% rename from api/src/test/java/org/apache/iceberg/stats/TestStatsUtil.java rename to core/src/test/java/org/apache/iceberg/TestStatsUtil.java index 62c7c0ea75fb..54db9e5d2095 100644 --- a/api/src/test/java/org/apache/iceberg/stats/TestStatsUtil.java +++ b/core/src/test/java/org/apache/iceberg/TestStatsUtil.java @@ -16,23 +16,22 @@ * specific language governing permissions and limitations * under the License. 
*/ -package org.apache.iceberg.stats; +package org.apache.iceberg; -import static org.apache.iceberg.stats.FieldStatistic.AVG_VALUE_SIZE; -import static org.apache.iceberg.stats.FieldStatistic.EXACT_BOUNDS; -import static org.apache.iceberg.stats.FieldStatistic.LOWER_BOUND; -import static org.apache.iceberg.stats.FieldStatistic.MAX_VALUE_SIZE; -import static org.apache.iceberg.stats.FieldStatistic.NAN_VALUE_COUNT; -import static org.apache.iceberg.stats.FieldStatistic.NULL_VALUE_COUNT; -import static org.apache.iceberg.stats.FieldStatistic.UPPER_BOUND; -import static org.apache.iceberg.stats.FieldStatistic.VALUE_COUNT; +import static org.apache.iceberg.FieldStatistic.AVG_VALUE_SIZE; +import static org.apache.iceberg.FieldStatistic.EXACT_BOUNDS; +import static org.apache.iceberg.FieldStatistic.LOWER_BOUND; +import static org.apache.iceberg.FieldStatistic.MAX_VALUE_SIZE; +import static org.apache.iceberg.FieldStatistic.NAN_VALUE_COUNT; +import static org.apache.iceberg.FieldStatistic.NULL_VALUE_COUNT; +import static org.apache.iceberg.FieldStatistic.UPPER_BOUND; +import static org.apache.iceberg.FieldStatistic.VALUE_COUNT; import static org.apache.iceberg.types.Types.NestedField.optional; import static org.apache.iceberg.types.Types.NestedField.required; import static org.assertj.core.api.Assertions.assertThat; import java.util.List; import java.util.concurrent.ThreadLocalRandom; -import org.apache.iceberg.Schema; import org.apache.iceberg.types.Types; import org.junit.jupiter.api.Test; diff --git a/core/src/test/java/org/apache/iceberg/TestTrackedFile.java b/core/src/test/java/org/apache/iceberg/TestTrackedFile.java index d468c9352d0e..6d84fd542345 100644 --- a/core/src/test/java/org/apache/iceberg/TestTrackedFile.java +++ b/core/src/test/java/org/apache/iceberg/TestTrackedFile.java @@ -22,7 +22,6 @@ import static org.assertj.core.api.Assertions.assertThat; import java.util.List; -import org.apache.iceberg.stats.StatsUtil; import org.apache.iceberg.types.Types; 
import org.junit.jupiter.api.Test; From 5e3c28438570f0d526616e44d1d41f57aa49f585 Mon Sep 17 00:00:00 2001 From: Russell Spitzer Date: Tue, 14 Apr 2026 17:34:12 -0500 Subject: [PATCH 045/197] API: Relax partition name check when source column is dropped (#15967) Skip the identity name pairing when the partition source id no longer resolves in the schema, so historical specs do not block re-adding a column with the same name. Add API and Spark extension tests. --- .../org/apache/iceberg/PartitionSpec.java | 23 ++++++++++--------- .../iceberg/TestPartitionSpecValidation.java | 16 +++++++++++++ .../TestAlterTablePartitionFields.java | 13 +++++++++++ 3 files changed, 41 insertions(+), 11 deletions(-) diff --git a/api/src/main/java/org/apache/iceberg/PartitionSpec.java b/api/src/main/java/org/apache/iceberg/PartitionSpec.java index c9350077e9a6..90d2dc259dd1 100644 --- a/api/src/main/java/org/apache/iceberg/PartitionSpec.java +++ b/api/src/main/java/org/apache/iceberg/PartitionSpec.java @@ -402,21 +402,22 @@ private void checkAndAddPartitionName(String name, Integer sourceColumnId) { Types.NestedField schemaField = this.caseSensitive ? 
schema.findField(name) : schema.caseInsensitiveFindField(name); if (checkConflicts) { - if (sourceColumnId != null) { - // for identity transform case we allow conflicts between partition and schema field name - // as - // long as they are sourced from the same schema field - Preconditions.checkArgument( - schemaField == null || schemaField.fieldId() == sourceColumnId, - "Cannot create identity partition sourced from different field in schema: %s", - name); - } else { - // for all other transforms we don't allow conflicts between partition name and schema - // field name + if (sourceColumnId == null) { Preconditions.checkArgument( schemaField == null, "Cannot create partition from name that exists in schema: %s", name); + } else { + boolean sourceFieldExists = schema.findField(sourceColumnId) != null; + // For identity transforms, require the partition name to match the source column when it + // still exists in the schema. When the source was dropped, the spec may be historical; + // skip the identity name check in that case. 
+ if (sourceFieldExists) { + Preconditions.checkArgument( + schemaField == null || schemaField.fieldId() == sourceColumnId, + "Cannot create identity partition sourced from different field in schema: %s", + name); + } } } Preconditions.checkArgument(!name.isEmpty(), "Cannot use empty partition name: %s", name); diff --git a/api/src/test/java/org/apache/iceberg/TestPartitionSpecValidation.java b/api/src/test/java/org/apache/iceberg/TestPartitionSpecValidation.java index b8e16a9ee45e..a1709d2a2e06 100644 --- a/api/src/test/java/org/apache/iceberg/TestPartitionSpecValidation.java +++ b/api/src/test/java/org/apache/iceberg/TestPartitionSpecValidation.java @@ -242,6 +242,22 @@ public void testSettingPartitionTransformsWithCustomTargetNamesThatAlreadyExist( "Cannot create identity partition sourced from different field in schema: another_ts"); } + @Test + public void testStalePartitionSourceIdWithReusedColumnName() { + int newFieldId = 2; + int droppedFieldId = 1; + Schema schema = + new Schema(NestedField.required(newFieldId, "category", Types.StringType.get())); + PartitionSpec spec = + PartitionSpec.builderFor(schema) + .withSpecId(0) + .add(droppedFieldId, 1000, "category", Transforms.alwaysNull()) + .build(); + assertThat(spec.fields()).hasSize(1); + assertThat(spec.fields().get(0).sourceId()).isEqualTo(droppedFieldId); + assertThat(spec.fields().get(0).name()).isEqualTo("category"); + } + @Test public void testMissingSourceColumn() { assertThatThrownBy(() -> PartitionSpec.builderFor(SCHEMA).year("missing").build()) diff --git a/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAlterTablePartitionFields.java b/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAlterTablePartitionFields.java index 296564e20d4a..2db56fa844bb 100644 --- a/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAlterTablePartitionFields.java +++ 
b/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestAlterTablePartitionFields.java @@ -647,4 +647,17 @@ public void deleteAfterDroppingPartitionAndSourceColumn() { sql("DELETE FROM %s WHERE id >= 1", tableName); assertThat(sql("SELECT * FROM %s WHERE id >= 1", tableName)).isEmpty(); } + + @TestTemplate + public void testReaddColumnAfterIdentityPartitionDrop() { + createTable("id bigint NOT NULL, category string, data string", "category"); + + sql("ALTER TABLE %s DROP PARTITION FIELD category", tableName); + sql("ALTER TABLE %s DROP COLUMN category", tableName); + sql("ALTER TABLE %s ADD COLUMN category string", tableName); + + sql("INSERT INTO %s (id, category, data) VALUES (1, 'books', 'a')", tableName); + assertThat(sql("SELECT id, category, data FROM %s ORDER BY id", tableName)) + .containsExactly(row(1L, "books", "a")); + } } From dfe16c1abc188ebb09d956976b4eb87f00cdc477 Mon Sep 17 00:00:00 2001 From: Anoop Johnson Date: Wed, 15 Apr 2026 08:03:16 -0700 Subject: [PATCH 046/197] Core, API, Spark: Add FileContent.fromId (#15953) --- .../java/org/apache/iceberg/FileContent.java | 6 +++ .../org/apache/iceberg/TestFileContent.java | 48 +++++++++++++++++++ .../java/org/apache/iceberg/BaseFile.java | 4 +- .../iceberg/spark/SparkContentFile.java | 4 +- .../iceberg/spark/SparkContentFile.java | 4 +- .../iceberg/spark/SparkContentFile.java | 4 +- .../iceberg/spark/SparkContentFile.java | 4 +- 7 files changed, 60 insertions(+), 14 deletions(-) create mode 100644 api/src/test/java/org/apache/iceberg/TestFileContent.java diff --git a/api/src/main/java/org/apache/iceberg/FileContent.java b/api/src/main/java/org/apache/iceberg/FileContent.java index f977b02a9426..1ee1d290b767 100644 --- a/api/src/main/java/org/apache/iceberg/FileContent.java +++ b/api/src/main/java/org/apache/iceberg/FileContent.java @@ -26,6 +26,8 @@ public enum FileContent { DATA_MANIFEST(3), DELETE_MANIFEST(4); + private static final FileContent[] VALUES = 
FileContent.values(); + private final int id; FileContent(int id) { @@ -35,4 +37,8 @@ public enum FileContent { public int id() { return id; } + + public static FileContent fromId(int id) { + return VALUES[id]; + } } diff --git a/api/src/test/java/org/apache/iceberg/TestFileContent.java b/api/src/test/java/org/apache/iceberg/TestFileContent.java new file mode 100644 index 000000000000..bd5e44ed3cf3 --- /dev/null +++ b/api/src/test/java/org/apache/iceberg/TestFileContent.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.util.stream.IntStream; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.EnumSource; +import org.junit.jupiter.params.provider.MethodSource; + +class TestFileContent { + + @ParameterizedTest + @EnumSource(FileContent.class) + void fromId(FileContent content) { + assertThat(FileContent.fromId(content.id())).isEqualTo(content); + } + + static IntStream invalidContentTypeIds() { + return IntStream.of(-1, FileContent.values().length); + } + + @ParameterizedTest + @MethodSource("invalidContentTypeIds") + void fromIdInvalid(int id) { + assertThatThrownBy(() -> FileContent.fromId(id)) + .isInstanceOf(ArrayIndexOutOfBoundsException.class) + .hasMessageContaining(String.valueOf(id)); + } +} diff --git a/core/src/main/java/org/apache/iceberg/BaseFile.java b/core/src/main/java/org/apache/iceberg/BaseFile.java index a02e0eff55a2..3c31c50f099f 100644 --- a/core/src/main/java/org/apache/iceberg/BaseFile.java +++ b/core/src/main/java/org/apache/iceberg/BaseFile.java @@ -45,7 +45,7 @@ abstract class BaseFile extends SupportsIndexProjection StructLike, SpecificData.SchemaConstructable, Serializable { - private static final FileContent[] FILE_CONTENT_VALUES = FileContent.values(); + static final Types.StructType EMPTY_STRUCT_TYPE = Types.StructType.of(); static final PartitionData EMPTY_PARTITION_DATA = new PartitionData(EMPTY_STRUCT_TYPE) { @@ -316,7 +316,7 @@ public void put(int i, Object value) { protected void internalSet(int pos, T value) { switch (pos) { case 0: - this.content = value != null ? FILE_CONTENT_VALUES[(Integer) value] : FileContent.DATA; + this.content = value != null ? 
FileContent.fromId((Integer) value) : FileContent.DATA; return; case 1: // always coerce to String for Serializable diff --git a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/SparkContentFile.java b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/SparkContentFile.java index bad31d8d85f4..78d69eeaaf61 100644 --- a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/SparkContentFile.java +++ b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/SparkContentFile.java @@ -35,8 +35,6 @@ public abstract class SparkContentFile implements ContentFile { - private static final FileContent[] FILE_CONTENT_VALUES = FileContent.values(); - private final int fileContentPosition; private final int filePathPosition; private final int fileFormatPosition; @@ -139,7 +137,7 @@ public FileContent content() { if (wrapped.isNullAt(fileContentPosition)) { return null; } - return FILE_CONTENT_VALUES[wrapped.getInt(fileContentPosition)]; + return FileContent.fromId(wrapped.getInt(fileContentPosition)); } @Override diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/SparkContentFile.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/SparkContentFile.java index bad31d8d85f4..78d69eeaaf61 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/SparkContentFile.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/SparkContentFile.java @@ -35,8 +35,6 @@ public abstract class SparkContentFile implements ContentFile { - private static final FileContent[] FILE_CONTENT_VALUES = FileContent.values(); - private final int fileContentPosition; private final int filePathPosition; private final int fileFormatPosition; @@ -139,7 +137,7 @@ public FileContent content() { if (wrapped.isNullAt(fileContentPosition)) { return null; } - return FILE_CONTENT_VALUES[wrapped.getInt(fileContentPosition)]; + return FileContent.fromId(wrapped.getInt(fileContentPosition)); } @Override diff --git 
a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkContentFile.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkContentFile.java index bad31d8d85f4..78d69eeaaf61 100644 --- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkContentFile.java +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkContentFile.java @@ -35,8 +35,6 @@ public abstract class SparkContentFile implements ContentFile { - private static final FileContent[] FILE_CONTENT_VALUES = FileContent.values(); - private final int fileContentPosition; private final int filePathPosition; private final int fileFormatPosition; @@ -139,7 +137,7 @@ public FileContent content() { if (wrapped.isNullAt(fileContentPosition)) { return null; } - return FILE_CONTENT_VALUES[wrapped.getInt(fileContentPosition)]; + return FileContent.fromId(wrapped.getInt(fileContentPosition)); } @Override diff --git a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/SparkContentFile.java b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/SparkContentFile.java index bad31d8d85f4..78d69eeaaf61 100644 --- a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/SparkContentFile.java +++ b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/SparkContentFile.java @@ -35,8 +35,6 @@ public abstract class SparkContentFile implements ContentFile { - private static final FileContent[] FILE_CONTENT_VALUES = FileContent.values(); - private final int fileContentPosition; private final int filePathPosition; private final int fileFormatPosition; @@ -139,7 +137,7 @@ public FileContent content() { if (wrapped.isNullAt(fileContentPosition)) { return null; } - return FILE_CONTENT_VALUES[wrapped.getInt(fileContentPosition)]; + return FileContent.fromId(wrapped.getInt(fileContentPosition)); } @Override From 893528c4d2cdded74fc440e1293604234cc5f565 Mon Sep 17 00:00:00 2001 From: Mukunda Rao Katta Date: Wed, 15 Apr 2026 09:09:59 -0700 Subject: [PATCH 047/197] Fix typos in 
javadoc/comment: 'intialize', 'seperated' (#15978) Co-authored-by: MukundaKatta --- .../java/org/apache/iceberg/aws/S3FileIOAwsClientFactories.java | 2 +- core/src/test/java/org/apache/iceberg/TestLocationProvider.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/aws/src/main/java/org/apache/iceberg/aws/S3FileIOAwsClientFactories.java b/aws/src/main/java/org/apache/iceberg/aws/S3FileIOAwsClientFactories.java index 4aec0bda2a13..3306163baffd 100644 --- a/aws/src/main/java/org/apache/iceberg/aws/S3FileIOAwsClientFactories.java +++ b/aws/src/main/java/org/apache/iceberg/aws/S3FileIOAwsClientFactories.java @@ -32,7 +32,7 @@ private S3FileIOAwsClientFactories() {} /** * Attempts to load an AWS client factory class for S3 file IO defined in the catalog property * {@link S3FileIOProperties#CLIENT_FACTORY}. If the property wasn't set, fallback to {@link - * AwsClientFactories#from(Map) to intialize an AWS client factory class} + * AwsClientFactories#from(Map) to initialize an AWS client factory class} * * @param properties catalog properties * @return an instance of a factory class diff --git a/core/src/test/java/org/apache/iceberg/TestLocationProvider.java b/core/src/test/java/org/apache/iceberg/TestLocationProvider.java index 146f2c8da5e7..d665d84cad82 100644 --- a/core/src/test/java/org/apache/iceberg/TestLocationProvider.java +++ b/core/src/test/java/org/apache/iceberg/TestLocationProvider.java @@ -295,7 +295,7 @@ public void testExcludePartitionInPath() { String fileLocation = table.locationProvider().newDataLocation(table.spec(), partitionData, "test.parquet"); - // no partition values included in the path and last part of entropy is seperated with "-" + // no partition values included in the path and last part of entropy is separated with "-" assertThat(fileLocation).endsWith("/data/0110/1010/0011/11101000-test.parquet"); } From b710f476c6509b25435fb573888bd34cea17a4a2 Mon Sep 17 00:00:00 2001 From: Robin Moffatt Date: Wed, 15 Apr 2026 
18:52:33 +0100 Subject: [PATCH 048/197] Build: Fix codeql-action version comment to match pinned SHA (#15985) The pinned SHA c10b8064 is v4.35.1, not the rolling v4 tag. Update the comment to match, fixing the zizmor ref-version-mismatch finding. --- .github/workflows/codeql.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 81bc6b16f82e..fe0459aeb76f 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -46,11 +46,11 @@ jobs: persist-credentials: false - name: Initialize CodeQL - uses: github/codeql-action/init@c10b8064de6f491fea524254123dbe5e09572f13 # v4 + uses: github/codeql-action/init@c10b8064de6f491fea524254123dbe5e09572f13 # v4.35.1 with: languages: actions - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@c10b8064de6f491fea524254123dbe5e09572f13 # v4 + uses: github/codeql-action/analyze@c10b8064de6f491fea524254123dbe5e09572f13 # v4.35.1 with: category: "/language:actions" From 3e56b5287c24806f42dd1835cb8ffe94159a36c5 Mon Sep 17 00:00:00 2001 From: Anoop Johnson Date: Wed, 15 Apr 2026 10:58:17 -0700 Subject: [PATCH 049/197] Core: Add fromId to EntryStatus and ManifestEntry.Status (#15983) Move the cached values() array lookup into the enums themselves and update callers. 
This is a code cleanup similar to https://github.com/apache/iceberg/pull/15953 --- .../java/org/apache/iceberg/EntryStatus.java | 6 +++ .../apache/iceberg/GenericManifestEntry.java | 3 +- .../apache/iceberg/GenericManifestFile.java | 4 +- .../org/apache/iceberg/ManifestEntry.java | 6 +++ .../org/apache/iceberg/TestEntryStatus.java | 48 +++++++++++++++++++ .../iceberg/TestManifestEntryStatus.java | 48 +++++++++++++++++++ 6 files changed, 110 insertions(+), 5 deletions(-) create mode 100644 core/src/test/java/org/apache/iceberg/TestEntryStatus.java create mode 100644 core/src/test/java/org/apache/iceberg/TestManifestEntryStatus.java diff --git a/core/src/main/java/org/apache/iceberg/EntryStatus.java b/core/src/main/java/org/apache/iceberg/EntryStatus.java index a013f263d015..ceabeb562415 100644 --- a/core/src/main/java/org/apache/iceberg/EntryStatus.java +++ b/core/src/main/java/org/apache/iceberg/EntryStatus.java @@ -26,6 +26,8 @@ enum EntryStatus { /** Indicates an entry that has been replaced by a column update or DV change. Added in v4. 
*/ REPLACED(3); + private static final EntryStatus[] VALUES = EntryStatus.values(); + private final int id; EntryStatus(int id) { @@ -35,4 +37,8 @@ enum EntryStatus { public int id() { return id; } + + static EntryStatus fromId(int id) { + return VALUES[id]; + } } diff --git a/core/src/main/java/org/apache/iceberg/GenericManifestEntry.java b/core/src/main/java/org/apache/iceberg/GenericManifestEntry.java index b2ce5fa2aa11..f154c982d1c7 100644 --- a/core/src/main/java/org/apache/iceberg/GenericManifestEntry.java +++ b/core/src/main/java/org/apache/iceberg/GenericManifestEntry.java @@ -26,7 +26,6 @@ class GenericManifestEntry> implements ManifestEntry, IndexedRecord, SpecificData.SchemaConstructable, StructLike { - private static final Status[] STATUS_VALUES = Status.values(); private final org.apache.avro.Schema schema; private Status status = Status.EXISTING; private Long snapshotId = null; @@ -159,7 +158,7 @@ public void setFileSequenceNumber(long newFileSequenceNumber) { public void put(int i, Object v) { switch (i) { case 0: - this.status = STATUS_VALUES[(Integer) v]; + this.status = Status.fromId((Integer) v); return; case 1: this.snapshotId = (Long) v; diff --git a/core/src/main/java/org/apache/iceberg/GenericManifestFile.java b/core/src/main/java/org/apache/iceberg/GenericManifestFile.java index ac93222d01b5..9624484ffe0c 100644 --- a/core/src/main/java/org/apache/iceberg/GenericManifestFile.java +++ b/core/src/main/java/org/apache/iceberg/GenericManifestFile.java @@ -40,8 +40,6 @@ public class GenericManifestFile extends SupportsIndexProjection implements ManifestFile, StructLike, IndexedRecord, SchemaConstructable, Serializable { private static final Schema AVRO_SCHEMA = AvroSchemaUtil.convert(ManifestFile.schema(), "manifest_file"); - private static final ManifestContent[] MANIFEST_CONTENT_VALUES = ManifestContent.values(); - private transient Schema avroSchema; // not final for Java serialization // data fields @@ -343,7 +341,7 @@ protected void 
internalSet(int basePos, T value) { return; case 3: this.content = - value != null ? MANIFEST_CONTENT_VALUES[(Integer) value] : ManifestContent.DATA; + value != null ? ManifestContent.fromId((Integer) value) : ManifestContent.DATA; return; case 4: this.sequenceNumber = value != null ? (Long) value : 0; diff --git a/core/src/main/java/org/apache/iceberg/ManifestEntry.java b/core/src/main/java/org/apache/iceberg/ManifestEntry.java index 4dce92cf5c2f..635231069ffc 100644 --- a/core/src/main/java/org/apache/iceberg/ManifestEntry.java +++ b/core/src/main/java/org/apache/iceberg/ManifestEntry.java @@ -30,6 +30,8 @@ enum Status { ADDED(1), DELETED(2); + private static final Status[] VALUES = Status.values(); + private final int id; Status(int id) { @@ -39,6 +41,10 @@ enum Status { public int id() { return id; } + + static Status fromId(int id) { + return VALUES[id]; + } } // ids for data-file columns are assigned from 1000 diff --git a/core/src/test/java/org/apache/iceberg/TestEntryStatus.java b/core/src/test/java/org/apache/iceberg/TestEntryStatus.java new file mode 100644 index 000000000000..c395cdcece7c --- /dev/null +++ b/core/src/test/java/org/apache/iceberg/TestEntryStatus.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.util.stream.IntStream; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.EnumSource; +import org.junit.jupiter.params.provider.MethodSource; + +class TestEntryStatus { + + @ParameterizedTest + @EnumSource(EntryStatus.class) + void fromId(EntryStatus status) { + assertThat(EntryStatus.fromId(status.id())).isEqualTo(status); + } + + static IntStream invalidIds() { + return IntStream.of(-1, EntryStatus.values().length); + } + + @ParameterizedTest + @MethodSource("invalidIds") + void fromIdInvalid(int id) { + assertThatThrownBy(() -> EntryStatus.fromId(id)) + .isInstanceOf(ArrayIndexOutOfBoundsException.class) + .hasMessageContaining(String.valueOf(id)); + } +} diff --git a/core/src/test/java/org/apache/iceberg/TestManifestEntryStatus.java b/core/src/test/java/org/apache/iceberg/TestManifestEntryStatus.java new file mode 100644 index 000000000000..39867bbf7c02 --- /dev/null +++ b/core/src/test/java/org/apache/iceberg/TestManifestEntryStatus.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.util.stream.IntStream; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.EnumSource; +import org.junit.jupiter.params.provider.MethodSource; + +class TestManifestEntryStatus { + + @ParameterizedTest + @EnumSource(ManifestEntry.Status.class) + void fromId(ManifestEntry.Status status) { + assertThat(ManifestEntry.Status.fromId(status.id())).isEqualTo(status); + } + + static IntStream invalidIds() { + return IntStream.of(-1, ManifestEntry.Status.values().length); + } + + @ParameterizedTest + @MethodSource("invalidIds") + void fromIdInvalid(int id) { + assertThatThrownBy(() -> ManifestEntry.Status.fromId(id)) + .isInstanceOf(ArrayIndexOutOfBoundsException.class) + .hasMessageContaining(String.valueOf(id)); + } +} From fa4e978ca8cc1d4e8a035294c64a837ad60ce93b Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Wed, 15 Apr 2026 12:08:44 -0700 Subject: [PATCH 050/197] ci: remove zizmor ignore for allowlist-check, pin to main (#15987) --- .github/workflows/asf-allowlist-check.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/asf-allowlist-check.yml b/.github/workflows/asf-allowlist-check.yml index d4e84c5922c8..65dbe8bcbee9 100644 --- a/.github/workflows/asf-allowlist-check.yml +++ b/.github/workflows/asf-allowlist-check.yml @@ -43,5 +43,4 @@ jobs: - uses: 
actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 with: persist-credentials: false - # Intentionally unpinned to always use the latest allowlist from the ASF. - - uses: apache/infrastructure-actions/allowlist-check@main # zizmor: ignore[unpinned-uses] + - uses: apache/infrastructure-actions/allowlist-check@4e9c961f587f72b170874b6f5cd4ac15f7f26eb8 # main From 304c00f3c17a67a7eb2306cf018fe8ba5647e4b1 Mon Sep 17 00:00:00 2001 From: Oguzhan Unlu Date: Thu, 16 Apr 2026 00:51:10 +0300 Subject: [PATCH 051/197] Spec: Add 404 response for config endpoint (#15746) --- open-api/rest-catalog-open-api.yaml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/open-api/rest-catalog-open-api.yaml b/open-api/rest-catalog-open-api.yaml index 4b4c9f6730ec..2ef154f18f26 100644 --- a/open-api/rest-catalog-open-api.yaml +++ b/open-api/rest-catalog-open-api.yaml @@ -162,6 +162,15 @@ paths: $ref: '#/components/responses/UnauthorizedResponse' 403: $ref: '#/components/responses/ForbiddenResponse' + 404: + description: Not Found - Warehouse provided in the `warehouse` query parameter is not found. 
+ content: + application/json: + schema: + $ref: '#/components/schemas/IcebergErrorResponse' + examples: + NoSuchWarehouseExample: + $ref: '#/components/examples/NoSuchWarehouseError' 419: $ref: '#/components/responses/AuthenticationTimeoutResponse' 503: @@ -5168,6 +5177,16 @@ components: } } + NoSuchWarehouseError: + summary: The requested warehouse does not exist + value: { + "error": { + "message": "The given warehouse does not exist", + "type": "NoSuchWarehouseException", + "code": 404 + } + } + NoSuchNamespaceError: summary: The requested namespace does not exist value: { From 44145262d2e4e8b79b1e681f8182af72f43ae66d Mon Sep 17 00:00:00 2001 From: Sebastian Baunsgaard Date: Thu, 16 Apr 2026 00:26:29 +0200 Subject: [PATCH 052/197] Core: Optimize RoaringPositionBitmap.setRange with native range API (#15791) * Core: Optimize RoaringPositionBitmap.setRange with native bulk range add --- .../deletes/RoaringPositionBitmap.java | 37 +++++- .../deletes/TestRoaringPositionBitmap.java | 123 +++++++++++++++++- 2 files changed, 155 insertions(+), 5 deletions(-) diff --git a/core/src/main/java/org/apache/iceberg/deletes/RoaringPositionBitmap.java b/core/src/main/java/org/apache/iceberg/deletes/RoaringPositionBitmap.java index 3f7613c6ea58..037a38b114a6 100644 --- a/core/src/main/java/org/apache/iceberg/deletes/RoaringPositionBitmap.java +++ b/core/src/main/java/org/apache/iceberg/deletes/RoaringPositionBitmap.java @@ -79,14 +79,45 @@ public void set(long pos) { } /** - * Sets a range of positions in the bitmap. + * Sets a range of positions in the bitmap. If {@code posStartInclusive} equals {@code + * posEndExclusive}, this method does nothing. 
* * @param posStartInclusive the start position of the range (inclusive) * @param posEndExclusive the end position of the range (exclusive) + * @throws IllegalArgumentException if posStartInclusive > posEndExclusive */ public void setRange(long posStartInclusive, long posEndExclusive) { - for (long pos = posStartInclusive; pos < posEndExclusive; pos++) { - set(pos); + Preconditions.checkArgument( + posStartInclusive <= posEndExclusive, + "Start position must not exceed end position: [%s, %s)", + posStartInclusive, + posEndExclusive); + + if (posStartInclusive == posEndExclusive) { + return; + } + + validatePosition(posStartInclusive); + validatePosition(posEndExclusive - 1); + + int startKey = key(posStartInclusive); + int endKey = key(posEndExclusive - 1); + allocateBitmapsIfNeeded(endKey + 1); + + if (startKey == endKey) { + long lowStart = Integer.toUnsignedLong(pos32Bits(posStartInclusive)); + long lowEnd = Integer.toUnsignedLong(pos32Bits(posEndExclusive - 1)) + 1; + bitmaps[startKey].add(lowStart, lowEnd); + } else { + long firstLowStart = Integer.toUnsignedLong(pos32Bits(posStartInclusive)); + bitmaps[startKey].add(firstLowStart, 1L << 32); + + for (int key = startKey + 1; key < endKey; key++) { + bitmaps[key].add(0L, 1L << 32); + } + + long lastLowEnd = Integer.toUnsignedLong(pos32Bits(posEndExclusive - 1)) + 1; + bitmaps[endKey].add(0L, lastLowEnd); } } diff --git a/core/src/test/java/org/apache/iceberg/deletes/TestRoaringPositionBitmap.java b/core/src/test/java/org/apache/iceberg/deletes/TestRoaringPositionBitmap.java index 2daf0382973b..68b73ed0a218 100644 --- a/core/src/test/java/org/apache/iceberg/deletes/TestRoaringPositionBitmap.java +++ b/core/src/test/java/org/apache/iceberg/deletes/TestRoaringPositionBitmap.java @@ -148,9 +148,116 @@ public void testAddRangeAcrossKeys() { @TestTemplate public void testAddEmptyRange() { + RoaringPositionBitmap equalRange = new RoaringPositionBitmap(); + equalRange.setRange(10, 10); + 
assertThat(equalRange.isEmpty()).isTrue(); + assertThat(equalRange.cardinality()).isEqualTo(0); + assertThat(equalRange.contains(10)).isFalse(); + } + + @TestTemplate + public void testSetRangeReversedThrows() { + RoaringPositionBitmap bitmap = new RoaringPositionBitmap(); + assertThatThrownBy(() -> bitmap.setRange(100, 50)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("Start position must not exceed end position"); + } + + @TestTemplate + public void testAddRangeLargeContiguous() { + RoaringPositionBitmap bitmap = new RoaringPositionBitmap(); + + long start = 500L; + long end = 200_500L; + bitmap.setRange(start, end); + + assertThat(bitmap.cardinality()).isEqualTo(200_000L); + assertThat(bitmap.contains(start)).isTrue(); + assertThat(bitmap.contains(end - 1)).isTrue(); + assertThat(bitmap.contains(start - 1)).isFalse(); + assertThat(bitmap.contains(end)).isFalse(); + } + + @TestTemplate + public void testAddRangeSpanningThreeKeys() { + RoaringPositionBitmap bitmap = new RoaringPositionBitmap(); + + long start = ((long) 0 << 32) | 0xFFFFFFF0L; + long end = ((long) 2 << 32) | 0x10L; + bitmap.setRange(start, end); + + assertThat(bitmap.contains(start)).isTrue(); + assertThat(bitmap.contains(end - 1)).isTrue(); + assertThat(bitmap.contains(start - 1)).isFalse(); + assertThat(bitmap.contains(end)).isFalse(); + + // key 1 should be fully covered + assertThat(bitmap.contains((long) 1 << 32)).isTrue(); + assertThat(bitmap.contains(((long) 1 << 32) | 0xFFFFFFFFL)).isTrue(); + + long expectedCardinality = end - start; + assertThat(bitmap.cardinality()).isEqualTo(expectedCardinality); + } + + @TestTemplate + public void testAddRangeSinglePosition() { + RoaringPositionBitmap rangeBitmap = new RoaringPositionBitmap(); + rangeBitmap.setRange(42, 43); + + RoaringPositionBitmap setBitmap = new RoaringPositionBitmap(); + setBitmap.set(42); + + assertThat(rangeBitmap.cardinality()).isEqualTo(setBitmap.cardinality()); + 
assertThat(rangeBitmap.contains(42)).isTrue(); + assertThat(rangeBitmap.contains(41)).isFalse(); + assertThat(rangeBitmap.contains(43)).isFalse(); + } + + @TestTemplate + public void testAddRangeAtKeyBoundary() { + RoaringPositionBitmap bitmap = new RoaringPositionBitmap(); + + bitmap.setRange(0L, 1L << 32); + + assertThat(bitmap.cardinality()).isEqualTo(1L << 32); + assertThat(bitmap.contains(0L)).isTrue(); + assertThat(bitmap.contains((1L << 32) - 1)).isTrue(); + assertThat(bitmap.contains(1L << 32)).isFalse(); + assertThat(bitmap.allocatedBitmapCount()).isEqualTo(1); + } + + @TestTemplate + public void testAddRangeSameKeyForEachExact() { RoaringPositionBitmap bitmap = new RoaringPositionBitmap(); - bitmap.setRange(10, 10); - assertThat(bitmap.isEmpty()).isTrue(); + + long start = 1000L; + long end = 1200L; + bitmap.setRange(start, end); + + assertThat(bitmap.cardinality()).isEqualTo(end - start); + assertThat(bitmap.contains(start - 1)).isFalse(); + assertThat(bitmap.contains(end)).isFalse(); + + for (long pos = start; pos < end; pos++) { + assertThat(bitmap.contains(pos)).isTrue(); + } + } + + @TestTemplate + public void testAddRangeCrossKeyForEachExact() { + RoaringPositionBitmap bitmap = new RoaringPositionBitmap(); + + long start = ((long) 1 << 32) - 100L; + long end = ((long) 1 << 32) + 100L; + bitmap.setRange(start, end); + + assertThat(bitmap.cardinality()).isEqualTo(end - start); + assertThat(bitmap.contains(start - 1)).isFalse(); + assertThat(bitmap.contains(end)).isFalse(); + + for (long pos = start; pos < end; pos++) { + assertThat(bitmap.contains(pos)).isTrue(); + } } @TestTemplate @@ -357,6 +464,18 @@ public void testUnsupportedPositions() { .hasMessageContaining( "Bitmap supports positions that are >= 0 and <= %s", RoaringPositionBitmap.MAX_POSITION); + + assertThatThrownBy(() -> bitmap.setRange(-1L, 1L)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining( + "Bitmap supports positions that are >= 0 and <= %s", + 
RoaringPositionBitmap.MAX_POSITION); + + assertThatThrownBy(() -> bitmap.setRange(0L, RoaringPositionBitmap.MAX_POSITION + 2L)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining( + "Bitmap supports positions that are >= 0 and <= %s", + RoaringPositionBitmap.MAX_POSITION); } @TestTemplate From 8f30d8350bdac64e67e3778cc9489f07a57bc2e7 Mon Sep 17 00:00:00 2001 From: gaborkaszab Date: Thu, 16 Apr 2026 00:54:06 +0200 Subject: [PATCH 053/197] Core: Introduce default values in RESTCatalogProperties (#15873) NAMESPACE_SEPARATOR and SCAN_PLANNING_MODE don't have their default values in RESTCatalogProperties. To improve code readability, this change introduces their defaults in the same place. --- .../java/org/apache/iceberg/rest/RESTCatalogProperties.java | 5 ++++- .../java/org/apache/iceberg/rest/RESTSessionCatalog.java | 6 +++--- .../main/java/org/apache/iceberg/rest/ResourcePaths.java | 2 +- .../java/org/apache/iceberg/rest/RESTCatalogAdapter.java | 2 +- 4 files changed, 9 insertions(+), 6 deletions(-) diff --git a/core/src/main/java/org/apache/iceberg/rest/RESTCatalogProperties.java b/core/src/main/java/org/apache/iceberg/rest/RESTCatalogProperties.java index c79bf2477228..9f4d8835a71f 100644 --- a/core/src/main/java/org/apache/iceberg/rest/RESTCatalogProperties.java +++ b/core/src/main/java/org/apache/iceberg/rest/RESTCatalogProperties.java @@ -28,7 +28,7 @@ public final class RESTCatalogProperties { private RESTCatalogProperties() {} public static final String SNAPSHOT_LOADING_MODE = "snapshot-loading-mode"; - public static final String SNAPSHOT_LOADING_MODE_DEFAULT = SnapshotMode.ALL.name(); + public static final SnapshotMode SNAPSHOT_LOADING_MODE_DEFAULT = SnapshotMode.ALL; public static final String SNAPSHOTS_QUERY_PARAMETER = "snapshots"; public static final String METRICS_REPORTING_ENABLED = "rest-metrics-reporting-enabled"; @@ -42,10 +42,13 @@ private RESTCatalogProperties() {} public static final String PAGE_SIZE = "rest-page-size"; 
public static final String NAMESPACE_SEPARATOR = "namespace-separator"; + public static final String NAMESPACE_SEPARATOR_DEFAULT = + RESTUtil.NAMESPACE_SEPARATOR_URLENCODED_UTF_8; // Configure scan planning mode // Can be set by server in LoadTableResponse.config() for table-level override public static final String SCAN_PLANNING_MODE = "scan-planning-mode"; + public static final ScanPlanningMode SCAN_PLANNING_MODE_DEFAULT = ScanPlanningMode.CLIENT; public static final String REST_SCAN_PLAN_ID = "rest-scan-plan-id"; diff --git a/core/src/main/java/org/apache/iceberg/rest/RESTSessionCatalog.java b/core/src/main/java/org/apache/iceberg/rest/RESTSessionCatalog.java index cbdf17a8ebbe..c7b5b5d41c74 100644 --- a/core/src/main/java/org/apache/iceberg/rest/RESTSessionCatalog.java +++ b/core/src/main/java/org/apache/iceberg/rest/RESTSessionCatalog.java @@ -265,7 +265,7 @@ public void initialize(String name, Map unresolved) { PropertyUtil.propertyAsString( mergedProps, RESTCatalogProperties.SNAPSHOT_LOADING_MODE, - RESTCatalogProperties.SNAPSHOT_LOADING_MODE_DEFAULT) + RESTCatalogProperties.SNAPSHOT_LOADING_MODE_DEFAULT.name()) .toUpperCase(Locale.US)); this.reporter = CatalogUtil.loadMetricsReporter(mergedProps); @@ -279,7 +279,7 @@ public void initialize(String name, Map unresolved) { PropertyUtil.propertyAsString( mergedProps, RESTCatalogProperties.NAMESPACE_SEPARATOR, - RESTUtil.NAMESPACE_SEPARATOR_URLENCODED_UTF_8); + RESTCatalogProperties.NAMESPACE_SEPARATOR_DEFAULT); this.tableCache = createTableCache(mergedProps); this.closeables.addCloseable(this.tableCache); @@ -615,7 +615,7 @@ private RESTTable restTableForScanPlanning( RESTCatalogProperties.ScanPlanningMode effectiveMode = effectiveModeConfig != null ? 
RESTCatalogProperties.ScanPlanningMode.fromString(effectiveModeConfig) - : RESTCatalogProperties.ScanPlanningMode.CLIENT; + : RESTCatalogProperties.SCAN_PLANNING_MODE_DEFAULT; if (effectiveMode == RESTCatalogProperties.ScanPlanningMode.SERVER) { Preconditions.checkState( diff --git a/core/src/main/java/org/apache/iceberg/rest/ResourcePaths.java b/core/src/main/java/org/apache/iceberg/rest/ResourcePaths.java index 0fc55c1a44d8..a5dea35bf1c9 100644 --- a/core/src/main/java/org/apache/iceberg/rest/ResourcePaths.java +++ b/core/src/main/java/org/apache/iceberg/rest/ResourcePaths.java @@ -57,7 +57,7 @@ public static ResourcePaths forCatalogProperties(Map properties) PropertyUtil.propertyAsString( properties, RESTCatalogProperties.NAMESPACE_SEPARATOR, - RESTUtil.NAMESPACE_SEPARATOR_URLENCODED_UTF_8)); + RESTCatalogProperties.NAMESPACE_SEPARATOR_DEFAULT)); } public static String config() { diff --git a/core/src/test/java/org/apache/iceberg/rest/RESTCatalogAdapter.java b/core/src/test/java/org/apache/iceberg/rest/RESTCatalogAdapter.java index 8ba5daef3f9b..8c6dc52b1575 100644 --- a/core/src/test/java/org/apache/iceberg/rest/RESTCatalogAdapter.java +++ b/core/src/test/java/org/apache/iceberg/rest/RESTCatalogAdapter.java @@ -746,7 +746,7 @@ private static SnapshotMode snapshotModeFromQueryParams(Map quer queryParams .getOrDefault( RESTCatalogProperties.SNAPSHOTS_QUERY_PARAMETER, - RESTCatalogProperties.SNAPSHOT_LOADING_MODE_DEFAULT) + RESTCatalogProperties.SNAPSHOT_LOADING_MODE_DEFAULT.name()) .toUpperCase(Locale.US)); } } From 9418842100e7b0e360a13df95d85b9b823f55426 Mon Sep 17 00:00:00 2001 From: Sreesh Maheshwar Date: Wed, 15 Apr 2026 16:26:44 -0700 Subject: [PATCH 054/197] Hive encryption nits (#14659) * Hive encryption clean-ups * Fix tests * Address review comments * Nit improvements --------- Co-authored-by: Sreesh Maheshwar Co-authored-by: Claude Opus 4.6 (1M context) --- .../iceberg/hive/HiveTableOperations.java | 41 ++++++++++--------- 
.../spark/sql/TestTableEncryption.java | 19 ++++++++- .../spark/sql/TestTableEncryption.java | 19 ++++++++- 3 files changed, 56 insertions(+), 23 deletions(-) diff --git a/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveTableOperations.java b/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveTableOperations.java index 68aedebf4771..1038d907e718 100644 --- a/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveTableOperations.java +++ b/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveTableOperations.java @@ -35,6 +35,7 @@ import org.apache.hadoop.hive.metastore.api.Table; import org.apache.iceberg.BaseMetastoreOperations; import org.apache.iceberg.BaseMetastoreTableOperations; +import org.apache.iceberg.CatalogProperties; import org.apache.iceberg.ClientPool; import org.apache.iceberg.LocationProviders; import org.apache.iceberg.TableMetadata; @@ -56,8 +57,9 @@ import org.apache.iceberg.io.FileIO; import org.apache.iceberg.io.LocationProvider; import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.util.PropertyUtil; import org.apache.thrift.TException; import org.slf4j.Logger; @@ -142,15 +144,17 @@ public EncryptionManager encryption() { } if (tableKeyId != null) { - if (keyManagementClient == null) { - throw new RuntimeException( - "Can't create encryption manager, because key management client is not set"); - } - - Map encryptionProperties = Maps.newHashMap(); - encryptionProperties.put(TableProperties.ENCRYPTION_TABLE_KEY, tableKeyId); - encryptionProperties.put( - TableProperties.ENCRYPTION_DEK_LENGTH, String.valueOf(encryptionDekLength)); + Preconditions.checkArgument( + keyManagementClient != 
null, + "Cannot create encryption manager without a key management client. Consider setting the '%s' catalog property", + CatalogProperties.ENCRYPTION_KMS_IMPL); + + Map encryptionProperties = + ImmutableMap.of( + TableProperties.ENCRYPTION_TABLE_KEY, + tableKeyId, + TableProperties.ENCRYPTION_DEK_LENGTH, + String.valueOf(encryptionDekLength)); encryptionManager = EncryptionUtil.createEncryptionManager( @@ -312,17 +316,16 @@ protected void doCommit(TableMetadata base, TableMetadata metadata) { base.properties().keySet().stream() .filter(key -> !tableMetadata.properties().containsKey(key)) .collect(Collectors.toSet()); - } - if (removedProps.contains(TableProperties.ENCRYPTION_TABLE_KEY)) { - throw new IllegalArgumentException("Cannot remove key in encrypted table"); - } + Preconditions.checkArgument( + !removedProps.contains(TableProperties.ENCRYPTION_TABLE_KEY), + "Cannot remove key ID from an encrypted table"); - if (base != null - && !Objects.equals( - base.properties().get(TableProperties.ENCRYPTION_TABLE_KEY), - metadata.properties().get(TableProperties.ENCRYPTION_TABLE_KEY))) { - throw new IllegalArgumentException("Cannot modify key in encrypted table"); + Preconditions.checkArgument( + Objects.equals( + base.properties().get(TableProperties.ENCRYPTION_TABLE_KEY), + metadata.properties().get(TableProperties.ENCRYPTION_TABLE_KEY)), + "Cannot modify key ID of an encrypted table"); } HMSTablePropertyHelper.updateHmsTableForIcebergTable( diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestTableEncryption.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestTableEncryption.java index a38506d621f9..3b36b7bb0a25 100644 --- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestTableEncryption.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestTableEncryption.java @@ -56,6 +56,7 @@ import org.apache.iceberg.spark.SparkCatalogConfig; import org.apache.iceberg.types.Types; import 
org.apache.parquet.crypto.ParquetCryptoRuntimeException; +import org.apache.spark.SparkException; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.TestTemplate; @@ -248,14 +249,28 @@ public void testMetadataTamperproofing() throws IOException { public void testKeyDelete() { assertThatThrownBy( () -> sql("ALTER TABLE %s UNSET TBLPROPERTIES (`encryption.key-id`)", tableName)) - .hasMessageContaining("Cannot remove key in encrypted table"); + .isInstanceOf(SparkException.class) + .hasMessage("Unsupported table change: Cannot remove key ID from an encrypted table"); } @TestTemplate public void testKeyAlter() { assertThatThrownBy( () -> sql("ALTER TABLE %s SET TBLPROPERTIES ('encryption.key-id'='abcd')", tableName)) - .hasMessageContaining("Cannot modify key in encrypted table"); + .isInstanceOf(SparkException.class) + .hasMessage("Unsupported table change: Cannot modify key ID of an encrypted table"); + } + + @TestTemplate + public void testReplaceKeyChange() { + // Replacing a table with a different encryption key is disallowed + assertThatThrownBy( + () -> + sql( + "REPLACE TABLE %s (id bigint) USING iceberg TBLPROPERTIES ('encryption.key-id'='%s')", + tableName, UnitestKMS.MASTER_KEY_NAME2)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Cannot modify key ID of an encrypted table"); } @TestTemplate diff --git a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestTableEncryption.java b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestTableEncryption.java index a38506d621f9..3b36b7bb0a25 100644 --- a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestTableEncryption.java +++ b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestTableEncryption.java @@ -56,6 +56,7 @@ import org.apache.iceberg.spark.SparkCatalogConfig; import org.apache.iceberg.types.Types; import org.apache.parquet.crypto.ParquetCryptoRuntimeException; +import 
org.apache.spark.SparkException; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.TestTemplate; @@ -248,14 +249,28 @@ public void testMetadataTamperproofing() throws IOException { public void testKeyDelete() { assertThatThrownBy( () -> sql("ALTER TABLE %s UNSET TBLPROPERTIES (`encryption.key-id`)", tableName)) - .hasMessageContaining("Cannot remove key in encrypted table"); + .isInstanceOf(SparkException.class) + .hasMessage("Unsupported table change: Cannot remove key ID from an encrypted table"); } @TestTemplate public void testKeyAlter() { assertThatThrownBy( () -> sql("ALTER TABLE %s SET TBLPROPERTIES ('encryption.key-id'='abcd')", tableName)) - .hasMessageContaining("Cannot modify key in encrypted table"); + .isInstanceOf(SparkException.class) + .hasMessage("Unsupported table change: Cannot modify key ID of an encrypted table"); + } + + @TestTemplate + public void testReplaceKeyChange() { + // Replacing a table with a different encryption key is disallowed + assertThatThrownBy( + () -> + sql( + "REPLACE TABLE %s (id bigint) USING iceberg TBLPROPERTIES ('encryption.key-id'='%s')", + tableName, UnitestKMS.MASTER_KEY_NAME2)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Cannot modify key ID of an encrypted table"); } @TestTemplate From 20ca6c3f88fc237d4af48db4295a9915fb3e958f Mon Sep 17 00:00:00 2001 From: Shawn Chang Date: Wed, 15 Apr 2026 18:02:01 -0700 Subject: [PATCH 055/197] Update Rust status on the site (#15709) --- site/docs/status.md | 54 ++++++++++++++++++++++----------------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/site/docs/status.md b/site/docs/status.md index 22527a6751cf..4ca603e6f5e1 100644 --- a/site/docs/status.md +++ b/site/docs/status.md @@ -86,9 +86,9 @@ This section lists the libraries that implement the Apache Iceberg specification | Update schema | Y | Y | Y | N | Y | | Update partition spec | Y | Y | Y | N | Y | | Update table 
properties | Y | Y | Y | Y | Y | -| Replace sort order | Y | N | N | N | Y | -| Update table location | Y | Y | N | N | Y | -| Update statistics | Y | Y | N | N | Y | +| Replace sort order | Y | N | Y | N | Y | +| Update table location | Y | Y | Y | N | Y | +| Update statistics | Y | Y | Y | N | Y | | Update partition statistics | Y | N | N | N | N | | Expire snapshots | Y | N | N | N | N | | Manage snapshots | Y | N | N | N | N | @@ -100,9 +100,9 @@ This section lists the libraries that implement the Apache Iceberg specification | Update schema | Y | Y | N | N | Y | | Update partition spec | Y | Y | N | N | Y | | Update table properties | Y | Y | Y | Y | Y | -| Replace sort order | Y | N | N | N | Y | -| Update table location | Y | Y | N | N | Y | -| Update statistics | Y | Y | N | N | Y | +| Replace sort order | Y | N | Y | N | Y | +| Update table location | Y | Y | Y | N | Y | +| Update statistics | Y | Y | Y | N | Y | | Update partition statistics | Y | N | N | N | N | | Expire snapshots | Y | N | N | N | N | | Manage snapshots | Y | N | N | N | N | @@ -113,7 +113,7 @@ This section lists the libraries that implement the Apache Iceberg specification | Operation | Java | PyIceberg | Rust | Go | C++ | |-------------------|------|-----------|------|----|-----| -| Append data files | Y | Y | N | Y | Y | +| Append data files | Y | Y | Y | Y | Y | | Rewrite files | Y | Y | N | N | N | | Rewrite manifests | Y | Y | N | Y | N | | Overwrite files | Y | Y | N | N | N | @@ -123,7 +123,7 @@ This section lists the libraries that implement the Apache Iceberg specification | Operation | Java | PyIceberg | Rust | Go | C++ | |-------------------|------|-----------|------|----|-----| -| Append data files | Y | Y | N | Y | Y | +| Append data files | Y | Y | Y | Y | Y | | Rewrite files | Y | Y | N | N | N | | Rewrite manifests | Y | Y | N | Y | N | | Overwrite files | Y | Y | N | N | N | @@ -145,12 +145,12 @@ This section lists the libraries that implement the Apache Iceberg 
specification | Operation | Java | PyIceberg | Rust | Go | C++ | |-----------------------------|------|-----------|------|----|-----| | Plan with data file | Y | Y | Y | Y | Y | -| Plan with position deletes | Y | Y | N | Y | Y | -| Plan with equality deletes | Y | Y | N | N | Y | +| Plan with position deletes | Y | Y | Y | Y | Y | +| Plan with equality deletes | Y | Y | Y | N | Y | | Plan with puffin statistics | Y | N | N | N | N | | Read data file | Y | Y | Y | Y | Y | -| Read with position deletes | Y | Y | N | Y | N | -| Read with equality deletes | Y | N | N | N | N | +| Read with position deletes | Y | Y | Y | Y | N | +| Read with equality deletes | Y | N | Y | N | N | ## Table Write Operations @@ -166,7 +166,7 @@ This section lists the libraries that implement the Apache Iceberg specification |------------------------|------|-----------|------|----|-----| | Append data | Y | Y | Y | Y | N | | Write position deletes | Y | N | N | N | N | -| Write equality deletes | Y | N | N | N | N | +| Write equality deletes | Y | N | Y | N | N | ## Catalogs @@ -267,12 +267,12 @@ The sql catalog is a catalog backed by a sql database, which is called jdbc cata | Namespace Operation | Java | PyIceberg | Rust | Go | C++ | |---------------------------|------|-----------|------|----|-----| -| listNamespaces | Y | Y | N | Y | N | -| createNamespace | Y | Y | N | Y | N | +| listNamespaces | Y | Y | Y | Y | N | +| createNamespace | Y | Y | Y | Y | N | | dropNamespace | Y | Y | Y | Y | N | -| namespaceExists | Y | N | N | Y | N | +| namespaceExists | Y | N | Y | Y | N | | updateNamespaceProperties | Y | Y | Y | Y | N | -| loadNamespaceMetadata | Y | Y | N | Y | N | +| loadNamespaceMetadata | Y | Y | Y | Y | N | ### Glue Catalog @@ -315,12 +315,12 @@ The sql catalog is a catalog backed by a sql database, which is called jdbc cata | Namespace Operation | Java | PyIceberg | Rust | Go | C++ | |---------------------------|------|-----------|------|----|-----| -| listNamespaces | Y | Y | 
N | Y | N | -| createNamespace | Y | Y | N | Y | N | -| dropNamespace | Y | Y | N | Y | N | -| namespaceExists | Y | N | N | Y | N | +| listNamespaces | Y | Y | Y | Y | N | +| createNamespace | Y | Y | Y | Y | N | +| dropNamespace | Y | Y | Y | Y | N | +| namespaceExists | Y | N | Y | Y | N | | updateNamespaceProperties | Y | Y | Y | Y | N | -| loadNamespaceMetadata | Y | Y | N | Y | N | +| loadNamespaceMetadata | Y | Y | Y | Y | N | ### Hive Metastore Catalog @@ -363,9 +363,9 @@ The sql catalog is a catalog backed by a sql database, which is called jdbc cata | Namespace Operation | Java | PyIceberg | Rust | Go | C++ | |---------------------------|------|-----------|------|----|-----| -| listNamespaces | Y | Y | N | N | N | -| createNamespace | Y | Y | N | N | N | -| dropNamespace | Y | Y | N | N | N | -| namespaceExists | Y | N | N | N | N | +| listNamespaces | Y | Y | Y | N | N | +| createNamespace | Y | Y | Y | N | N | +| dropNamespace | Y | Y | Y | N | N | +| namespaceExists | Y | N | Y | N | N | | updateNamespaceProperties | Y | Y | Y | Y | N | -| loadNamespaceMetadata | Y | Y | N | N | N | +| loadNamespaceMetadata | Y | Y | Y | N | N | From 0babf7d171d45fd0b2ab7de55f5002c2fbe0cf61 Mon Sep 17 00:00:00 2001 From: Yuya Ebihara Date: Thu, 16 Apr 2026 12:09:36 +0900 Subject: [PATCH 056/197] Core: Fix StructLikeWrapper.equals exception with mismatched partition types (#15945) --- .../iceberg/util/StructLikeWrapper.java | 8 +++- .../iceberg/util/TestStructLikeWrapper.java | 47 +++++++++++++++++++ 2 files changed, 54 insertions(+), 1 deletion(-) create mode 100644 core/src/test/java/org/apache/iceberg/util/TestStructLikeWrapper.java diff --git a/core/src/main/java/org/apache/iceberg/util/StructLikeWrapper.java b/core/src/main/java/org/apache/iceberg/util/StructLikeWrapper.java index 28629706bf5e..2e71d2419185 100644 --- a/core/src/main/java/org/apache/iceberg/util/StructLikeWrapper.java +++ b/core/src/main/java/org/apache/iceberg/util/StructLikeWrapper.java @@ -88,7 
+88,13 @@ public boolean equals(Object other) { return false; } - return comparator.compare(this.struct, that.struct) == 0; + try { + return comparator.compare(this.struct, that.struct) == 0; + } catch (RuntimeException e) { + // An exception may occur, for example, when struct is PartitionData and its type does not + // match its data. + return false; + } } @Override diff --git a/core/src/test/java/org/apache/iceberg/util/TestStructLikeWrapper.java b/core/src/test/java/org/apache/iceberg/util/TestStructLikeWrapper.java new file mode 100644 index 000000000000..9eaa45c85a48 --- /dev/null +++ b/core/src/test/java/org/apache/iceberg/util/TestStructLikeWrapper.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.util; + +import static org.assertj.core.api.Assertions.assertThat; + +import org.apache.iceberg.PartitionData; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.Test; + +public class TestStructLikeWrapper { + @Test + public void equalsTypeAndDataMismatch() { + Types.StructType intType = + Types.StructType.of(Types.NestedField.required(1, "a", Types.IntegerType.get())); + Types.StructType stringType = + Types.StructType.of(Types.NestedField.required(1, "a", Types.StringType.get())); + + PartitionData intData = new PartitionData(intType); + intData.set(0, 1); + + PartitionData stringData = new PartitionData(stringType); + stringData.set(0, "test"); + + StructLikeWrapper integerStruct = StructLikeWrapper.forType(intType).set(intData); + StructLikeWrapper stringStruct = StructLikeWrapper.forType(stringType).set(stringData); + + // StructLikeWrapper.equals previously threw an exception when the type and data mismatch + assertThat(integerStruct).isNotEqualTo(stringStruct); + } +} From 46c1101ad8f5e13c0efd5d991d8ce22c8d7e7de9 Mon Sep 17 00:00:00 2001 From: Barry <100205797+barry3406@users.noreply.github.com> Date: Thu, 16 Apr 2026 11:12:56 +0800 Subject: [PATCH 057/197] API: Fix FileRange validation to reject negative offset/length (#15926) * API: Fix FileRange validation to reject negative offset/length The constructor validated length() and offset() (getters) before assigning the constructor parameters to the fields. Since field defaults are 0, negative inputs bypassed validation silently. Validate the constructor parameters directly instead of the getters. Fixes #15922 * API: Add unit tests for FileRange constructor validation Verify that negative offset, negative length, and null byteBuffer are properly rejected by the constructor. * API: Use exact error messages in TestFileRange assertions Addresses review feedback to tighten assertions to exact messages. 
--- .../java/org/apache/iceberg/io/FileRange.java | 6 +- .../org/apache/iceberg/io/TestFileRange.java | 62 +++++++++++++++++++ 2 files changed, 64 insertions(+), 4 deletions(-) create mode 100644 api/src/test/java/org/apache/iceberg/io/TestFileRange.java diff --git a/api/src/main/java/org/apache/iceberg/io/FileRange.java b/api/src/main/java/org/apache/iceberg/io/FileRange.java index f6d5d9b41cca..695d516725a6 100644 --- a/api/src/main/java/org/apache/iceberg/io/FileRange.java +++ b/api/src/main/java/org/apache/iceberg/io/FileRange.java @@ -31,10 +31,8 @@ public class FileRange { public FileRange(CompletableFuture byteBuffer, long offset, int length) throws EOFException { Preconditions.checkNotNull(byteBuffer, "byteBuffer can't be null"); - Preconditions.checkArgument( - length() >= 0, "Invalid length: %s in range (must be >= 0)", length); - Preconditions.checkArgument( - offset() >= 0, "Invalid offset: %s in range (must be >= 0)", offset); + Preconditions.checkArgument(length >= 0, "Invalid length: %s in range (must be >= 0)", length); + Preconditions.checkArgument(offset >= 0, "Invalid offset: %s in range (must be >= 0)", offset); this.byteBuffer = byteBuffer; this.offset = offset; diff --git a/api/src/test/java/org/apache/iceberg/io/TestFileRange.java b/api/src/test/java/org/apache/iceberg/io/TestFileRange.java new file mode 100644 index 000000000000..dc4ede9ec3b4 --- /dev/null +++ b/api/src/test/java/org/apache/iceberg/io/TestFileRange.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.io; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.io.EOFException; +import java.nio.ByteBuffer; +import java.util.concurrent.CompletableFuture; +import org.junit.jupiter.api.Test; + +public class TestFileRange { + + @Test + public void validRange() throws EOFException { + CompletableFuture future = new CompletableFuture<>(); + FileRange range = new FileRange(future, 10L, 100); + assertThat(range.offset()).isEqualTo(10L); + assertThat(range.length()).isEqualTo(100); + assertThat(range.byteBuffer()).isSameAs(future); + } + + @Test + public void negativeLength() { + CompletableFuture future = new CompletableFuture<>(); + assertThatThrownBy(() -> new FileRange(future, 0L, -1)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Invalid length: -1 in range (must be >= 0)"); + } + + @Test + public void negativeOffset() { + CompletableFuture future = new CompletableFuture<>(); + assertThatThrownBy(() -> new FileRange(future, -1L, 0)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Invalid offset: -1 in range (must be >= 0)"); + } + + @Test + public void nullByteBuffer() { + assertThatThrownBy(() -> new FileRange(null, 0L, 0)) + .isInstanceOf(NullPointerException.class) + .hasMessage("byteBuffer can't be null"); + } +} From 74acec7b3e108f6676243ad5eb8461f24a64c882 Mon Sep 17 00:00:00 2001 From: yadavay-amzn Date: Wed, 15 Apr 2026 20:14:01 -0700 Subject: [PATCH 058/197] Docs: Replace deprecated 'compile' 
with 'implementation' in Gradle snippet (#15921) The Gradle snippet on the Releases page used the 'compile' configuration, which was removed in Gradle 7. Updated to 'implementation' to match current Gradle conventions and Iceberg's own build.gradle. Closes #15811 Co-authored-by: Anupam Yadav --- site/docs/releases.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/site/docs/releases.md b/site/docs/releases.md index d3a5ed9a4904..bc8957942fc2 100644 --- a/site/docs/releases.md +++ b/site/docs/releases.md @@ -45,7 +45,7 @@ To add a dependency on Iceberg in Gradle, add the following to `build.gradle`: ``` dependencies { - compile 'org.apache.iceberg:iceberg-core:{{ icebergVersion }}' + implementation 'org.apache.iceberg:iceberg-core:{{ icebergVersion }}' } ``` From 1a6a4881012bbf367296f138685b9f8d80d800ee Mon Sep 17 00:00:00 2001 From: Manu Zhang Date: Thu, 16 Apr 2026 11:19:56 +0800 Subject: [PATCH 059/197] Build: Ignore `.githooks` (#15909) * Build: Ignore `.githooks` * Build: Ignore `.githooks` --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 7812e21f89e0..1f5a8efe37f5 100644 --- a/.gitignore +++ b/.gitignore @@ -77,3 +77,6 @@ derby.log # sdkman .sdkmanrc + +# git hooks like pre-commit +.githooks/ From f0cf4de766a9377771eb3273d4e86d9b01b107c1 Mon Sep 17 00:00:00 2001 From: sanshi <43472713+lilei1128@users.noreply.github.com> Date: Thu, 16 Apr 2026 11:55:32 +0800 Subject: [PATCH 060/197] Docs: Document that positionDeleteWriteBuilder is for format-version 2 tables only (#15980) --- .../java/org/apache/iceberg/formats/FormatModelRegistry.java | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/core/src/main/java/org/apache/iceberg/formats/FormatModelRegistry.java b/core/src/main/java/org/apache/iceberg/formats/FormatModelRegistry.java index 4a6b5a6cf40f..e1e93aa1fd07 100644 --- a/core/src/main/java/org/apache/iceberg/formats/FormatModelRegistry.java +++ 
b/core/src/main/java/org/apache/iceberg/formats/FormatModelRegistry.java @@ -170,6 +170,11 @@ public static FileWriterBuilder, S> equalityDelet * records that identify rows to be deleted by file path and position, producing a {@link * DeleteFile} that can be used for table operations. * + *

Note: This method is only applicable to format-version 2 tables. Format-version 3 + * tables use deletion vectors, which are always written in Puffin format. Registered {@link + * FormatModel} implementations for {@link PositionDelete} are not consulted for format-version 3+ + * tables. + * * @param format the file format used for writing * @param outputFile destination for the written data * @return a configured delete write builder for creating a {@link PositionDeleteWriter} From f2ed6a9dba16e2a7e7015e92c60b3749a385dd75 Mon Sep 17 00:00:00 2001 From: Rulin Xing Date: Thu, 16 Apr 2026 02:35:35 -0700 Subject: [PATCH 061/197] AWS: Close custom AwsCredentialsProvider in RESTSigV4AuthSession (#15818) * Close custom AwsCredentialsProvider properly * Address comments --- .../iceberg/aws/RESTSigV4AuthSession.java | 16 ++++++- .../iceberg/aws/TestRESTSigV4AuthSession.java | 42 +++++++++++++++++++ 2 files changed, 57 insertions(+), 1 deletion(-) diff --git a/aws/src/main/java/org/apache/iceberg/aws/RESTSigV4AuthSession.java b/aws/src/main/java/org/apache/iceberg/aws/RESTSigV4AuthSession.java index 98808ead4f0b..48281841be37 100644 --- a/aws/src/main/java/org/apache/iceberg/aws/RESTSigV4AuthSession.java +++ b/aws/src/main/java/org/apache/iceberg/aws/RESTSigV4AuthSession.java @@ -18,12 +18,15 @@ */ package org.apache.iceberg.aws; +import java.io.IOException; +import java.io.UncheckedIOException; import java.net.URI; import java.nio.charset.StandardCharsets; import java.util.List; import java.util.Map; import java.util.stream.Collectors; import org.apache.commons.io.IOUtils; +import org.apache.iceberg.io.CloseableGroup; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.rest.HTTPHeaders; import org.apache.iceberg.rest.HTTPHeaders.HTTPHeader; @@ -64,16 +67,23 @@ public class RESTSigV4AuthSession implements AuthSession { private final Region signingRegion; private final String signingName; private final AwsCredentialsProvider 
credentialsProvider; + private final CloseableGroup closeableGroup; @SuppressWarnings("deprecation") public RESTSigV4AuthSession( Aws4Signer aws4Signer, AuthSession delegateAuthSession, AwsProperties awsProperties) { + this.closeableGroup = new CloseableGroup(); + this.closeableGroup.setSuppressCloseFailure(true); this.signer = Preconditions.checkNotNull(aws4Signer, "Invalid signer: null"); this.delegate = Preconditions.checkNotNull(delegateAuthSession, "Invalid delegate: null"); + this.closeableGroup.addCloseable(this.delegate); Preconditions.checkNotNull(awsProperties, "Invalid AWS properties: null"); this.signingRegion = awsProperties.restSigningRegion(); this.signingName = awsProperties.restSigningName(); this.credentialsProvider = awsProperties.restCredentialsProvider(); + if (credentialsProvider instanceof AutoCloseable closeableCredentialsProvider) { + this.closeableGroup.addCloseable(closeableCredentialsProvider); + } } public AuthSession delegate() { @@ -87,7 +97,11 @@ public HTTPRequest authenticate(HTTPRequest request) { @Override public void close() { - delegate.close(); + try { + closeableGroup.close(); + } catch (IOException e) { + throw new UncheckedIOException(e); + } } @SuppressWarnings("deprecation") diff --git a/aws/src/test/java/org/apache/iceberg/aws/TestRESTSigV4AuthSession.java b/aws/src/test/java/org/apache/iceberg/aws/TestRESTSigV4AuthSession.java index 1b2aaf2e1c01..9e996ca60089 100644 --- a/aws/src/test/java/org/apache/iceberg/aws/TestRESTSigV4AuthSession.java +++ b/aws/src/test/java/org/apache/iceberg/aws/TestRESTSigV4AuthSession.java @@ -35,7 +35,10 @@ import org.apache.iceberg.rest.requests.CreateNamespaceRequest; import org.junit.jupiter.api.Test; import org.mockito.Mockito; +import software.amazon.awssdk.auth.credentials.AwsCredentialsProvider; import software.amazon.awssdk.auth.signer.Aws4Signer; +import software.amazon.awssdk.regions.Region; +import software.amazon.awssdk.utils.SdkAutoCloseable; class TestRESTSigV4AuthSession { @@ 
-306,4 +309,43 @@ void close() { session.close(); Mockito.verify(delegate).close(); } + + @Test + void closeWithCloseableCredentialsProvider() { + AuthSession delegate = Mockito.mock(AuthSession.class); + CloseableAwsCredentialsProvider credentialsProvider = + Mockito.mock(CloseableAwsCredentialsProvider.class); + closeWithCloseableCredentialsProvider(delegate, credentialsProvider); + } + + @Test + void closeSuppressesFailure() { + AuthSession delegate = Mockito.mock(AuthSession.class); + Mockito.doThrow(new RuntimeException("delegate close failed")).when(delegate).close(); + CloseableAwsCredentialsProvider credentialsProvider = + Mockito.mock(CloseableAwsCredentialsProvider.class); + Mockito.doThrow(new RuntimeException("credentials provider close failed")) + .when(credentialsProvider) + .close(); + closeWithCloseableCredentialsProvider(delegate, credentialsProvider); + } + + private void closeWithCloseableCredentialsProvider( + AuthSession delegate, CloseableAwsCredentialsProvider credentialsProvider) { + AwsProperties properties = Mockito.mock(AwsProperties.class); + when(properties.restSigningRegion()).thenReturn(Region.US_WEST_2); + when(properties.restSigningName()).thenReturn("execute-api"); + when(properties.restCredentialsProvider()).thenReturn(credentialsProvider); + + RESTSigV4AuthSession session = new RESTSigV4AuthSession(signer, delegate, properties); + session.close(); + + Mockito.verify(delegate).close(); + Mockito.verify(credentialsProvider).close(); + } + + interface CloseableAwsCredentialsProvider extends AwsCredentialsProvider, SdkAutoCloseable { + @Override + void close(); + } } From dde712ec9ed6c9d28183ee4615d50f97b246af5d Mon Sep 17 00:00:00 2001 From: GuoYu <511955993@qq.com> Date: Thu, 16 Apr 2026 19:28:57 +0800 Subject: [PATCH 062/197] Data: Clean engineProjection in BaseFormatModelTests (#15995) --- .../iceberg/data/BaseFormatModelTests.java | 21 +------------------ 1 file changed, 1 insertion(+), 20 deletions(-) diff --git 
a/data/src/test/java/org/apache/iceberg/data/BaseFormatModelTests.java b/data/src/test/java/org/apache/iceberg/data/BaseFormatModelTests.java index e295b5fbc1bb..28034933a8f3 100644 --- a/data/src/test/java/org/apache/iceberg/data/BaseFormatModelTests.java +++ b/data/src/test/java/org/apache/iceberg/data/BaseFormatModelTests.java @@ -134,12 +134,7 @@ void testDataWriterEngineWriteGenericRead(FileFormat fileFormat, DataGenerator d FileWriterBuilder, Object> writerBuilder = FormatModelRegistry.dataWriteBuilder(fileFormat, engineType(), encryptedFile); - DataWriter writer = - writerBuilder - .schema(schema) - .engineSchema(engineSchema(schema)) - .spec(PartitionSpec.unpartitioned()) - .build(); + DataWriter writer = writerBuilder.schema(schema).spec(PartitionSpec.unpartitioned()).build(); List genericRecords = dataGenerator.generateRecords(); List engineRecords = convertToEngineRecords(genericRecords, schema); @@ -219,7 +214,6 @@ void testEqualityDeleteWriterEngineWriteGenericRead( EqualityDeleteWriter writer = writerBuilder .schema(schema) - .engineSchema(engineSchema(schema)) .spec(PartitionSpec.unpartitioned()) .equalityFieldIds(1) .build(); @@ -383,7 +377,6 @@ void testReaderBuilderProjection(FileFormat fileFormat) throws IOException { try (CloseableIterable reader = FormatModelRegistry.readBuilder(fileFormat, engineType(), inputFile) .project(projectedSchema) - .engineProjection(engineSchema(projectedSchema)) .build()) { readRecords = ImmutableList.copyOf(reader); } @@ -398,9 +391,6 @@ void testReaderBuilderFilter(FileFormat fileFormat) throws IOException { assumeSupports(fileFormat, FEATURE_FILTER); Schema schema = SCHEMA; - new Schema( - Types.NestedField.required(1, "id", Types.IntegerType.get()), - Types.NestedField.required(2, "data", Types.StringType.get())); // Generate records with known id values [0, count) int count = 10000; @@ -419,7 +409,6 @@ void testReaderBuilderFilter(FileFormat fileFormat) throws IOException { try (CloseableIterable reader = 
FormatModelRegistry.readBuilder(fileFormat, engineType(), inputFile) .project(schema) - .engineProjection(engineSchema(schema)) .filter(lessThanFilter) .build()) { readRecords = ImmutableList.copyOf(reader); @@ -433,7 +422,6 @@ void testReaderBuilderFilter(FileFormat fileFormat) throws IOException { try (CloseableIterable reader = FormatModelRegistry.readBuilder(fileFormat, engineType(), inputFile) .project(schema) - .engineProjection(engineSchema(schema)) .filter(greaterThanFilter) .build()) { readRecords = ImmutableList.copyOf(reader); @@ -471,7 +459,6 @@ void testReaderBuilderCaseSensitive(FileFormat fileFormat) throws IOException { try (CloseableIterable reader = FormatModelRegistry.readBuilder(fileFormat, engineType(), inputFile) .project(schema) - .engineProjection(engineSchema(schema)) .filter(upperCaseFilter) .caseSensitive(false) .build()) { @@ -486,7 +473,6 @@ void testReaderBuilderCaseSensitive(FileFormat fileFormat) throws IOException { try (CloseableIterable reader = FormatModelRegistry.readBuilder(fileFormat, engineType(), inputFile) .project(schema) - .engineProjection(engineSchema(schema)) .filter(upperCaseFilter) .caseSensitive(true) .build()) { @@ -529,7 +515,6 @@ void testReaderBuilderSplit(FileFormat fileFormat) throws IOException { try (CloseableIterable reader = FormatModelRegistry.readBuilder(fileFormat, engineType(), inputFile) .project(schema) - .engineProjection(engineSchema(schema)) .split(firstSplitStart, firstSplitLength) .build()) { readRecords = ImmutableList.copyOf(reader); @@ -542,7 +527,6 @@ void testReaderBuilderSplit(FileFormat fileFormat) throws IOException { try (CloseableIterable reader = FormatModelRegistry.readBuilder(fileFormat, engineType(), inputFile) .project(schema) - .engineProjection(engineSchema(schema)) .split(fileLength, 0) .build()) { emptyReadRecords = ImmutableList.copyOf(reader); @@ -554,7 +538,6 @@ void testReaderBuilderSplit(FileFormat fileFormat) throws IOException { try (CloseableIterable reader = 
FormatModelRegistry.readBuilder(fileFormat, engineType(), inputFile) .project(schema) - .engineProjection(engineSchema(schema)) .split(0, fileLength) .build()) { readRecords = ImmutableList.copyOf(reader); @@ -584,7 +567,6 @@ void testReaderBuilderReuseContainers(FileFormat fileFormat) throws IOException try (CloseableIterable reader = FormatModelRegistry.readBuilder(fileFormat, engineType(), inputFile) .project(schema) - .engineProjection(engineSchema(schema)) .build()) { noReuseRecords = ImmutableList.copyOf(reader); } @@ -600,7 +582,6 @@ void testReaderBuilderReuseContainers(FileFormat fileFormat) throws IOException try (CloseableIterable reader = FormatModelRegistry.readBuilder(fileFormat, engineType(), inputFile) .project(schema) - .engineProjection(engineSchema(schema)) .reuseContainers() .build()) { reuseRecords = ImmutableList.copyOf(reader); From 7897d57fbe6c9c52de93195debbd24ebb41fc163 Mon Sep 17 00:00:00 2001 From: Han You Date: Fri, 17 Apr 2026 04:37:33 -0500 Subject: [PATCH 063/197] Flink: Add passthroughRecords option to DynamicIcebergSink (#15433) Co-authored-by: Han You Co-authored-by: Jordan Epstein --- docs/docs/flink-writes.md | 26 ++- .../sink/dynamic/DynamicIcebergSink.java | 132 ++++++++++++-- .../flink/sink/dynamic/DynamicRecord.java | 26 ++- .../sink/dynamic/DynamicRecordProcessor.java | 58 ++++-- .../sink/dynamic/TestDynamicIcebergSink.java | 165 +++++++++++++++++- 5 files changed, 361 insertions(+), 46 deletions(-) diff --git a/docs/docs/flink-writes.md b/docs/docs/flink-writes.md index 3fef3a1bf3bf..09fa22b640c7 100644 --- a/docs/docs/flink-writes.md +++ b/docs/docs/flink-writes.md @@ -483,7 +483,7 @@ We need the following information (DynamicRecord) for every record: | `Schema` | The schema of the record. | | `Spec` | The expected partitioning specification for the record. | | `RowData` | The actual row data to be written. | -| `DistributionMode` | The distribution mode for writing the record (currently supports NONE or HASH). 
| +| `DistributionMode` | The distribution mode for writing the record (NONE, HASH or `null`). When `null`, the record won't be shuffled at all. | | `Parallelism` | The maximum number of parallel writers for a given table/branch/schema/spec (WriteTarget). | | `UpsertMode` | Overrides this table's write.upsert.enabled (optional). | | `EqualityFields` | The equality fields for the table(optional). | @@ -547,6 +547,30 @@ The Dynamic Iceberg Flink Sink is configured using the Builder pattern. Here are | `tableCreator(TableCreator creator)` | When DynamicIcebergSink creates new Iceberg tables, allows overriding how tables are created - setting custom table properties and location based on the table name. | | `dropUnusedColumns(boolean enabled)` | When enabled, drops all columns from the current table schema which are not contained in the input schema (see the caveats above on dropping columns). | +### Distribution Modes + +The `DistributionMode` set on each `DynamicRecord` controls how that record is routed from the processor to the writer: + +| Mode | Behavior | +|--------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `NONE` | Records are distributed across writer subtasks in a round-robin fashion (or by equality fields if set). | +| `HASH` | Records are distributed by partition key (partitioned tables) or equality fields (unpartitioned tables). Ensures that records for the same partition are handled by the same writer subtask. | +| `null` | Forward mode: bypasses distribution entirely and sends records directly via a forward edge (see below). | + +#### Forward Mode + +Using the `DynamicRecord` constructor overload without `distributionMode` parameter bypasses distribution entirely. 
This is designed for high-throughput pipelines where every partition already has a large volume of data and the serialization and network shuffle cost is prohibitive. Records are sent directly from the processor to the writer using a forward edge, enabling Flink operator chaining. Table metadata updates are always performed immediately inside the processor (regardless of `immediateTableUpdate` setting), because a dedicated table-update operator was deliberately omitted to avoid introducing extra data shuffles. + +Forward and regular records can be mixed in the same pipeline. The processor routes records to two separate sink outputs: + +- **Shuffle sink**: receives shuffling records. These go through the normal distribution topology (hash/round-robin) before reaching the writer. +- **Forward sink**: receives records without a `distributionMode`. These skip distribution entirely and flow via a forward edge from the processor, allowing Flink operator chaining. Suited for high-throughput tables where avoiding shuffle overhead is critical. The sink's `writeParallelism` config does not apply to this path. + +!!! warning + +1. In the forward path, schema changes are always applied immediately because records must pass straight through via the forward edge. For the intended high-volume use case, this can cause many conflicting commits to the Iceberg catalog and temporarily delay data processing. Consider either updating the schema externally before publishing records with the new schema, or planning for a temporary disruption in throughput when a new schema is introduced from upstream. +2. Because the forward path skips distribution entirely, users are responsible for distributing the data correctly in the upstream before the records reach the dynamic Iceberg sink. Otherwise, writes could be unbalanced. + ### Notes - **Range distribution mode**: Currently, the dynamic sink does not support the `RANGE` distribution mode, if set, it will fall back to `HASH`. 
diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java index 4b5c9bef41e1..6f5fb945a165 100644 --- a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java @@ -44,6 +44,7 @@ import org.apache.flink.streaming.api.datastream.DataStream; import org.apache.flink.streaming.api.datastream.DataStreamSink; import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator; +import org.apache.flink.streaming.runtime.operators.sink.SinkWriterOperatorFactory; import org.apache.flink.table.data.RowData; import org.apache.flink.util.OutputTag; import org.apache.iceberg.Table; @@ -79,13 +80,17 @@ public class DynamicIcebergSink private final Configuration flinkConfig; private final int cacheMaximumSize; + // Set by the builder before sinkTo() — forward writer results to union into pre-commit topology + private final transient DataStream> forwardWriteResults; + DynamicIcebergSink( CatalogLoader catalogLoader, Map snapshotProperties, String uidPrefix, Map writeProperties, Configuration flinkConfig, - int cacheMaximumSize) { + int cacheMaximumSize, + DataStream> forwardWriteResults) { this.catalogLoader = catalogLoader; this.snapshotProperties = snapshotProperties; this.uidPrefix = uidPrefix; @@ -96,6 +101,7 @@ public class DynamicIcebergSink // This is used to separate files generated by different sinks writing the same table. 
// Also used to generate the aggregator operator name this.sinkId = UUID.randomUUID().toString(); + this.forwardWriteResults = forwardWriteResults; } @Override @@ -144,7 +150,11 @@ public DataStream> addPreCommitTopology( TypeInformation> typeInformation = CommittableMessageTypeInfo.of(this::getCommittableSerializer); - return writeResults + // Union forward writer results with the shuffle writer results + DataStream> allResults = + writeResults.union(forwardWriteResults); + + return allResults .keyBy( committable -> { if (committable instanceof CommittableSummary) { @@ -167,6 +177,55 @@ public SimpleVersionedSerializer getWriteResultSerializer() return new DynamicWriteResultSerializer(); } + /** + * A lightweight Sink used with {@link SinkWriterOperatorFactory} for the forward write path. + * Implements {@link SupportsCommitter} so that {@code SinkWriterOperator} emits committables + * downstream. The committer is never called — committing is handled by the main sink. + */ + @VisibleForTesting + static class ForwardWriterSink + implements Sink, SupportsCommitter { + + private final CatalogLoader catalogLoader; + private final Map writeProperties; + private final Configuration flinkConfig; + private final int cacheMaximumSize; + + ForwardWriterSink( + CatalogLoader catalogLoader, + Map writeProperties, + Configuration flinkConfig, + int cacheMaximumSize) { + this.catalogLoader = catalogLoader; + this.writeProperties = writeProperties; + this.flinkConfig = flinkConfig; + this.cacheMaximumSize = cacheMaximumSize; + } + + @Override + public SinkWriter createWriter(WriterInitContext context) { + return new DynamicWriter( + catalogLoader.loadCatalog(), + writeProperties, + flinkConfig, + cacheMaximumSize, + new DynamicWriterMetrics(context.metricGroup()), + context.getTaskInfo().getIndexOfThisSubtask(), + context.getTaskInfo().getAttemptNumber()); + } + + @Override + public Committer createCommitter(CommitterInitContext context) { + throw new 
UnsupportedOperationException( + "WriterSink is used only for writing; committing is handled by the main sink"); + } + + @Override + public SimpleVersionedSerializer getCommittableSerializer() { + return new DynamicWriteResultSerializer(); + } + } + public static class Builder { private DataStream input; private DynamicRecordGenerator generator; @@ -357,43 +416,79 @@ private String operatorName(String suffix) { return uidPrefix != null ? uidPrefix + "-" + suffix : suffix; } - private DynamicIcebergSink build() { + private DynamicIcebergSink build( + SingleOutputStreamOperator converted, + DynamicRecordInternalType sideOutputType) { Preconditions.checkArgument( generator != null, "Please use withGenerator() to convert the input DataStream."); Preconditions.checkNotNull(catalogLoader, "Catalog loader shouldn't be null"); - uidPrefix = Optional.ofNullable(uidPrefix).orElse(""); - Configuration flinkConfig = readableConfig instanceof Configuration ? (Configuration) readableConfig : Configuration.fromMap(readableConfig.toMap()); - return instantiateSink(writeOptions, flinkConfig); + // Forward writer: chained with generator via forward edge, no data shuffle + ForwardWriterSink forwardWriterSink = + new ForwardWriterSink(catalogLoader, writeOptions, flinkConfig, cacheMaximumSize); + TypeInformation> writeResultTypeInfo = + CommittableMessageTypeInfo.of(DynamicWriteResultSerializer::new); + + DataStream> forwardWriteResults = + converted + .getSideOutput( + new OutputTag<>(DynamicRecordProcessor.DYNAMIC_FORWARD_STREAM, sideOutputType)) + .transform( + operatorName("Forward-Writer"), + writeResultTypeInfo, + new SinkWriterOperatorFactory<>(forwardWriterSink)) + .uid(prefixIfNotNull(uidPrefix, "-forward-writer")); + + // Inject forward write results into sink — they'll be unioned in addPreCommitTopology + return instantiateSink(writeOptions, flinkConfig, forwardWriteResults); } @VisibleForTesting DynamicIcebergSink instantiateSink( - Map writeProperties, Configuration 
flinkWriteConf) { + Map writeProperties, + Configuration flinkWriteConf, + DataStream> forwardWriteResults) { return new DynamicIcebergSink( catalogLoader, snapshotSummary, uidPrefix, writeProperties, flinkWriteConf, - cacheMaximumSize); + cacheMaximumSize, + forwardWriteResults); } /** * Append the iceberg sink operators to write records to iceberg table. * + *

<p>The topology splits records by distribution mode: + *
<ul> + *
<li>Forward records ({@code null} distributionMode) go through a forward edge to a chained + * writer, avoiding any data shuffle. + *
<li>Shuffle records (non-null distributionMode) go through the standard Sink2 pipeline with + * hash/round-robin distribution. + *
</ul>
+ * + * Both writers feed into a single shared pre-commit aggregator and committer, ensuring atomic + * commits across both paths. + * * @return {@link DataStreamSink} for sink. */ public DataStreamSink append() { + uidPrefix = Optional.ofNullable(uidPrefix).orElse(""); + DynamicRecordInternalType type = new DynamicRecordInternalType(catalogLoader, false, cacheMaximumSize); - DynamicIcebergSink sink = build(); + DynamicRecordInternalType sideOutputType = + new DynamicRecordInternalType(catalogLoader, true, cacheMaximumSize); + SingleOutputStreamOperator converted = input .process( @@ -411,12 +506,14 @@ public DataStreamSink append() { .name(operatorName("generator")) .returns(type); - DataStreamSink rowDataDataStreamSink = + DynamicIcebergSink sink = build(converted, sideOutputType); + + // Shuffle path: table update side output + main output → sinkTo() + DataStream shuffleInput = converted .getSideOutput( new OutputTag<>( - DynamicRecordProcessor.DYNAMIC_TABLE_UPDATE_STREAM, - new DynamicRecordInternalType(catalogLoader, true, cacheMaximumSize))) + DynamicRecordProcessor.DYNAMIC_TABLE_UPDATE_STREAM, sideOutputType)) .keyBy((KeySelector) DynamicRecordInternal::tableName) .map( new DynamicTableUpdateOperator( @@ -430,16 +527,19 @@ public DataStreamSink append() { .uid(prefixIfNotNull(uidPrefix, "-updater")) .name(operatorName("Updater")) .returns(type) - .union(converted) - .sinkTo(sink) + .union(converted); + + DataStreamSink result = + shuffleInput + .sinkTo(sink) // Forward write results are implicitly injected here .uid(prefixIfNotNull(uidPrefix, "-sink")); FlinkWriteConf flinkWriteConf = new FlinkWriteConf(writeOptions, readableConfig); if (flinkWriteConf.writeParallelism() != null) { - rowDataDataStreamSink.setParallelism(flinkWriteConf.writeParallelism()); + result.setParallelism(flinkWriteConf.writeParallelism()); } - return rowDataDataStreamSink; + return result; } } diff --git 
a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecord.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecord.java index 9f445766083e..15b83a589382 100644 --- a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecord.java +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecord.java @@ -34,20 +34,40 @@ public class DynamicRecord { private Schema schema; private RowData rowData; private PartitionSpec partitionSpec; - private DistributionMode distributionMode; + @Nullable private DistributionMode distributionMode; private int writeParallelism; private boolean upsertMode; @Nullable private Set equalityFields; /** - * Constructs a new DynamicRecord. + * Constructs a new DynamicRecord with forward (no shuffle) writes. * * @param tableIdentifier The target table identifier. * @param branch The target table branch. * @param schema The target table schema. * @param rowData The data matching the provided schema. * @param partitionSpec The target table {@link PartitionSpec}. - * @param distributionMode The {@link DistributionMode}. + */ + public DynamicRecord( + TableIdentifier tableIdentifier, + String branch, + Schema schema, + RowData rowData, + PartitionSpec partitionSpec) { + this(tableIdentifier, branch, schema, rowData, partitionSpec, null, -1); + } + + /** + * Constructs a new DynamicRecord. This record will be shuffled as specified by {@code + * distributionMode}. + * + * @param tableIdentifier The target table identifier. + * @param branch The target table branch. + * @param schema The target table schema. + * @param rowData The data matching the provided schema. + * @param partitionSpec The target table {@link PartitionSpec}. + * @param distributionMode The {@link DistributionMode}. {@code null} indicates forward (no + * shuffle) writes. * @param writeParallelism The number of parallel writers. 
Can be set to any value {@literal > 0}, * but will always be automatically capped by the maximum write parallelism, which is the * parallelism of the sink. Set to Integer.MAX_VALUE for always using the maximum available diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordProcessor.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordProcessor.java index 07dfad2780f7..fc6892b2cd9e 100644 --- a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordProcessor.java +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordProcessor.java @@ -37,6 +37,8 @@ class DynamicRecordProcessor extends ProcessFunction generator; private final CatalogLoader catalogLoader; private final boolean immediateUpdate; @@ -51,6 +53,7 @@ class DynamicRecordProcessor extends ProcessFunction updateStream; + private transient OutputTag forwardStream; private transient Collector collector; private transient Context context; @@ -90,9 +93,14 @@ public void open(OpenContext openContext) throws Exception { this.hashKeyGenerator = new HashKeyGenerator( cacheMaximumSize, getRuntimeContext().getTaskInfo().getMaxNumberOfParallelSubtasks()); - if (immediateUpdate) { - updater = new TableUpdater(tableCache, catalog, caseSensitive, dropUnusedColumns); - } else { + // Always create updater — needed for forced immediate updates on forward records + this.updater = new TableUpdater(tableCache, catalog, caseSensitive, dropUnusedColumns); + // Always create forward stream tag for forward (distributionMode == null) records + this.forwardStream = + new OutputTag<>( + DYNAMIC_FORWARD_STREAM, + new DynamicRecordInternalType(catalogLoader, true, cacheMaximumSize)) {}; + if (!immediateUpdate) { updateStream = new OutputTag<>( DYNAMIC_TABLE_UPDATE_STREAM, @@ -112,6 +120,8 @@ public void processElement(T element, Context ctx, Collector newData = updater.update( data.tableIdentifier(), 
data.branch(), data.schema(), data.spec(), tableCreator); emit( - collector, data, newData.f0.resolvedTableSchema(), newData.f0.recordConverter(), - newData.f1); + newData.f1, + isForward); } else { + // Shuffled records with immediateUpdate=false go to the update side output int writerKey = hashKeyGenerator.generateKey( data, @@ -159,33 +174,38 @@ public void collect(DynamicRecord data) { } } else { emit( - collector, data, foundSchema.resolvedTableSchema(), foundSchema.recordConverter(), - foundSpec); + foundSpec, + isForward); } } private void emit( - Collector out, DynamicRecord data, Schema schema, DataConverter recordConverter, - PartitionSpec spec) { + PartitionSpec spec, + boolean forward) { RowData rowData = (RowData) recordConverter.convert(data.rowData()); - int writerKey = hashKeyGenerator.generateKey(data, schema, spec, rowData); - String tableName = data.tableIdentifier().toString(); - out.collect( + // writerKey is unused in the forward path. + int writerKey = forward ? -1 : hashKeyGenerator.generateKey(data, schema, spec, rowData); + DynamicRecordInternal record = new DynamicRecordInternal( - tableName, + data.tableIdentifier().toString(), data.branch(), schema, rowData, spec, writerKey, data.upsertMode(), - DynamicSinkUtil.getEqualityFieldIds(data.equalityFields(), schema))); + DynamicSinkUtil.getEqualityFieldIds(data.equalityFields(), schema)); + if (forward) { + context.output(forwardStream, record); + } else { + collector.collect(record); + } } @Override diff --git a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSink.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSink.java index 27b1e3d84a8c..2a46f8021cca 100644 --- a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSink.java +++ b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSink.java @@ -43,7 +43,10 @@ import 
org.apache.flink.configuration.Configuration; import org.apache.flink.configuration.RestartStrategyOptions; import org.apache.flink.metrics.groups.UnregisteredMetricsGroup; +import org.apache.flink.runtime.OperatorIDPair; import org.apache.flink.runtime.client.JobExecutionException; +import org.apache.flink.runtime.jobgraph.JobVertex; +import org.apache.flink.streaming.api.connector.sink2.CommittableMessage; import org.apache.flink.streaming.api.datastream.DataStream; import org.apache.flink.streaming.api.datastream.DataStreamSource; import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; @@ -213,6 +216,56 @@ public void generate(DynamicIcebergDataImpl row, Collector out) { } } + /** Generator that always emits forward (null distributionMode) records. */ + private static class ForwardGenerator implements DynamicRecordGenerator { + @Override + public void generate(DynamicIcebergDataImpl row, Collector out) { + TableIdentifier tableIdentifier = TableIdentifier.of(DATABASE, row.tableName); + Schema schema = row.schemaProvided; + PartitionSpec spec = row.partitionSpec; + DynamicRecord dynamicRecord = + new DynamicRecord( + tableIdentifier, + row.branch, + schema, + converter(schema).toInternal(row.rowProvided), + spec); + dynamicRecord.setUpsertMode(row.upsertMode); + dynamicRecord.setEqualityFields(row.equalityFields); + out.collect(dynamicRecord); + } + } + + /** + * Generator that alternates between forward (null distributionMode) and shuffle records. Even + * indices go forward, odd indices go through shuffle. + */ + private static class MixedGenerator implements DynamicRecordGenerator { + private int count = 0; + + @Override + public void generate(DynamicIcebergDataImpl row, Collector out) { + TableIdentifier tableIdentifier = TableIdentifier.of(DATABASE, row.tableName); + Schema schema = row.schemaProvided; + PartitionSpec spec = row.partitionSpec; + boolean forward = (count++ % 2 == 0); + DistributionMode mode = + forward ? 
null : (spec.isPartitioned() ? DistributionMode.HASH : DistributionMode.NONE); + DynamicRecord dynamicRecord = + new DynamicRecord( + tableIdentifier, + row.branch, + schema, + converter(schema).toInternal(row.rowProvided), + spec, + mode, + 10); + dynamicRecord.setUpsertMode(row.upsertMode); + dynamicRecord.setEqualityFields(row.equalityFields); + out.collect(dynamicRecord); + } + } + private static DataFormatConverters.RowConverter converter(Schema schema) { RowType rowType = FlinkSchemaUtil.convert(schema); ResolvedSchema resolvedSchema = FlinkSchemaUtil.toResolvedSchema(rowType); @@ -238,6 +291,96 @@ void testWrite() throws Exception { runTest(rows); } + @Test + void testNoShuffleTopology() throws Exception { + DataStream dataStream = + env.fromData( + Collections.emptyList(), TypeInformation.of(new TypeHint() {})); + DynamicIcebergSink.forInput(dataStream) + .generator(new ForwardGenerator()) + .catalogLoader(CATALOG_EXTENSION.catalogLoader()) + .writeParallelism(2) + .immediateTableUpdate(false) + .overwrite(false) + .append(); + + boolean generatorAndSinkChained = false; + for (JobVertex vertex : env.getStreamGraph().getJobGraph().getVertices()) { + boolean generatorInThisVertex = false; + boolean sinkInThisVertex = false; + for (OperatorIDPair operatorID : vertex.getOperatorIDs()) { + String uid = operatorID.getUserDefinedOperatorUid(); + if (uid == null) { + continue; + } + + if (uid.endsWith("-forward-writer")) { + sinkInThisVertex = true; + } else if (uid.endsWith("-generator")) { + generatorInThisVertex = true; + } + } + + generatorAndSinkChained = generatorInThisVertex && sinkInThisVertex; + if (generatorAndSinkChained) { + break; + } + } + + assertThat(generatorAndSinkChained).isTrue(); + } + + @Test + void testForwardWrite() throws Exception { + runForwardWriteTest(new ForwardGenerator()); + } + + @Test + void testMixedForwardAndShuffleWrite() throws Exception { + runForwardWriteTest(new MixedGenerator()); + } + + private void 
runForwardWriteTest(DynamicRecordGenerator generator) + throws Exception { + List rows = + Lists.newArrayList( + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, + "t1", + SnapshotRef.MAIN_BRANCH, + PartitionSpec.unpartitioned()), + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, + "t1", + SnapshotRef.MAIN_BRANCH, + PartitionSpec.unpartitioned()), + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, + "t1", + SnapshotRef.MAIN_BRANCH, + PartitionSpec.unpartitioned()), + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, + "t1", + SnapshotRef.MAIN_BRANCH, + PartitionSpec.unpartitioned())); + + DataStream dataStream = + env.fromData(rows, TypeInformation.of(new TypeHint<>() {})); + env.setParallelism(1); + + DynamicIcebergSink.forInput(dataStream) + .generator(generator) + .catalogLoader(CATALOG_EXTENSION.catalogLoader()) + .writeParallelism(1) + .immediateTableUpdate(true) + .append(); + + env.execute(); + + verifyResults(rows); + } + @Test void testWritePartitioned() throws Exception { PartitionSpec spec = PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).bucket("id", 10).build(); @@ -1170,8 +1313,9 @@ void testOperatorUidsFormat() { // pre commit topology was off, but since it is stateless, users will still be able to restore // state, but we must keep the stateful operators UUIds like the committer consistent. 
assertThat(sinkUids) - .contains( + .containsOnly( "test--sink", + "test--forward-writer", "test--generator", "test--updater", "test--sink: test--pre-commit-topology", @@ -1179,8 +1323,9 @@ void testOperatorUidsFormat() { sinkUids = createSinkAndReturnUIds(""); assertThat(sinkUids) - .contains( + .containsOnly( "--sink", + "--forward-writer", "--generator", "--updater", "--sink: --pre-commit-topology", @@ -1188,8 +1333,9 @@ void testOperatorUidsFormat() { sinkUids = createSinkAndReturnUIds(null); assertThat(sinkUids) - .contains( + .containsOnly( "--sink", + "--forward-writer", "--generator", "--updater", "--sink: --pre-commit-topology", @@ -1359,7 +1505,9 @@ static class CommitHookEnabledDynamicIcebergSink extends DynamicIcebergSink.B @Override DynamicIcebergSink instantiateSink( - Map writeProperties, Configuration flinkConfig) { + Map writeProperties, + Configuration flinkConfig, + DataStream> forwardWriteResults) { return new CommitHookDynamicIcebergSink( commitHook, CATALOG_EXTENSION.catalogLoader(), @@ -1367,7 +1515,8 @@ DynamicIcebergSink instantiateSink( "uidPrefix", writeProperties, flinkConfig, - 100); + 100, + forwardWriteResults); } } @@ -1383,14 +1532,16 @@ static class CommitHookDynamicIcebergSink extends DynamicIcebergSink { String uidPrefix, Map writeProperties, Configuration flinkConfig, - int cacheMaximumSize) { + int cacheMaximumSize, + DataStream> forwardWritten) { super( catalogLoader, snapshotProperties, uidPrefix, writeProperties, flinkConfig, - cacheMaximumSize); + cacheMaximumSize, + forwardWritten); this.commitHook = commitHook; this.overwriteMode = new FlinkWriteConf(writeProperties, flinkConfig).overwriteMode(); } From ecbe8a84b1897015bdaf0e7c3fdcbb8575aa9971 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Fri, 17 Apr 2026 04:19:06 -0700 Subject: [PATCH 064/197] Build: set zizmor min-severity and min-confidence to medium (#16001) --- .github/workflows/api-binary-compatibility.yml | 2 +- .github/workflows/delta-conversion-ci.yml | 4 ++-- 
.github/workflows/flink-ci.yml | 2 +- .github/workflows/hive-ci.yml | 2 +- .github/workflows/java-ci.yml | 8 ++++---- .github/workflows/jmh-benchmarks.yml | 2 +- .github/workflows/kafka-connect-ci.yml | 2 +- .github/workflows/publish-iceberg-rest-fixture-docker.yml | 2 +- .github/workflows/publish-snapshot.yml | 2 +- .github/workflows/recurring-jmh-benchmarks.yml | 2 +- .github/workflows/spark-ci.yml | 2 +- .github/workflows/zizmor.yml | 2 ++ 12 files changed, 17 insertions(+), 15 deletions(-) diff --git a/.github/workflows/api-binary-compatibility.yml b/.github/workflows/api-binary-compatibility.yml index d91ba210ab61..8ad0ebd26f0e 100644 --- a/.github/workflows/api-binary-compatibility.yml +++ b/.github/workflows/api-binary-compatibility.yml @@ -59,7 +59,7 @@ jobs: with: distribution: zulu java-version: 17 - - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 # zizmor: ignore[cache-poisoning] -- cache writes are restricted to the default branch by setup-gradle + - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 - run: | echo "Using the old version tag, as per git describe, of $(git describe)"; - run: ./gradlew revapi --rerun-tasks diff --git a/.github/workflows/delta-conversion-ci.yml b/.github/workflows/delta-conversion-ci.yml index aac791ab8b1e..82442ac041f4 100644 --- a/.github/workflows/delta-conversion-ci.yml +++ b/.github/workflows/delta-conversion-ci.yml @@ -87,7 +87,7 @@ jobs: with: distribution: zulu java-version: ${{ matrix.jvm }} - - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 # zizmor: ignore[cache-poisoning] -- cache writes are restricted to the default branch by setup-gradle + - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 - run: echo -e "$(ip addr show eth0 | grep "inet\b" | awk '{print $2}' | cut -d/ -f1)\t$(hostname -f) $(hostname -s)" | sudo tee -a /etc/hosts - run: ./gradlew -DsparkVersions=3.5 -DscalaVersion=2.12 
-DkafkaVersions= -DflinkVersions= :iceberg-delta-lake:check -Pquick=true -x javadoc - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 @@ -113,7 +113,7 @@ jobs: with: distribution: zulu java-version: ${{ matrix.jvm }} - - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 # zizmor: ignore[cache-poisoning] -- cache writes are restricted to the default branch by setup-gradle + - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 - run: echo -e "$(ip addr show eth0 | grep "inet\b" | awk '{print $2}' | cut -d/ -f1)\t$(hostname -f) $(hostname -s)" | sudo tee -a /etc/hosts - run: ./gradlew -DsparkVersions=3.5 -DscalaVersion=2.13 -DkafkaVersions= -DflinkVersions= :iceberg-delta-lake:check -Pquick=true -x javadoc - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 diff --git a/.github/workflows/flink-ci.yml b/.github/workflows/flink-ci.yml index 3f346b21846e..5479503179fc 100644 --- a/.github/workflows/flink-ci.yml +++ b/.github/workflows/flink-ci.yml @@ -91,7 +91,7 @@ jobs: with: distribution: zulu java-version: ${{ matrix.jvm }} - - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 # zizmor: ignore[cache-poisoning] -- cache writes are restricted to the default branch by setup-gradle + - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 - run: echo -e "$(ip addr show eth0 | grep "inet\b" | awk '{print $2}' | cut -d/ -f1)\t$(hostname -f) $(hostname -s)" | sudo tee -a /etc/hosts - run: ./gradlew -DsparkVersions= -DkafkaVersions= -DflinkVersions=${{ matrix.flink }} :iceberg-flink:iceberg-flink-${{ matrix.flink }}:check :iceberg-flink:iceberg-flink-runtime-${{ matrix.flink }}:check -Pquick=true -x javadoc -DtestParallelism=auto - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 diff --git a/.github/workflows/hive-ci.yml b/.github/workflows/hive-ci.yml index 
3f9adf2dee9f..084ce9f4fcf4 100644 --- a/.github/workflows/hive-ci.yml +++ b/.github/workflows/hive-ci.yml @@ -88,7 +88,7 @@ jobs: with: distribution: zulu java-version: ${{ matrix.jvm }} - - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 # zizmor: ignore[cache-poisoning] -- cache writes are restricted to the default branch by setup-gradle + - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 - run: echo -e "$(ip addr show eth0 | grep "inet\b" | awk '{print $2}' | cut -d/ -f1)\t$(hostname -f) $(hostname -s)" | sudo tee -a /etc/hosts - run: ./gradlew -DsparkVersions= -DflinkVersions= -DkafkaVersions= -Pquick=true :iceberg-mr:check -x javadoc - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 diff --git a/.github/workflows/java-ci.yml b/.github/workflows/java-ci.yml index 5cc8b198a150..fbcabdb2f32e 100644 --- a/.github/workflows/java-ci.yml +++ b/.github/workflows/java-ci.yml @@ -83,7 +83,7 @@ jobs: with: distribution: zulu java-version: ${{ matrix.jvm }} - - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 # zizmor: ignore[cache-poisoning] -- cache writes are restricted to the default branch by setup-gradle + - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 - run: echo -e "$(ip addr show eth0 | grep "inet\b" | awk '{print $2}' | cut -d/ -f1)\t$(hostname -f) $(hostname -s)" | sudo tee -a /etc/hosts - run: ./gradlew check -DsparkVersions= -DflinkVersions= -DkafkaVersions= -Pquick=true -x javadoc - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 @@ -107,7 +107,7 @@ jobs: with: distribution: zulu java-version: ${{ matrix.jvm }} - - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 # zizmor: ignore[cache-poisoning] -- cache writes are restricted to the default branch by setup-gradle + - uses: 
gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 - run: ./gradlew -DallModules build -x test -x javadoc -x integrationTest build-javadoc: @@ -124,7 +124,7 @@ jobs: with: distribution: zulu java-version: ${{ matrix.jvm }} - - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 # zizmor: ignore[cache-poisoning] -- cache writes are restricted to the default branch by setup-gradle + - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 - run: ./gradlew -Pquick=true javadoc check-runtime-deps: @@ -137,5 +137,5 @@ jobs: with: distribution: zulu java-version: 17 - - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 # zizmor: ignore[cache-poisoning] -- cache writes are restricted to the default branch by setup-gradle + - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 - run: ./gradlew checkAllRuntimeDeps -q diff --git a/.github/workflows/jmh-benchmarks.yml b/.github/workflows/jmh-benchmarks.yml index 9b01352d9aaa..6dbd3a6958fd 100644 --- a/.github/workflows/jmh-benchmarks.yml +++ b/.github/workflows/jmh-benchmarks.yml @@ -103,7 +103,7 @@ jobs: with: distribution: zulu java-version: 17 - - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 # zizmor: ignore[cache-poisoning] -- cache writes are restricted to the default branch by setup-gradle + - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 - run: echo -e "$(ip addr show eth0 | grep "inet\b" | awk '{print $2}' | cut -d/ -f1)\t$(hostname -f) $(hostname -s)" | sudo tee -a /etc/hosts - name: Run Benchmark diff --git a/.github/workflows/kafka-connect-ci.yml b/.github/workflows/kafka-connect-ci.yml index c0490fd6981c..7eaa042990ad 100644 --- a/.github/workflows/kafka-connect-ci.yml +++ b/.github/workflows/kafka-connect-ci.yml @@ -88,7 +88,7 @@ jobs: with: distribution: zulu java-version: ${{ matrix.jvm }} - - uses: 
gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 # zizmor: ignore[cache-poisoning] -- cache writes are restricted to the default branch by setup-gradle + - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 - run: echo -e "$(ip addr show eth0 | grep "inet\b" | awk '{print $2}' | cut -d/ -f1)\t$(hostname -f) $(hostname -s)" | sudo tee -a /etc/hosts - run: | ./gradlew -DsparkVersions= -DflinkVersions= -DkafkaVersions=3 \ diff --git a/.github/workflows/publish-iceberg-rest-fixture-docker.yml b/.github/workflows/publish-iceberg-rest-fixture-docker.yml index 8691a67e29e8..6976e11ac44f 100644 --- a/.github/workflows/publish-iceberg-rest-fixture-docker.yml +++ b/.github/workflows/publish-iceberg-rest-fixture-docker.yml @@ -48,7 +48,7 @@ jobs: with: distribution: zulu java-version: 21 - - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 # zizmor: ignore[cache-poisoning] -- cache writes are restricted to the default branch by setup-gradle + - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 - name: Build Iceberg Open API project run: ./gradlew :iceberg-open-api:shadowJar - name: Login to Docker Hub diff --git a/.github/workflows/publish-snapshot.yml b/.github/workflows/publish-snapshot.yml index dac63bb9fa2e..1cbe5c706279 100644 --- a/.github/workflows/publish-snapshot.yml +++ b/.github/workflows/publish-snapshot.yml @@ -43,7 +43,7 @@ jobs: with: distribution: zulu java-version: 17 - - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 # zizmor: ignore[cache-poisoning] -- cache writes are restricted to the default branch by setup-gradle + - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 - env: NEXUS_USER: ${{ secrets.NEXUS_USER }} NEXUS_PW: ${{ secrets.NEXUS_PW }} diff --git a/.github/workflows/recurring-jmh-benchmarks.yml b/.github/workflows/recurring-jmh-benchmarks.yml index 
4ca900746289..da2e7d60325c 100644 --- a/.github/workflows/recurring-jmh-benchmarks.yml +++ b/.github/workflows/recurring-jmh-benchmarks.yml @@ -58,7 +58,7 @@ jobs: with: distribution: zulu java-version: 17 - - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 # zizmor: ignore[cache-poisoning] -- cache writes are restricted to the default branch by setup-gradle + - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 - run: echo -e "$(ip addr show eth0 | grep "inet\b" | awk '{print $2}' | cut -d/ -f1)\t$(hostname -f) $(hostname -s)" | sudo tee -a /etc/hosts - name: Run Benchmark diff --git a/.github/workflows/spark-ci.yml b/.github/workflows/spark-ci.yml index fd55efba97c5..e9d77308f580 100644 --- a/.github/workflows/spark-ci.yml +++ b/.github/workflows/spark-ci.yml @@ -98,7 +98,7 @@ jobs: with: distribution: zulu java-version: ${{ matrix.jvm }} - - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 # zizmor: ignore[cache-poisoning] -- cache writes are restricted to the default branch by setup-gradle + - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 - uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1 with: tool-cache: false diff --git a/.github/workflows/zizmor.yml b/.github/workflows/zizmor.yml index 313835fcbe16..02d49c5473ab 100644 --- a/.github/workflows/zizmor.yml +++ b/.github/workflows/zizmor.yml @@ -42,3 +42,5 @@ jobs: uses: zizmorcore/zizmor-action@71321a20a9ded102f6e9ce5718a2fcec2c4f70d8 # v0.5.2 with: advanced-security: false + min-severity: medium + min-confidence: medium From 53fde56a87d49517a9688fe73600eca10def1283 Mon Sep 17 00:00:00 2001 From: Shohei Okumiya Date: Sat, 18 Apr 2026 02:02:45 +0900 Subject: [PATCH 065/197] Docs: Add Apache Hive 4.2 to website (#15998) --- docs/docs/hive.md | 4 ++-- site/docs/hive-quickstart.md | 11 +++++------ 2 files changed, 7 insertions(+), 8 deletions(-) diff --git 
a/docs/docs/hive.md b/docs/docs/hive.md index 0531e9b04a15..4829acfe208b 100644 --- a/docs/docs/hive.md +++ b/docs/docs/hive.md @@ -71,9 +71,9 @@ Starting from 1.8.0 Iceberg doesn't release Hive runtime connector. For Hive que with Hive 2.x and 3.x) use Hive runtime connector coming with Iceberg 1.6.1, or use Hive 4.0.0 or later which is released with embedded Iceberg integration. -### Hive 4.1.x +### Hive 4.1.x, 4.2.x -Hive 4.1.x comes with Iceberg 1.9.1 included. +Hive 4.1.x and 4.2.x come with Iceberg 1.9.1 included. ### Hive 4.0.x diff --git a/site/docs/hive-quickstart.md b/site/docs/hive-quickstart.md index 988664d9007a..fbf9dec53689 100644 --- a/site/docs/hive-quickstart.md +++ b/site/docs/hive-quickstart.md @@ -36,13 +36,12 @@ Take a look at the Tags tab in [Apache Hive docker images](https://hub.docker.co Set the version variable. ```sh -export HIVE_VERSION=4.0.0 +export HIVE_VERSION=4.2.0 ``` -To accommodate both Intel-based (x86_64) and Apple Silicon (M1, M2, M3) Macs when running your Docker container, you can use the --platform flag to specify the desired architecture. Apple Silicon Macs use the arm64 architecture, while Intel Macs use the amd64 architecture. -Start the container, using the option `--platform linux/arm64` for a Mac with an M-Series chip: +Start the HiveServer2 container: ```sh -docker run -d --platform linux/arm64 -p 10000:10000 -p 10002:10002 --env SERVICE_NAME=hiveserver2 --name hive4 apache/hive:${HIVE_VERSION} +docker run -d -p 10000:10000 -p 10002:10002 --env SERVICE_NAME=hiveserver2 --name hive4 apache/hive:${HIVE_VERSION} ``` The docker run command above configures Hive to use the embedded derby database for Hive Metastore. Hive Metastore functions as the Iceberg catalog to locate Iceberg files, which can be anywhere. @@ -106,8 +105,8 @@ SELECT * FROM nyc.taxis; #### Adding Iceberg to Hive -If you already have a Hive 4.0.0 or later environment, it comes with the Iceberg 1.4.3 included. No additional downloads or jars are needed. 
If you have a Hive 2.3.x or Hive 3.1.x environment see [Enabling Iceberg support in Hive](docs/latest/hive.md#hive-23x-hive-31x). +If you already have a Hive 4.0.0 or later environment, it comes with the Iceberg included. No additional downloads or jars are needed. If you have a Hive 2.3.x or Hive 3.1.x environment see [Enabling Iceberg support in Hive](docs/latest/hive.md#hive-23x-hive-31x). #### Learn More -To learn more about setting up a database other than Derby, see [Apache Hive Quick Start](https://hive.apache.org/developement/quickstart/). You can also [set up a standalone metastore, HS2 and Postgres](https://github.com/apache/hive/blob/master/packaging/src/docker/docker-compose.yml). Now that you're up and running with Iceberg and Hive, check out the [Iceberg-Hive docs](docs/latest/hive.md) to learn more! +To learn more about setting up a database other than Derby, see [Apache Hive Quick Start](https://hive.apache.org/developement/quickstart/). You can also [set up a standalone metastore, HS2 and Postgres](https://github.com/apache/hive/blob/master/packaging/src/docker/docker-compose.yml) or [use Hive Metastore as Iceberg REST Catalog](https://hive.apache.org/docs/latest/admin/iceberg-rest-catalog/). Now that you're up and running with Iceberg and Hive, check out the [Iceberg-Hive docs](docs/latest/hive.md) to learn more! 
From 8f1f483985bfbef3c19472b9e2ec474593fee219 Mon Sep 17 00:00:00 2001 From: Sachin Ranjalkar <52783123+sachinnn99@users.noreply.github.com> Date: Fri, 17 Apr 2026 23:28:06 +0530 Subject: [PATCH 066/197] Flink: Set generator parallelism to match input in DynamicIcebergSink (#15849) --- .../sink/dynamic/DynamicIcebergSink.java | 1 + .../sink/dynamic/TestDynamicIcebergSink.java | 27 +++++++++++++++++++ .../sink/dynamic/DynamicIcebergSink.java | 1 + .../sink/dynamic/TestDynamicIcebergSink.java | 27 +++++++++++++++++++ .../sink/dynamic/DynamicIcebergSink.java | 1 + .../sink/dynamic/TestDynamicIcebergSink.java | 27 +++++++++++++++++++ 6 files changed, 84 insertions(+) diff --git a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java index 218fa2d911c8..63f5e6191193 100644 --- a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java +++ b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java @@ -408,6 +408,7 @@ public DataStreamSink append() { tableCreator, caseSensitive, dropUnusedColumns)) + .setParallelism(input.getParallelism()) .uid(prefixIfNotNull(uidPrefix, "-generator")) .name(operatorName("generator")) .returns(type); diff --git a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSink.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSink.java index 27b1e3d84a8c..aa92ae8ceb34 100644 --- a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSink.java +++ b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSink.java @@ -1196,6 +1196,33 @@ void testOperatorUidsFormat() { "Sink Committer: --sink"); } + @Test + void testGeneratorDefaultParallelism() { + StreamExecutionEnvironment streamEnv = 
StreamExecutionEnvironment.getExecutionEnvironment(); + streamEnv.setParallelism(4); + + DataStreamSource source = + streamEnv.fromData(Collections.emptySet(), TypeInformation.of(new TypeHint<>() {})); + source.setParallelism(8); + + DynamicIcebergSink.forInput(source) + .generator(new Generator()) + .catalogLoader(CATALOG_EXTENSION.catalogLoader()) + .uidPrefix("test") + .append(); + + // Since the generator parallelism is not directly accessible via the returned DataStreamSink, + // inspect the stream graph to verify the generator inherits the input source parallelism. + int generatorParallelism = + streamEnv.getStreamGraph().getStreamNodes().stream() + .filter(node -> "test--generator".equals(node.getTransformationUID())) + .findFirst() + .map(StreamNode::getParallelism) + .orElseThrow(() -> new AssertionError("Generator node not found")); + + assertThat(generatorParallelism).isEqualTo(source.getParallelism()); + } + private Set createSinkAndReturnUIds(String uidPrefix) { StreamExecutionEnvironment streamEnv = StreamExecutionEnvironment.getExecutionEnvironment(); diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java index 4b5c9bef41e1..a00eba492a19 100644 --- a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java @@ -407,6 +407,7 @@ public DataStreamSink append() { tableCreator, caseSensitive, dropUnusedColumns)) + .setParallelism(input.getParallelism()) .uid(prefixIfNotNull(uidPrefix, "-generator")) .name(operatorName("generator")) .returns(type); diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSink.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSink.java index 
27b1e3d84a8c..aa92ae8ceb34 100644 --- a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSink.java +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSink.java @@ -1196,6 +1196,33 @@ void testOperatorUidsFormat() { "Sink Committer: --sink"); } + @Test + void testGeneratorDefaultParallelism() { + StreamExecutionEnvironment streamEnv = StreamExecutionEnvironment.getExecutionEnvironment(); + streamEnv.setParallelism(4); + + DataStreamSource source = + streamEnv.fromData(Collections.emptySet(), TypeInformation.of(new TypeHint<>() {})); + source.setParallelism(8); + + DynamicIcebergSink.forInput(source) + .generator(new Generator()) + .catalogLoader(CATALOG_EXTENSION.catalogLoader()) + .uidPrefix("test") + .append(); + + // Since the generator parallelism is not directly accessible via the returned DataStreamSink, + // inspect the stream graph to verify the generator inherits the input source parallelism. + int generatorParallelism = + streamEnv.getStreamGraph().getStreamNodes().stream() + .filter(node -> "test--generator".equals(node.getTransformationUID())) + .findFirst() + .map(StreamNode::getParallelism) + .orElseThrow(() -> new AssertionError("Generator node not found")); + + assertThat(generatorParallelism).isEqualTo(source.getParallelism()); + } + private Set createSinkAndReturnUIds(String uidPrefix) { StreamExecutionEnvironment streamEnv = StreamExecutionEnvironment.getExecutionEnvironment(); diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java index 6f5fb945a165..9c8a4d156ff9 100644 --- a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java @@ -502,6 +502,7 @@ public DataStreamSink append() { 
tableCreator, caseSensitive, dropUnusedColumns)) + .setParallelism(input.getParallelism()) .uid(prefixIfNotNull(uidPrefix, "-generator")) .name(operatorName("generator")) .returns(type); diff --git a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSink.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSink.java index 2a46f8021cca..4e7511501014 100644 --- a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSink.java +++ b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSink.java @@ -1342,6 +1342,33 @@ void testOperatorUidsFormat() { "Sink Committer: --sink"); } + @Test + void testGeneratorDefaultParallelism() { + StreamExecutionEnvironment streamEnv = StreamExecutionEnvironment.getExecutionEnvironment(); + streamEnv.setParallelism(4); + + DataStreamSource source = + streamEnv.fromData(Collections.emptySet(), TypeInformation.of(new TypeHint<>() {})); + source.setParallelism(8); + + DynamicIcebergSink.forInput(source) + .generator(new Generator()) + .catalogLoader(CATALOG_EXTENSION.catalogLoader()) + .uidPrefix("test") + .append(); + + // Since the generator parallelism is not directly accessible via the returned DataStreamSink, + // inspect the stream graph to verify the generator inherits the input source parallelism. 
+ int generatorParallelism = + streamEnv.getStreamGraph().getStreamNodes().stream() + .filter(node -> "test--generator".equals(node.getTransformationUID())) + .findFirst() + .map(StreamNode::getParallelism) + .orElseThrow(() -> new AssertionError("Generator node not found")); + + assertThat(generatorParallelism).isEqualTo(source.getParallelism()); + } + private Set createSinkAndReturnUIds(String uidPrefix) { StreamExecutionEnvironment streamEnv = StreamExecutionEnvironment.getExecutionEnvironment(); From 69d7bfacebc044ad9b36e21f5cee2bd12d2da204 Mon Sep 17 00:00:00 2001 From: Tanmay Rauth Date: Sat, 18 Apr 2026 12:44:46 -0700 Subject: [PATCH 067/197] Docs: Sync Go implementation status with iceberg-go (#16021) * Docs: Sync Go implementation status with iceberg-go Update the Go column in status.md to reflect the current state of the iceberg-go library based on source code verification. * Docs: Address review comments for Go status updates Update additional Go feature flags based on reviewer feedback from zeroshade and laskoviymishka with source code references: - Update schema (V1+V2): transaction.go:177 - Update partition spec (V1+V2): transaction.go:160 - Replace sort order (V1+V2): metadata.go:532 - Update table location (V1+V2): updates.go:376 - Expire snapshots (V1+V2): transaction.go:212 - Manage snapshots (V1+V2): metadata.go:753 - Rewrite files (V1+V2): rewrite_data_files.go:83 - Row delta (V2): row_delta.go:63 - Write equality deletes (V2): equality_delete_writer.go:78 --- site/docs/status.md | 84 ++++++++++++++++++++++----------------------- 1 file changed, 42 insertions(+), 42 deletions(-) diff --git a/site/docs/status.md b/site/docs/status.md index 4ca603e6f5e1..a7d2cfb38567 100644 --- a/site/docs/status.md +++ b/site/docs/status.md @@ -83,29 +83,29 @@ This section lists the libraries that implement the Apache Iceberg specification | Operation | Java | PyIceberg | Rust | Go | C++ | |-----------------------------|------|-----------|------|----|-----| -| 
Update schema | Y | Y | Y | N | Y | -| Update partition spec | Y | Y | Y | N | Y | +| Update schema | Y | Y | Y | Y | Y | +| Update partition spec | Y | Y | Y | Y | Y | | Update table properties | Y | Y | Y | Y | Y | -| Replace sort order | Y | N | Y | N | Y | -| Update table location | Y | Y | Y | N | Y | -| Update statistics | Y | Y | Y | N | Y | +| Replace sort order | Y | N | Y | Y | Y | +| Update table location | Y | Y | Y | Y | Y | +| Update statistics | Y | Y | Y | Y | Y | | Update partition statistics | Y | N | N | N | N | -| Expire snapshots | Y | N | N | N | N | -| Manage snapshots | Y | N | N | N | N | +| Expire snapshots | Y | N | N | Y | N | +| Manage snapshots | Y | N | N | Y | N | ### Table Spec V2 | Operation | Java | PyIceberg | Rust | Go | C++ | |-----------------------------|------|-----------|------|----|-----| -| Update schema | Y | Y | N | N | Y | -| Update partition spec | Y | Y | N | N | Y | +| Update schema | Y | Y | N | Y | Y | +| Update partition spec | Y | Y | N | Y | Y | | Update table properties | Y | Y | Y | Y | Y | -| Replace sort order | Y | N | Y | N | Y | -| Update table location | Y | Y | Y | N | Y | -| Update statistics | Y | Y | Y | N | Y | +| Replace sort order | Y | N | Y | Y | Y | +| Update table location | Y | Y | Y | Y | Y | +| Update statistics | Y | Y | Y | Y | Y | | Update partition statistics | Y | N | N | N | N | -| Expire snapshots | Y | N | N | N | N | -| Manage snapshots | Y | N | N | N | N | +| Expire snapshots | Y | N | N | Y | N | +| Manage snapshots | Y | N | N | Y | N | ## Table Update Operations @@ -114,21 +114,21 @@ This section lists the libraries that implement the Apache Iceberg specification | Operation | Java | PyIceberg | Rust | Go | C++ | |-------------------|------|-----------|------|----|-----| | Append data files | Y | Y | Y | Y | Y | -| Rewrite files | Y | Y | N | N | N | +| Rewrite files | Y | Y | N | Y | N | | Rewrite manifests | Y | Y | N | Y | N | -| Overwrite files | Y | Y | N | N | N | -| 
Delete files | Y | Y | N | N | N | +| Overwrite files | Y | Y | N | Y | N | +| Delete files | Y | Y | N | Y | N | ### Table Spec V2 | Operation | Java | PyIceberg | Rust | Go | C++ | |-------------------|------|-----------|------|----|-----| | Append data files | Y | Y | Y | Y | Y | -| Rewrite files | Y | Y | N | N | N | +| Rewrite files | Y | Y | N | Y | N | | Rewrite manifests | Y | Y | N | Y | N | -| Overwrite files | Y | Y | N | N | N | -| Row delta | Y | N | N | N | N | -| Delete files | Y | Y | N | N | N | +| Overwrite files | Y | Y | N | Y | N | +| Row delta | Y | N | N | Y | N | +| Delete files | Y | Y | N | Y | N | ## Table Read Operations @@ -146,11 +146,11 @@ This section lists the libraries that implement the Apache Iceberg specification |-----------------------------|------|-----------|------|----|-----| | Plan with data file | Y | Y | Y | Y | Y | | Plan with position deletes | Y | Y | Y | Y | Y | -| Plan with equality deletes | Y | Y | Y | N | Y | +| Plan with equality deletes | Y | Y | Y | Y | Y | | Plan with puffin statistics | Y | N | N | N | N | | Read data file | Y | Y | Y | Y | Y | | Read with position deletes | Y | Y | Y | Y | N | -| Read with equality deletes | Y | N | Y | N | N | +| Read with equality deletes | Y | N | Y | Y | N | ## Table Write Operations @@ -165,8 +165,8 @@ This section lists the libraries that implement the Apache Iceberg specification | Operation | Java | PyIceberg | Rust | Go | C++ | |------------------------|------|-----------|------|----|-----| | Append data | Y | Y | Y | Y | N | -| Write position deletes | Y | N | N | N | N | -| Write equality deletes | Y | N | Y | N | N | +| Write position deletes | Y | N | N | Y | N | +| Write equality deletes | Y | N | Y | Y | N | ## Catalogs @@ -200,10 +200,10 @@ This section lists the libraries that implement the Apache Iceberg specification | View Operation | Java | PyIceberg | Rust | Go | C++ | |----------------|------|-----------|------|----|-----| -| createView | Y | N | N | 
N | N | -| dropView | Y | Y | N | N | N | -| listView | Y | Y | N | N | N | -| viewExists | Y | Y | N | N | N | +| createView | Y | N | N | Y | N | +| dropView | Y | Y | N | Y | N | +| listView | Y | Y | N | Y | N | +| viewExists | Y | Y | N | Y | N | | replaceView | Y | N | N | N | N | | renameView | Y | N | N | N | N | @@ -256,10 +256,10 @@ The sql catalog is a catalog backed by a sql database, which is called jdbc cata | View Operation | Java | PyIceberg | Rust | Go | C++ | |----------------|------|-----------|------|----|-----| -| createView | Y | N | N | N | N | -| dropView | Y | N | N | N | N | -| listView | Y | N | N | N | N | -| viewExists | Y | N | N | N | N | +| createView | Y | N | N | Y | N | +| dropView | Y | N | N | Y | N | +| listView | Y | N | N | Y | N | +| viewExists | Y | N | N | Y | N | | replaceView | Y | N | N | N | N | | renameView | Y | N | N | N | N | @@ -352,10 +352,10 @@ The sql catalog is a catalog backed by a sql database, which is called jdbc cata | View Operation | Java | PyIceberg | Rust | Go | C++ | |----------------|------|-----------|------|----|-----| -| createView | Y | N | N | N | N | -| dropView | Y | N | N | N | N | -| listView | Y | N | N | N | N | -| viewExists | Y | N | N | N | N | +| createView | Y | N | N | Y | N | +| dropView | Y | N | N | Y | N | +| listView | Y | N | N | Y | N | +| viewExists | Y | N | N | Y | N | | replaceView | Y | N | N | N | N | | renameView | Y | N | N | N | N | @@ -363,9 +363,9 @@ The sql catalog is a catalog backed by a sql database, which is called jdbc cata | Namespace Operation | Java | PyIceberg | Rust | Go | C++ | |---------------------------|------|-----------|------|----|-----| -| listNamespaces | Y | Y | Y | N | N | -| createNamespace | Y | Y | Y | N | N | -| dropNamespace | Y | Y | Y | N | N | -| namespaceExists | Y | N | Y | N | N | +| listNamespaces | Y | Y | Y | Y | N | +| createNamespace | Y | Y | Y | Y | N | +| dropNamespace | Y | Y | Y | Y | N | +| namespaceExists | Y | N | Y | Y 
| N | | updateNamespaceProperties | Y | Y | Y | Y | N | -| loadNamespaceMetadata | Y | Y | Y | N | N | +| loadNamespaceMetadata | Y | Y | Y | Y | N | From 54cdbcddee30610ce4e4c9b7f685d3b90a90bfea Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 18 Apr 2026 22:30:49 -0700 Subject: [PATCH 068/197] Build: Bump mkdocs-rss-plugin from 1.17.9 to 1.18.1 (#16036) Bumps [mkdocs-rss-plugin](https://github.com/guts/mkdocs-rss-plugin) from 1.17.9 to 1.18.1. - [Release notes](https://github.com/guts/mkdocs-rss-plugin/releases) - [Changelog](https://github.com/Guts/mkdocs-rss-plugin/blob/main/CHANGELOG.md) - [Commits](https://github.com/guts/mkdocs-rss-plugin/compare/1.17.9...1.18.1) --- updated-dependencies: - dependency-name: mkdocs-rss-plugin dependency-version: 1.18.1 dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- site/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/site/requirements.txt b/site/requirements.txt index 130842d75f92..359d2904619e 100644 --- a/site/requirements.txt +++ b/site/requirements.txt @@ -21,5 +21,5 @@ mkdocs-material==9.7.5 mkdocs-material-extensions==1.3.1 mkdocs-monorepo-plugin @ git+https://github.com/bitsondatadev/mkdocs-monorepo-plugin@url-fix mkdocs-redirects==1.2.3 -mkdocs-rss-plugin==1.17.9 +mkdocs-rss-plugin==1.18.1 pymarkdownlnt==0.9.36 From f66305aecb789e7583f26ccbdb922ba1c8175f1c Mon Sep 17 00:00:00 2001 From: drexler-sky Date: Sat, 18 Apr 2026 22:58:23 -0700 Subject: [PATCH 069/197] Flink 2.1: Fix forward-writer chaining regression in DynamicIcebergSink (#16026) --- .../apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java | 1 + 1 file changed, 1 insertion(+) diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java 
b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java index 9c8a4d156ff9..7b0de6fbe9e3 100644 --- a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java @@ -443,6 +443,7 @@ private DynamicIcebergSink build( operatorName("Forward-Writer"), writeResultTypeInfo, new SinkWriterOperatorFactory<>(forwardWriterSink)) + .setParallelism(converted.getParallelism()) .uid(prefixIfNotNull(uidPrefix, "-forward-writer")); // Inject forward write results into sink — they'll be unioned in addPreCommitTopology From 3111ba588b7e5b1bb2ee3e3aa7855e12e42420eb Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 19 Apr 2026 07:15:37 -0700 Subject: [PATCH 070/197] Build: Bump com.azure:azure-sdk-bom from 1.3.5 to 1.3.6 (#16037) Bumps [com.azure:azure-sdk-bom](https://github.com/azure/azure-sdk-for-java) from 1.3.5 to 1.3.6. - [Release notes](https://github.com/azure/azure-sdk-for-java/releases) - [Commits](https://github.com/azure/azure-sdk-for-java/compare/azure-identity_1.3.5...azure-identity_1.3.6) --- updated-dependencies: - dependency-name: com.azure:azure-sdk-bom dependency-version: 1.3.6 dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- gradle/libs.versions.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index 776b6c88f597..340897d8cf36 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -34,7 +34,7 @@ avro = "1.12.1" assertj-core = "3.27.7" awaitility = "4.3.0" awssdk-bom = "2.42.28" -azuresdk-bom = "1.3.5" +azuresdk-bom = "1.3.6" awssdk-s3accessgrants = "2.4.1" bson-ver = "4.11.5" caffeine = "2.9.3" From c66bd68e86d52ae6df219e7c6b9162890fe64990 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 19 Apr 2026 07:15:58 -0700 Subject: [PATCH 071/197] Build: Bump at.yawk.lz4:lz4-java from 1.10.4 to 1.11.0 (#16038) Bumps [at.yawk.lz4:lz4-java](https://github.com/yawkat/lz4-java) from 1.10.4 to 1.11.0. - [Release notes](https://github.com/yawkat/lz4-java/releases) - [Changelog](https://github.com/yawkat/lz4-java/blob/main/CHANGES.md) - [Commits](https://github.com/yawkat/lz4-java/compare/v1.10.4...v1.11.0) --- updated-dependencies: - dependency-name: at.yawk.lz4:lz4-java dependency-version: 1.11.0 dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- gradle/libs.versions.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index 340897d8cf36..0ebadac16c73 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -71,7 +71,7 @@ junit-platform = "1.14.3" junit-pioneer = "2.3.0" kafka = "3.9.2" kryo-shaded = "4.0.3" -lz4Java = "1.10.4" +lz4Java = "1.11.0" microprofile-openapi-api = "3.1.2" mockito = "4.11.0" mockserver = "5.15.0" From 9205427c5d02a58917eadafd7823e0d722049417 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 19 Apr 2026 07:16:15 -0700 Subject: [PATCH 072/197] Build: Bump com.google.errorprone:error_prone_annotations (#16039) Bumps [com.google.errorprone:error_prone_annotations](https://github.com/google/error-prone) from 2.48.0 to 2.49.0. - [Release notes](https://github.com/google/error-prone/releases) - [Commits](https://github.com/google/error-prone/compare/v2.48.0...v2.49.0) --- updated-dependencies: - dependency-name: com.google.errorprone:error_prone_annotations dependency-version: 2.49.0 dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- gradle/libs.versions.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index 0ebadac16c73..bda1999067a7 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -44,7 +44,7 @@ delta-standalone = "3.3.2" delta-spark = "3.3.2" derby = "10.15.2.0" esotericsoftware-kryo = "4.0.3" -errorprone-annotations = "2.48.0" +errorprone-annotations = "2.49.0" failsafe = "3.3.2" findbugs-jsr305 = "3.0.2" flink120 = { strictly = "1.20.1"} From 55f892319457e569c36cec2e72321a2f89745250 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 19 Apr 2026 07:17:11 -0700 Subject: [PATCH 073/197] Build: Bump docker/build-push-action from 7.0.0 to 7.1.0 (#16041) Bumps [docker/build-push-action](https://github.com/docker/build-push-action) from 7.0.0 to 7.1.0. - [Release notes](https://github.com/docker/build-push-action/releases) - [Commits](https://github.com/docker/build-push-action/compare/d08e5c354a6adb9ed34480a06d141179aa583294...bcafcacb16a39f128d818304e6c9c0c18556b85f) --- updated-dependencies: - dependency-name: docker/build-push-action dependency-version: 7.1.0 dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/publish-iceberg-rest-fixture-docker.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/publish-iceberg-rest-fixture-docker.yml b/.github/workflows/publish-iceberg-rest-fixture-docker.yml index 6976e11ac44f..fabc62399c08 100644 --- a/.github/workflows/publish-iceberg-rest-fixture-docker.yml +++ b/.github/workflows/publish-iceberg-rest-fixture-docker.yml @@ -69,7 +69,7 @@ jobs: - name: Set up Docker Buildx uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # v4.0.0 - name: Build and Push - uses: docker/build-push-action@d08e5c354a6adb9ed34480a06d141179aa583294 # v7.0.0 + uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # v7.1.0 with: context: ./ file: ./docker/iceberg-rest-fixture/Dockerfile From bfccee96be68fa0182b04e6f93251b5cd5a9ea69 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 19 Apr 2026 07:17:28 -0700 Subject: [PATCH 074/197] Build: Bump org.roaringbitmap:RoaringBitmap from 1.6.13 to 1.6.14 (#16042) Bumps [org.roaringbitmap:RoaringBitmap](https://github.com/RoaringBitmap/RoaringBitmap) from 1.6.13 to 1.6.14. - [Release notes](https://github.com/RoaringBitmap/RoaringBitmap/releases) - [Commits](https://github.com/RoaringBitmap/RoaringBitmap/commits) --- updated-dependencies: - dependency-name: org.roaringbitmap:RoaringBitmap dependency-version: 1.6.14 dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- gradle/libs.versions.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index bda1999067a7..43fab457ffc8 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -80,7 +80,7 @@ netty-buffer = "4.2.12.Final" object-client-bundle = "3.3.2" orc = "1.9.8" parquet = "1.17.0" -roaringbitmap = "1.6.13" +roaringbitmap = "1.6.14" scala-collection-compat = "2.14.0" slf4j = "2.0.17" snowflake-jdbc = "3.28.0" From 49839545d0027f17970868c5044f8005abce1267 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 19 Apr 2026 10:21:37 -0700 Subject: [PATCH 075/197] Build: Bump software.amazon.awssdk:bom from 2.42.28 to 2.42.33 (#16040) Bumps software.amazon.awssdk:bom from 2.42.28 to 2.42.33. --- updated-dependencies: - dependency-name: software.amazon.awssdk:bom dependency-version: 2.42.33 dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- gradle/libs.versions.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index 43fab457ffc8..18bc02e3c023 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -33,7 +33,7 @@ arrow = "15.0.2" avro = "1.12.1" assertj-core = "3.27.7" awaitility = "4.3.0" -awssdk-bom = "2.42.28" +awssdk-bom = "2.42.33" azuresdk-bom = "1.3.6" awssdk-s3accessgrants = "2.4.1" bson-ver = "4.11.5" From f984c28b215c56846c632ccc5a368f4f4afe0b5d Mon Sep 17 00:00:00 2001 From: Han You Date: Mon, 20 Apr 2026 09:07:47 -0500 Subject: [PATCH 076/197] Flink: Backport add passthroughRecords option to DynamicIcebergSink (#16019) Backports #15433 and #16026 Co-authored-by: Han You --- .../sink/dynamic/DynamicIcebergSink.java | 134 ++++++++++++-- .../flink/sink/dynamic/DynamicRecord.java | 26 ++- .../sink/dynamic/DynamicRecordProcessor.java | 58 ++++-- .../sink/dynamic/TestDynamicIcebergSink.java | 153 +++++++++++++++- .../sink/dynamic/DynamicIcebergSink.java | 133 ++++++++++++-- .../flink/sink/dynamic/DynamicRecord.java | 26 ++- .../sink/dynamic/DynamicRecordProcessor.java | 58 ++++-- .../sink/dynamic/TestDynamicIcebergSink.java | 165 +++++++++++++++++- 8 files changed, 663 insertions(+), 90 deletions(-) diff --git a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java index 63f5e6191193..ee56e39577e1 100644 --- a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java +++ b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java @@ -44,6 +44,7 @@ import org.apache.flink.streaming.api.datastream.DataStream; import org.apache.flink.streaming.api.datastream.DataStreamSink; import 
org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator; +import org.apache.flink.streaming.runtime.operators.sink.SinkWriterOperatorFactory; import org.apache.flink.table.data.RowData; import org.apache.flink.util.OutputTag; import org.apache.iceberg.Table; @@ -79,13 +80,17 @@ public class DynamicIcebergSink private final Configuration flinkConfig; private final int cacheMaximumSize; + // Set by the builder before sinkTo() — forward writer results to union into pre-commit topology + private final transient DataStream> forwardWriteResults; + DynamicIcebergSink( CatalogLoader catalogLoader, Map snapshotProperties, String uidPrefix, Map writeProperties, Configuration flinkConfig, - int cacheMaximumSize) { + int cacheMaximumSize, + DataStream> forwardWriteResults) { this.catalogLoader = catalogLoader; this.snapshotProperties = snapshotProperties; this.uidPrefix = uidPrefix; @@ -96,6 +101,7 @@ public class DynamicIcebergSink // This is used to separate files generated by different sinks writing the same table. // Also used to generate the aggregator operator name this.sinkId = UUID.randomUUID().toString(); + this.forwardWriteResults = forwardWriteResults; } @SuppressWarnings("deprecation") @@ -145,7 +151,11 @@ public DataStream> addPreCommitTopology( TypeInformation> typeInformation = CommittableMessageTypeInfo.of(this::getCommittableSerializer); - return writeResults + // Union forward writer results with the shuffle writer results + DataStream> allResults = + writeResults.union(forwardWriteResults); + + return allResults .keyBy( committable -> { if (committable instanceof CommittableSummary) { @@ -168,6 +178,56 @@ public SimpleVersionedSerializer getWriteResultSerializer() return new DynamicWriteResultSerializer(); } + /** + * A lightweight Sink used with {@link SinkWriterOperatorFactory} for the forward write path. + * Implements {@link SupportsCommitter} so that {@code SinkWriterOperator} emits committables + * downstream. 
The committer is never called — committing is handled by the main sink. + */ + @VisibleForTesting + static class ForwardWriterSink + implements Sink, SupportsCommitter { + + private final CatalogLoader catalogLoader; + private final Map writeProperties; + private final Configuration flinkConfig; + private final int cacheMaximumSize; + + ForwardWriterSink( + CatalogLoader catalogLoader, + Map writeProperties, + Configuration flinkConfig, + int cacheMaximumSize) { + this.catalogLoader = catalogLoader; + this.writeProperties = writeProperties; + this.flinkConfig = flinkConfig; + this.cacheMaximumSize = cacheMaximumSize; + } + + @SuppressWarnings("deprecation") + @Override + public SinkWriter createWriter(InitContext context) throws IOException { + return new DynamicWriter( + catalogLoader.loadCatalog(), + writeProperties, + flinkConfig, + cacheMaximumSize, + new DynamicWriterMetrics(context.metricGroup()), + context.getSubtaskId(), + context.getAttemptNumber()); + } + + @Override + public Committer createCommitter(CommitterInitContext context) { + throw new UnsupportedOperationException( + "WriterSink is used only for writing; committing is handled by the main sink"); + } + + @Override + public SimpleVersionedSerializer getCommittableSerializer() { + return new DynamicWriteResultSerializer(); + } + } + public static class Builder { private DataStream input; private DynamicRecordGenerator generator; @@ -358,43 +418,80 @@ private String operatorName(String suffix) { return uidPrefix != null ? 
uidPrefix + "-" + suffix : suffix; } - private DynamicIcebergSink build() { + private DynamicIcebergSink build( + SingleOutputStreamOperator converted, + DynamicRecordInternalType sideOutputType) { Preconditions.checkArgument( generator != null, "Please use withGenerator() to convert the input DataStream."); Preconditions.checkNotNull(catalogLoader, "Catalog loader shouldn't be null"); - uidPrefix = Optional.ofNullable(uidPrefix).orElse(""); - Configuration flinkConfig = readableConfig instanceof Configuration ? (Configuration) readableConfig : Configuration.fromMap(readableConfig.toMap()); - return instantiateSink(writeOptions, flinkConfig); + // Forward writer: chained with generator via forward edge, no data shuffle + ForwardWriterSink forwardWriterSink = + new ForwardWriterSink(catalogLoader, writeOptions, flinkConfig, cacheMaximumSize); + TypeInformation> writeResultTypeInfo = + CommittableMessageTypeInfo.of(DynamicWriteResultSerializer::new); + + DataStream> forwardWriteResults = + converted + .getSideOutput( + new OutputTag<>(DynamicRecordProcessor.DYNAMIC_FORWARD_STREAM, sideOutputType)) + .transform( + operatorName("Forward-Writer"), + writeResultTypeInfo, + new SinkWriterOperatorFactory<>(forwardWriterSink)) + .setParallelism(converted.getParallelism()) + .uid(prefixIfNotNull(uidPrefix, "-forward-writer")); + + // Inject forward write results into sink — they'll be unioned in addPreCommitTopology + return instantiateSink(writeOptions, flinkConfig, forwardWriteResults); } @VisibleForTesting DynamicIcebergSink instantiateSink( - Map writeProperties, Configuration flinkWriteConf) { + Map writeProperties, + Configuration flinkWriteConf, + DataStream> forwardWriteResults) { return new DynamicIcebergSink( catalogLoader, snapshotSummary, uidPrefix, writeProperties, flinkWriteConf, - cacheMaximumSize); + cacheMaximumSize, + forwardWriteResults); } /** * Append the iceberg sink operators to write records to iceberg table. * + *

<p>The topology splits records by distribution mode: + * + * <ul> + *
<li>Forward records ({@code null} distributionMode) go through a forward edge to a chained + * writer, avoiding any data shuffle. + *
<li>Shuffle records (non-null distributionMode) go through the standard Sink2 pipeline with + * hash/round-robin distribution. + * </ul>
+ * + * Both writers feed into a single shared pre-commit aggregator and committer, ensuring atomic + * commits across both paths. + * * @return {@link DataStreamSink} for sink. */ public DataStreamSink append() { + uidPrefix = Optional.ofNullable(uidPrefix).orElse(""); + DynamicRecordInternalType type = new DynamicRecordInternalType(catalogLoader, false, cacheMaximumSize); - DynamicIcebergSink sink = build(); + DynamicRecordInternalType sideOutputType = + new DynamicRecordInternalType(catalogLoader, true, cacheMaximumSize); + SingleOutputStreamOperator converted = input .process( @@ -413,12 +510,14 @@ public DataStreamSink append() { .name(operatorName("generator")) .returns(type); - DataStreamSink rowDataDataStreamSink = + DynamicIcebergSink sink = build(converted, sideOutputType); + + // Shuffle path: table update side output + main output → sinkTo() + DataStream shuffleInput = converted .getSideOutput( new OutputTag<>( - DynamicRecordProcessor.DYNAMIC_TABLE_UPDATE_STREAM, - new DynamicRecordInternalType(catalogLoader, true, cacheMaximumSize))) + DynamicRecordProcessor.DYNAMIC_TABLE_UPDATE_STREAM, sideOutputType)) .keyBy((KeySelector) DynamicRecordInternal::tableName) .map( new DynamicTableUpdateOperator( @@ -432,16 +531,19 @@ public DataStreamSink append() { .uid(prefixIfNotNull(uidPrefix, "-updater")) .name(operatorName("Updater")) .returns(type) - .union(converted) - .sinkTo(sink) + .union(converted); + + DataStreamSink result = + shuffleInput + .sinkTo(sink) // Forward write results are implicitly injected here .uid(prefixIfNotNull(uidPrefix, "-sink")); FlinkWriteConf flinkWriteConf = new FlinkWriteConf(writeOptions, readableConfig); if (flinkWriteConf.writeParallelism() != null) { - rowDataDataStreamSink.setParallelism(flinkWriteConf.writeParallelism()); + result.setParallelism(flinkWriteConf.writeParallelism()); } - return rowDataDataStreamSink; + return result; } } diff --git 
a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecord.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecord.java index 9f445766083e..15b83a589382 100644 --- a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecord.java +++ b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecord.java @@ -34,20 +34,40 @@ public class DynamicRecord { private Schema schema; private RowData rowData; private PartitionSpec partitionSpec; - private DistributionMode distributionMode; + @Nullable private DistributionMode distributionMode; private int writeParallelism; private boolean upsertMode; @Nullable private Set equalityFields; /** - * Constructs a new DynamicRecord. + * Constructs a new DynamicRecord with forward (no shuffle) writes. * * @param tableIdentifier The target table identifier. * @param branch The target table branch. * @param schema The target table schema. * @param rowData The data matching the provided schema. * @param partitionSpec The target table {@link PartitionSpec}. - * @param distributionMode The {@link DistributionMode}. + */ + public DynamicRecord( + TableIdentifier tableIdentifier, + String branch, + Schema schema, + RowData rowData, + PartitionSpec partitionSpec) { + this(tableIdentifier, branch, schema, rowData, partitionSpec, null, -1); + } + + /** + * Constructs a new DynamicRecord. This record will be shuffled as specified by {@code + * distributionMode}. + * + * @param tableIdentifier The target table identifier. + * @param branch The target table branch. + * @param schema The target table schema. + * @param rowData The data matching the provided schema. + * @param partitionSpec The target table {@link PartitionSpec}. + * @param distributionMode The {@link DistributionMode}. {@code null} indicates forward (no + * shuffle) writes. * @param writeParallelism The number of parallel writers. 
Can be set to any value {@literal > 0}, * but will always be automatically capped by the maximum write parallelism, which is the * parallelism of the sink. Set to Integer.MAX_VALUE for always using the maximum available diff --git a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordProcessor.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordProcessor.java index 07dfad2780f7..fc6892b2cd9e 100644 --- a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordProcessor.java +++ b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordProcessor.java @@ -37,6 +37,8 @@ class DynamicRecordProcessor extends ProcessFunction generator; private final CatalogLoader catalogLoader; private final boolean immediateUpdate; @@ -51,6 +53,7 @@ class DynamicRecordProcessor extends ProcessFunction updateStream; + private transient OutputTag forwardStream; private transient Collector collector; private transient Context context; @@ -90,9 +93,14 @@ public void open(OpenContext openContext) throws Exception { this.hashKeyGenerator = new HashKeyGenerator( cacheMaximumSize, getRuntimeContext().getTaskInfo().getMaxNumberOfParallelSubtasks()); - if (immediateUpdate) { - updater = new TableUpdater(tableCache, catalog, caseSensitive, dropUnusedColumns); - } else { + // Always create updater — needed for forced immediate updates on forward records + this.updater = new TableUpdater(tableCache, catalog, caseSensitive, dropUnusedColumns); + // Always create forward stream tag for forward (distributionMode == null) records + this.forwardStream = + new OutputTag<>( + DYNAMIC_FORWARD_STREAM, + new DynamicRecordInternalType(catalogLoader, true, cacheMaximumSize)) {}; + if (!immediateUpdate) { updateStream = new OutputTag<>( DYNAMIC_TABLE_UPDATE_STREAM, @@ -112,6 +120,8 @@ public void processElement(T element, Context ctx, Collector newData = updater.update( 
data.tableIdentifier(), data.branch(), data.schema(), data.spec(), tableCreator); emit( - collector, data, newData.f0.resolvedTableSchema(), newData.f0.recordConverter(), - newData.f1); + newData.f1, + isForward); } else { + // Shuffled records with immediateUpdate=false go to the update side output int writerKey = hashKeyGenerator.generateKey( data, @@ -159,33 +174,38 @@ public void collect(DynamicRecord data) { } } else { emit( - collector, data, foundSchema.resolvedTableSchema(), foundSchema.recordConverter(), - foundSpec); + foundSpec, + isForward); } } private void emit( - Collector out, DynamicRecord data, Schema schema, DataConverter recordConverter, - PartitionSpec spec) { + PartitionSpec spec, + boolean forward) { RowData rowData = (RowData) recordConverter.convert(data.rowData()); - int writerKey = hashKeyGenerator.generateKey(data, schema, spec, rowData); - String tableName = data.tableIdentifier().toString(); - out.collect( + // writerKey is unused in the forward path. + int writerKey = forward ? 
-1 : hashKeyGenerator.generateKey(data, schema, spec, rowData); + DynamicRecordInternal record = new DynamicRecordInternal( - tableName, + data.tableIdentifier().toString(), data.branch(), schema, rowData, spec, writerKey, data.upsertMode(), - DynamicSinkUtil.getEqualityFieldIds(data.equalityFields(), schema))); + DynamicSinkUtil.getEqualityFieldIds(data.equalityFields(), schema)); + if (forward) { + context.output(forwardStream, record); + } else { + collector.collect(record); + } } @Override diff --git a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSink.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSink.java index aa92ae8ceb34..ecdbc3128525 100644 --- a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSink.java +++ b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSink.java @@ -44,6 +44,8 @@ import org.apache.flink.configuration.RestartStrategyOptions; import org.apache.flink.metrics.groups.UnregisteredMetricsGroup; import org.apache.flink.runtime.client.JobExecutionException; +import org.apache.flink.runtime.jobgraph.JobVertex; +import org.apache.flink.streaming.api.connector.sink2.CommittableMessage; import org.apache.flink.streaming.api.datastream.DataStream; import org.apache.flink.streaming.api.datastream.DataStreamSource; import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; @@ -213,6 +215,56 @@ public void generate(DynamicIcebergDataImpl row, Collector out) { } } + /** Generator that always emits forward (null distributionMode) records. 
*/ + private static class ForwardGenerator implements DynamicRecordGenerator { + @Override + public void generate(DynamicIcebergDataImpl row, Collector out) { + TableIdentifier tableIdentifier = TableIdentifier.of(DATABASE, row.tableName); + Schema schema = row.schemaProvided; + PartitionSpec spec = row.partitionSpec; + DynamicRecord dynamicRecord = + new DynamicRecord( + tableIdentifier, + row.branch, + schema, + converter(schema).toInternal(row.rowProvided), + spec); + dynamicRecord.setUpsertMode(row.upsertMode); + dynamicRecord.setEqualityFields(row.equalityFields); + out.collect(dynamicRecord); + } + } + + /** + * Generator that alternates between forward (null distributionMode) and shuffle records. Even + * indices go forward, odd indices go through shuffle. + */ + private static class MixedGenerator implements DynamicRecordGenerator { + private int count = 0; + + @Override + public void generate(DynamicIcebergDataImpl row, Collector out) { + TableIdentifier tableIdentifier = TableIdentifier.of(DATABASE, row.tableName); + Schema schema = row.schemaProvided; + PartitionSpec spec = row.partitionSpec; + boolean forward = (count++ % 2 == 0); + DistributionMode mode = + forward ? null : (spec.isPartitioned() ? 
DistributionMode.HASH : DistributionMode.NONE); + DynamicRecord dynamicRecord = + new DynamicRecord( + tableIdentifier, + row.branch, + schema, + converter(schema).toInternal(row.rowProvided), + spec, + mode, + 10); + dynamicRecord.setUpsertMode(row.upsertMode); + dynamicRecord.setEqualityFields(row.equalityFields); + out.collect(dynamicRecord); + } + } + private static DataFormatConverters.RowConverter converter(Schema schema) { RowType rowType = FlinkSchemaUtil.convert(schema); ResolvedSchema resolvedSchema = FlinkSchemaUtil.toResolvedSchema(rowType); @@ -238,6 +290,85 @@ void testWrite() throws Exception { runTest(rows); } + @Test + void testNoShuffleTopology() throws Exception { + DataStream dataStream = + env.fromData( + Collections.emptyList(), TypeInformation.of(new TypeHint() {})); + DynamicIcebergSink.forInput(dataStream) + .generator(new ForwardGenerator()) + .catalogLoader(CATALOG_EXTENSION.catalogLoader()) + .writeParallelism(2) + .immediateTableUpdate(false) + .overwrite(false) + .append(); + + boolean generatorAndSinkChained = false; + for (JobVertex vertex : env.getStreamGraph().getJobGraph().getVertices()) { + String vertexName = vertex.getName(); + boolean generatorInThisVertex = vertexName.contains("-generator"); + boolean sinkInThisVertex = vertexName.contains("-Forward-Writer"); + + generatorAndSinkChained = generatorInThisVertex && sinkInThisVertex; + if (generatorAndSinkChained) { + break; + } + } + + assertThat(generatorAndSinkChained).isTrue(); + } + + @Test + void testForwardWrite() throws Exception { + runForwardWriteTest(new ForwardGenerator()); + } + + @Test + void testMixedForwardAndShuffleWrite() throws Exception { + runForwardWriteTest(new MixedGenerator()); + } + + private void runForwardWriteTest(DynamicRecordGenerator generator) + throws Exception { + List rows = + Lists.newArrayList( + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, + "t1", + SnapshotRef.MAIN_BRANCH, + PartitionSpec.unpartitioned()), + new 
DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, + "t1", + SnapshotRef.MAIN_BRANCH, + PartitionSpec.unpartitioned()), + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, + "t1", + SnapshotRef.MAIN_BRANCH, + PartitionSpec.unpartitioned()), + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, + "t1", + SnapshotRef.MAIN_BRANCH, + PartitionSpec.unpartitioned())); + + DataStream dataStream = + env.fromData(rows, TypeInformation.of(new TypeHint<>() {})); + env.setParallelism(1); + + DynamicIcebergSink.forInput(dataStream) + .generator(generator) + .catalogLoader(CATALOG_EXTENSION.catalogLoader()) + .writeParallelism(1) + .immediateTableUpdate(true) + .append(); + + env.execute(); + + verifyResults(rows); + } + @Test void testWritePartitioned() throws Exception { PartitionSpec spec = PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).bucket("id", 10).build(); @@ -1170,8 +1301,9 @@ void testOperatorUidsFormat() { // pre commit topology was off, but since it is stateless, users will still be able to restore // state, but we must keep the stateful operators UUIds like the committer consistent. 
assertThat(sinkUids) - .contains( + .containsOnly( "test--sink", + "test--forward-writer", "test--generator", "test--updater", "test--sink: test--pre-commit-topology", @@ -1179,8 +1311,9 @@ void testOperatorUidsFormat() { sinkUids = createSinkAndReturnUIds(""); assertThat(sinkUids) - .contains( + .containsOnly( "--sink", + "--forward-writer", "--generator", "--updater", "--sink: --pre-commit-topology", @@ -1188,8 +1321,9 @@ void testOperatorUidsFormat() { sinkUids = createSinkAndReturnUIds(null); assertThat(sinkUids) - .contains( + .containsOnly( "--sink", + "--forward-writer", "--generator", "--updater", "--sink: --pre-commit-topology", @@ -1386,7 +1520,9 @@ static class CommitHookEnabledDynamicIcebergSink extends DynamicIcebergSink.B @Override DynamicIcebergSink instantiateSink( - Map writeProperties, Configuration flinkConfig) { + Map writeProperties, + Configuration flinkConfig, + DataStream> forwardWriteResults) { return new CommitHookDynamicIcebergSink( commitHook, CATALOG_EXTENSION.catalogLoader(), @@ -1394,7 +1530,8 @@ DynamicIcebergSink instantiateSink( "uidPrefix", writeProperties, flinkConfig, - 100); + 100, + forwardWriteResults); } } @@ -1410,14 +1547,16 @@ static class CommitHookDynamicIcebergSink extends DynamicIcebergSink { String uidPrefix, Map writeProperties, Configuration flinkConfig, - int cacheMaximumSize) { + int cacheMaximumSize, + DataStream> forwardWritten) { super( catalogLoader, snapshotProperties, uidPrefix, writeProperties, flinkConfig, - cacheMaximumSize); + cacheMaximumSize, + forwardWritten); this.commitHook = commitHook; this.overwriteMode = new FlinkWriteConf(writeProperties, flinkConfig).overwriteMode(); } diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java index a00eba492a19..7b0de6fbe9e3 100644 --- 
a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java @@ -44,6 +44,7 @@ import org.apache.flink.streaming.api.datastream.DataStream; import org.apache.flink.streaming.api.datastream.DataStreamSink; import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator; +import org.apache.flink.streaming.runtime.operators.sink.SinkWriterOperatorFactory; import org.apache.flink.table.data.RowData; import org.apache.flink.util.OutputTag; import org.apache.iceberg.Table; @@ -79,13 +80,17 @@ public class DynamicIcebergSink private final Configuration flinkConfig; private final int cacheMaximumSize; + // Set by the builder before sinkTo() — forward writer results to union into pre-commit topology + private final transient DataStream> forwardWriteResults; + DynamicIcebergSink( CatalogLoader catalogLoader, Map snapshotProperties, String uidPrefix, Map writeProperties, Configuration flinkConfig, - int cacheMaximumSize) { + int cacheMaximumSize, + DataStream> forwardWriteResults) { this.catalogLoader = catalogLoader; this.snapshotProperties = snapshotProperties; this.uidPrefix = uidPrefix; @@ -96,6 +101,7 @@ public class DynamicIcebergSink // This is used to separate files generated by different sinks writing the same table. 
// Also used to generate the aggregator operator name this.sinkId = UUID.randomUUID().toString(); + this.forwardWriteResults = forwardWriteResults; } @Override @@ -144,7 +150,11 @@ public DataStream> addPreCommitTopology( TypeInformation> typeInformation = CommittableMessageTypeInfo.of(this::getCommittableSerializer); - return writeResults + // Union forward writer results with the shuffle writer results + DataStream> allResults = + writeResults.union(forwardWriteResults); + + return allResults .keyBy( committable -> { if (committable instanceof CommittableSummary) { @@ -167,6 +177,55 @@ public SimpleVersionedSerializer getWriteResultSerializer() return new DynamicWriteResultSerializer(); } + /** + * A lightweight Sink used with {@link SinkWriterOperatorFactory} for the forward write path. + * Implements {@link SupportsCommitter} so that {@code SinkWriterOperator} emits committables + * downstream. The committer is never called — committing is handled by the main sink. + */ + @VisibleForTesting + static class ForwardWriterSink + implements Sink, SupportsCommitter { + + private final CatalogLoader catalogLoader; + private final Map writeProperties; + private final Configuration flinkConfig; + private final int cacheMaximumSize; + + ForwardWriterSink( + CatalogLoader catalogLoader, + Map writeProperties, + Configuration flinkConfig, + int cacheMaximumSize) { + this.catalogLoader = catalogLoader; + this.writeProperties = writeProperties; + this.flinkConfig = flinkConfig; + this.cacheMaximumSize = cacheMaximumSize; + } + + @Override + public SinkWriter createWriter(WriterInitContext context) { + return new DynamicWriter( + catalogLoader.loadCatalog(), + writeProperties, + flinkConfig, + cacheMaximumSize, + new DynamicWriterMetrics(context.metricGroup()), + context.getTaskInfo().getIndexOfThisSubtask(), + context.getTaskInfo().getAttemptNumber()); + } + + @Override + public Committer createCommitter(CommitterInitContext context) { + throw new 
UnsupportedOperationException( + "WriterSink is used only for writing; committing is handled by the main sink"); + } + + @Override + public SimpleVersionedSerializer getCommittableSerializer() { + return new DynamicWriteResultSerializer(); + } + } + public static class Builder { private DataStream input; private DynamicRecordGenerator generator; @@ -357,43 +416,80 @@ private String operatorName(String suffix) { return uidPrefix != null ? uidPrefix + "-" + suffix : suffix; } - private DynamicIcebergSink build() { + private DynamicIcebergSink build( + SingleOutputStreamOperator converted, + DynamicRecordInternalType sideOutputType) { Preconditions.checkArgument( generator != null, "Please use withGenerator() to convert the input DataStream."); Preconditions.checkNotNull(catalogLoader, "Catalog loader shouldn't be null"); - uidPrefix = Optional.ofNullable(uidPrefix).orElse(""); - Configuration flinkConfig = readableConfig instanceof Configuration ? (Configuration) readableConfig : Configuration.fromMap(readableConfig.toMap()); - return instantiateSink(writeOptions, flinkConfig); + // Forward writer: chained with generator via forward edge, no data shuffle + ForwardWriterSink forwardWriterSink = + new ForwardWriterSink(catalogLoader, writeOptions, flinkConfig, cacheMaximumSize); + TypeInformation> writeResultTypeInfo = + CommittableMessageTypeInfo.of(DynamicWriteResultSerializer::new); + + DataStream> forwardWriteResults = + converted + .getSideOutput( + new OutputTag<>(DynamicRecordProcessor.DYNAMIC_FORWARD_STREAM, sideOutputType)) + .transform( + operatorName("Forward-Writer"), + writeResultTypeInfo, + new SinkWriterOperatorFactory<>(forwardWriterSink)) + .setParallelism(converted.getParallelism()) + .uid(prefixIfNotNull(uidPrefix, "-forward-writer")); + + // Inject forward write results into sink — they'll be unioned in addPreCommitTopology + return instantiateSink(writeOptions, flinkConfig, forwardWriteResults); } @VisibleForTesting DynamicIcebergSink 
instantiateSink( - Map writeProperties, Configuration flinkWriteConf) { + Map writeProperties, + Configuration flinkWriteConf, + DataStream> forwardWriteResults) { return new DynamicIcebergSink( catalogLoader, snapshotSummary, uidPrefix, writeProperties, flinkWriteConf, - cacheMaximumSize); + cacheMaximumSize, + forwardWriteResults); } /** * Append the iceberg sink operators to write records to iceberg table. * + *

<p>The topology splits records by distribution mode: + * + * <ul> + *
<li>Forward records ({@code null} distributionMode) go through a forward edge to a chained + * writer, avoiding any data shuffle. + *
<li>Shuffle records (non-null distributionMode) go through the standard Sink2 pipeline with + * hash/round-robin distribution. + * </ul>
+ * + * Both writers feed into a single shared pre-commit aggregator and committer, ensuring atomic + * commits across both paths. + * * @return {@link DataStreamSink} for sink. */ public DataStreamSink append() { + uidPrefix = Optional.ofNullable(uidPrefix).orElse(""); + DynamicRecordInternalType type = new DynamicRecordInternalType(catalogLoader, false, cacheMaximumSize); - DynamicIcebergSink sink = build(); + DynamicRecordInternalType sideOutputType = + new DynamicRecordInternalType(catalogLoader, true, cacheMaximumSize); + SingleOutputStreamOperator converted = input .process( @@ -412,12 +508,14 @@ public DataStreamSink append() { .name(operatorName("generator")) .returns(type); - DataStreamSink rowDataDataStreamSink = + DynamicIcebergSink sink = build(converted, sideOutputType); + + // Shuffle path: table update side output + main output → sinkTo() + DataStream shuffleInput = converted .getSideOutput( new OutputTag<>( - DynamicRecordProcessor.DYNAMIC_TABLE_UPDATE_STREAM, - new DynamicRecordInternalType(catalogLoader, true, cacheMaximumSize))) + DynamicRecordProcessor.DYNAMIC_TABLE_UPDATE_STREAM, sideOutputType)) .keyBy((KeySelector) DynamicRecordInternal::tableName) .map( new DynamicTableUpdateOperator( @@ -431,16 +529,19 @@ public DataStreamSink append() { .uid(prefixIfNotNull(uidPrefix, "-updater")) .name(operatorName("Updater")) .returns(type) - .union(converted) - .sinkTo(sink) + .union(converted); + + DataStreamSink result = + shuffleInput + .sinkTo(sink) // Forward write results are implicitly injected here .uid(prefixIfNotNull(uidPrefix, "-sink")); FlinkWriteConf flinkWriteConf = new FlinkWriteConf(writeOptions, readableConfig); if (flinkWriteConf.writeParallelism() != null) { - rowDataDataStreamSink.setParallelism(flinkWriteConf.writeParallelism()); + result.setParallelism(flinkWriteConf.writeParallelism()); } - return rowDataDataStreamSink; + return result; } } diff --git 
a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecord.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecord.java index 9f445766083e..15b83a589382 100644 --- a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecord.java +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecord.java @@ -34,20 +34,40 @@ public class DynamicRecord { private Schema schema; private RowData rowData; private PartitionSpec partitionSpec; - private DistributionMode distributionMode; + @Nullable private DistributionMode distributionMode; private int writeParallelism; private boolean upsertMode; @Nullable private Set equalityFields; /** - * Constructs a new DynamicRecord. + * Constructs a new DynamicRecord with forward (no shuffle) writes. * * @param tableIdentifier The target table identifier. * @param branch The target table branch. * @param schema The target table schema. * @param rowData The data matching the provided schema. * @param partitionSpec The target table {@link PartitionSpec}. - * @param distributionMode The {@link DistributionMode}. + */ + public DynamicRecord( + TableIdentifier tableIdentifier, + String branch, + Schema schema, + RowData rowData, + PartitionSpec partitionSpec) { + this(tableIdentifier, branch, schema, rowData, partitionSpec, null, -1); + } + + /** + * Constructs a new DynamicRecord. This record will be shuffled as specified by {@code + * distributionMode}. + * + * @param tableIdentifier The target table identifier. + * @param branch The target table branch. + * @param schema The target table schema. + * @param rowData The data matching the provided schema. + * @param partitionSpec The target table {@link PartitionSpec}. + * @param distributionMode The {@link DistributionMode}. {@code null} indicates forward (no + * shuffle) writes. * @param writeParallelism The number of parallel writers. 
Can be set to any value {@literal > 0}, * but will always be automatically capped by the maximum write parallelism, which is the * parallelism of the sink. Set to Integer.MAX_VALUE for always using the maximum available diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordProcessor.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordProcessor.java index 07dfad2780f7..fc6892b2cd9e 100644 --- a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordProcessor.java +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordProcessor.java @@ -37,6 +37,8 @@ class DynamicRecordProcessor extends ProcessFunction generator; private final CatalogLoader catalogLoader; private final boolean immediateUpdate; @@ -51,6 +53,7 @@ class DynamicRecordProcessor extends ProcessFunction updateStream; + private transient OutputTag forwardStream; private transient Collector collector; private transient Context context; @@ -90,9 +93,14 @@ public void open(OpenContext openContext) throws Exception { this.hashKeyGenerator = new HashKeyGenerator( cacheMaximumSize, getRuntimeContext().getTaskInfo().getMaxNumberOfParallelSubtasks()); - if (immediateUpdate) { - updater = new TableUpdater(tableCache, catalog, caseSensitive, dropUnusedColumns); - } else { + // Always create updater — needed for forced immediate updates on forward records + this.updater = new TableUpdater(tableCache, catalog, caseSensitive, dropUnusedColumns); + // Always create forward stream tag for forward (distributionMode == null) records + this.forwardStream = + new OutputTag<>( + DYNAMIC_FORWARD_STREAM, + new DynamicRecordInternalType(catalogLoader, true, cacheMaximumSize)) {}; + if (!immediateUpdate) { updateStream = new OutputTag<>( DYNAMIC_TABLE_UPDATE_STREAM, @@ -112,6 +120,8 @@ public void processElement(T element, Context ctx, Collector newData = updater.update( data.tableIdentifier(), 
data.branch(), data.schema(), data.spec(), tableCreator); emit( - collector, data, newData.f0.resolvedTableSchema(), newData.f0.recordConverter(), - newData.f1); + newData.f1, + isForward); } else { + // Shuffled records with immediateUpdate=false go to the update side output int writerKey = hashKeyGenerator.generateKey( data, @@ -159,33 +174,38 @@ public void collect(DynamicRecord data) { } } else { emit( - collector, data, foundSchema.resolvedTableSchema(), foundSchema.recordConverter(), - foundSpec); + foundSpec, + isForward); } } private void emit( - Collector out, DynamicRecord data, Schema schema, DataConverter recordConverter, - PartitionSpec spec) { + PartitionSpec spec, + boolean forward) { RowData rowData = (RowData) recordConverter.convert(data.rowData()); - int writerKey = hashKeyGenerator.generateKey(data, schema, spec, rowData); - String tableName = data.tableIdentifier().toString(); - out.collect( + // writerKey is unused in the forward path. + int writerKey = forward ? -1 : hashKeyGenerator.generateKey(data, schema, spec, rowData); + DynamicRecordInternal record = new DynamicRecordInternal( - tableName, + data.tableIdentifier().toString(), data.branch(), schema, rowData, spec, writerKey, data.upsertMode(), - DynamicSinkUtil.getEqualityFieldIds(data.equalityFields(), schema))); + DynamicSinkUtil.getEqualityFieldIds(data.equalityFields(), schema)); + if (forward) { + context.output(forwardStream, record); + } else { + collector.collect(record); + } } @Override diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSink.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSink.java index aa92ae8ceb34..4e7511501014 100644 --- a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSink.java +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSink.java @@ -43,7 +43,10 @@ import 
org.apache.flink.configuration.Configuration; import org.apache.flink.configuration.RestartStrategyOptions; import org.apache.flink.metrics.groups.UnregisteredMetricsGroup; +import org.apache.flink.runtime.OperatorIDPair; import org.apache.flink.runtime.client.JobExecutionException; +import org.apache.flink.runtime.jobgraph.JobVertex; +import org.apache.flink.streaming.api.connector.sink2.CommittableMessage; import org.apache.flink.streaming.api.datastream.DataStream; import org.apache.flink.streaming.api.datastream.DataStreamSource; import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; @@ -213,6 +216,56 @@ public void generate(DynamicIcebergDataImpl row, Collector out) { } } + /** Generator that always emits forward (null distributionMode) records. */ + private static class ForwardGenerator implements DynamicRecordGenerator { + @Override + public void generate(DynamicIcebergDataImpl row, Collector out) { + TableIdentifier tableIdentifier = TableIdentifier.of(DATABASE, row.tableName); + Schema schema = row.schemaProvided; + PartitionSpec spec = row.partitionSpec; + DynamicRecord dynamicRecord = + new DynamicRecord( + tableIdentifier, + row.branch, + schema, + converter(schema).toInternal(row.rowProvided), + spec); + dynamicRecord.setUpsertMode(row.upsertMode); + dynamicRecord.setEqualityFields(row.equalityFields); + out.collect(dynamicRecord); + } + } + + /** + * Generator that alternates between forward (null distributionMode) and shuffle records. Even + * indices go forward, odd indices go through shuffle. + */ + private static class MixedGenerator implements DynamicRecordGenerator { + private int count = 0; + + @Override + public void generate(DynamicIcebergDataImpl row, Collector out) { + TableIdentifier tableIdentifier = TableIdentifier.of(DATABASE, row.tableName); + Schema schema = row.schemaProvided; + PartitionSpec spec = row.partitionSpec; + boolean forward = (count++ % 2 == 0); + DistributionMode mode = + forward ? 
null : (spec.isPartitioned() ? DistributionMode.HASH : DistributionMode.NONE); + DynamicRecord dynamicRecord = + new DynamicRecord( + tableIdentifier, + row.branch, + schema, + converter(schema).toInternal(row.rowProvided), + spec, + mode, + 10); + dynamicRecord.setUpsertMode(row.upsertMode); + dynamicRecord.setEqualityFields(row.equalityFields); + out.collect(dynamicRecord); + } + } + private static DataFormatConverters.RowConverter converter(Schema schema) { RowType rowType = FlinkSchemaUtil.convert(schema); ResolvedSchema resolvedSchema = FlinkSchemaUtil.toResolvedSchema(rowType); @@ -238,6 +291,96 @@ void testWrite() throws Exception { runTest(rows); } + @Test + void testNoShuffleTopology() throws Exception { + DataStream dataStream = + env.fromData( + Collections.emptyList(), TypeInformation.of(new TypeHint() {})); + DynamicIcebergSink.forInput(dataStream) + .generator(new ForwardGenerator()) + .catalogLoader(CATALOG_EXTENSION.catalogLoader()) + .writeParallelism(2) + .immediateTableUpdate(false) + .overwrite(false) + .append(); + + boolean generatorAndSinkChained = false; + for (JobVertex vertex : env.getStreamGraph().getJobGraph().getVertices()) { + boolean generatorInThisVertex = false; + boolean sinkInThisVertex = false; + for (OperatorIDPair operatorID : vertex.getOperatorIDs()) { + String uid = operatorID.getUserDefinedOperatorUid(); + if (uid == null) { + continue; + } + + if (uid.endsWith("-forward-writer")) { + sinkInThisVertex = true; + } else if (uid.endsWith("-generator")) { + generatorInThisVertex = true; + } + } + + generatorAndSinkChained = generatorInThisVertex && sinkInThisVertex; + if (generatorAndSinkChained) { + break; + } + } + + assertThat(generatorAndSinkChained).isTrue(); + } + + @Test + void testForwardWrite() throws Exception { + runForwardWriteTest(new ForwardGenerator()); + } + + @Test + void testMixedForwardAndShuffleWrite() throws Exception { + runForwardWriteTest(new MixedGenerator()); + } + + private void 
runForwardWriteTest(DynamicRecordGenerator generator) + throws Exception { + List rows = + Lists.newArrayList( + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, + "t1", + SnapshotRef.MAIN_BRANCH, + PartitionSpec.unpartitioned()), + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, + "t1", + SnapshotRef.MAIN_BRANCH, + PartitionSpec.unpartitioned()), + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, + "t1", + SnapshotRef.MAIN_BRANCH, + PartitionSpec.unpartitioned()), + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, + "t1", + SnapshotRef.MAIN_BRANCH, + PartitionSpec.unpartitioned())); + + DataStream dataStream = + env.fromData(rows, TypeInformation.of(new TypeHint<>() {})); + env.setParallelism(1); + + DynamicIcebergSink.forInput(dataStream) + .generator(generator) + .catalogLoader(CATALOG_EXTENSION.catalogLoader()) + .writeParallelism(1) + .immediateTableUpdate(true) + .append(); + + env.execute(); + + verifyResults(rows); + } + @Test void testWritePartitioned() throws Exception { PartitionSpec spec = PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).bucket("id", 10).build(); @@ -1170,8 +1313,9 @@ void testOperatorUidsFormat() { // pre commit topology was off, but since it is stateless, users will still be able to restore // state, but we must keep the stateful operators UUIds like the committer consistent. 
assertThat(sinkUids) - .contains( + .containsOnly( "test--sink", + "test--forward-writer", "test--generator", "test--updater", "test--sink: test--pre-commit-topology", @@ -1179,8 +1323,9 @@ void testOperatorUidsFormat() { sinkUids = createSinkAndReturnUIds(""); assertThat(sinkUids) - .contains( + .containsOnly( "--sink", + "--forward-writer", "--generator", "--updater", "--sink: --pre-commit-topology", @@ -1188,8 +1333,9 @@ void testOperatorUidsFormat() { sinkUids = createSinkAndReturnUIds(null); assertThat(sinkUids) - .contains( + .containsOnly( "--sink", + "--forward-writer", "--generator", "--updater", "--sink: --pre-commit-topology", @@ -1386,7 +1532,9 @@ static class CommitHookEnabledDynamicIcebergSink extends DynamicIcebergSink.B @Override DynamicIcebergSink instantiateSink( - Map writeProperties, Configuration flinkConfig) { + Map writeProperties, + Configuration flinkConfig, + DataStream> forwardWriteResults) { return new CommitHookDynamicIcebergSink( commitHook, CATALOG_EXTENSION.catalogLoader(), @@ -1394,7 +1542,8 @@ DynamicIcebergSink instantiateSink( "uidPrefix", writeProperties, flinkConfig, - 100); + 100, + forwardWriteResults); } } @@ -1410,14 +1559,16 @@ static class CommitHookDynamicIcebergSink extends DynamicIcebergSink { String uidPrefix, Map writeProperties, Configuration flinkConfig, - int cacheMaximumSize) { + int cacheMaximumSize, + DataStream> forwardWritten) { super( catalogLoader, snapshotProperties, uidPrefix, writeProperties, flinkConfig, - cacheMaximumSize); + cacheMaximumSize, + forwardWritten); this.commitHook = commitHook; this.overwriteMode = new FlinkWriteConf(writeProperties, flinkConfig).overwriteMode(); } From a8b7ba6d5ffb47f3e6d4ba937e191c7648c0fe0f Mon Sep 17 00:00:00 2001 From: Alexandre Dutra Date: Mon, 20 Apr 2026 17:21:38 +0200 Subject: [PATCH 077/197] Core: Expose HostnameVerificationPolicy in TLSConfigurer (#15500) * Expose HostnameVerificationPolicy in TLSConfigurer Apache HttpClient 5.4 introduced a new component: 
`HostnameVerificationPolicy`, which determines whether hostname verification is done by the JSSE provider (at socket level, during TLS handshake), the HttpClient (after TLS handshake), or both. This change exposes `HostnameVerificationPolicy` in `TLSConfigurer`. This component is particularly useful when attempting to bypass hostname verification, e.g. by using the `NoopHostnameVerifier`. The default policy is set to `BOTH`, which produces the same result as before. * set default to CLIENT * declare all BC artifacts * add test * add comment * don't expose HostnameVerificationPolicy * Address review feedback: split try blocks --- build.gradle | 7 ++ .../org/apache/iceberg/rest/HTTPClient.java | 10 +- .../iceberg/rest/auth/TLSConfigurer.java | 12 +- .../apache/iceberg/rest/TestHTTPClient.java | 109 ++++++++++++++++++ gradle/libs.versions.toml | 4 + 5 files changed, 139 insertions(+), 3 deletions(-) diff --git a/build.gradle b/build.gradle index 0715c3f6cb9e..261dfabf0412 100644 --- a/build.gradle +++ b/build.gradle @@ -410,6 +410,13 @@ project(':iceberg-core') { exclude group: 'junit' } testImplementation libs.awaitility + + // Lock BouncyCastle versions to avoid version mismatches + // when these dependencies are added transitively. + // Required for TLS tests with MockServer. 
+ testImplementation libs.bouncycastle.bcpkix + testImplementation libs.bouncycastle.bcutil + testImplementation libs.bouncycastle.bcprov } } diff --git a/core/src/main/java/org/apache/iceberg/rest/HTTPClient.java b/core/src/main/java/org/apache/iceberg/rest/HTTPClient.java index 86eceba21c95..c359404ec6be 100644 --- a/core/src/main/java/org/apache/iceberg/rest/HTTPClient.java +++ b/core/src/main/java/org/apache/iceberg/rest/HTTPClient.java @@ -29,6 +29,7 @@ import java.util.concurrent.ConcurrentMap; import java.util.concurrent.TimeUnit; import java.util.function.Consumer; +import javax.net.ssl.HostnameVerifier; import org.apache.hc.client5.http.auth.AuthScope; import org.apache.hc.client5.http.auth.CredentialsProvider; import org.apache.hc.client5.http.auth.UsernamePasswordCredentials; @@ -43,6 +44,7 @@ import org.apache.hc.client5.http.io.HttpClientConnectionManager; import org.apache.hc.client5.http.protocol.HttpClientContext; import org.apache.hc.client5.http.ssl.DefaultClientTlsStrategy; +import org.apache.hc.client5.http.ssl.HostnameVerificationPolicy; import org.apache.hc.core5.http.ContentType; import org.apache.hc.core5.http.Header; import org.apache.hc.core5.http.HttpHeaders; @@ -410,13 +412,19 @@ static HttpClientConnectionManager configureConnectionManager(MapIf a non-null verifier is returned, only the custom verifier is executed and the JSSE + * built-in hostname verifier won't be executed. 
+ */ + @Nullable default HostnameVerifier hostnameVerifier() { - return HttpsSupport.getDefaultHostnameVerifier(); + return null; } default String[] supportedProtocols() { diff --git a/core/src/test/java/org/apache/iceberg/rest/TestHTTPClient.java b/core/src/test/java/org/apache/iceberg/rest/TestHTTPClient.java index 8cf97bca32ef..701ae699f136 100644 --- a/core/src/test/java/org/apache/iceberg/rest/TestHTTPClient.java +++ b/core/src/test/java/org/apache/iceberg/rest/TestHTTPClient.java @@ -35,22 +35,30 @@ import com.fasterxml.jackson.databind.ObjectMapper; import java.io.IOException; import java.net.SocketTimeoutException; +import java.nio.file.Path; +import java.security.KeyStore; +import java.security.cert.CertificateException; import java.util.Locale; import java.util.Map; import java.util.Objects; import java.util.concurrent.TimeUnit; import java.util.function.Consumer; +import javax.net.ssl.HostnameVerifier; +import javax.net.ssl.SSLContext; +import javax.net.ssl.TrustManagerFactory; import org.apache.hc.client5.http.auth.AuthScope; import org.apache.hc.client5.http.auth.UsernamePasswordCredentials; import org.apache.hc.client5.http.config.ConnectionConfig; import org.apache.hc.client5.http.impl.auth.BasicCredentialsProvider; import org.apache.hc.client5.http.impl.io.PoolingHttpClientConnectionManager; import org.apache.hc.client5.http.io.HttpClientConnectionManager; +import org.apache.hc.client5.http.ssl.NoopHostnameVerifier; import org.apache.hc.core5.http.HttpHost; import org.apache.hc.core5.http.HttpStatus; import org.apache.iceberg.IcebergBuild; import org.apache.iceberg.catalog.TableIdentifier; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; import org.apache.iceberg.rest.auth.AuthSession; import org.apache.iceberg.rest.auth.TLSConfigurer; import org.apache.iceberg.rest.responses.ErrorResponse; @@ -58,14 +66,17 @@ import org.junit.jupiter.api.AfterAll; import 
org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.EnumSource; import org.junit.jupiter.params.provider.ValueSource; import org.mockserver.configuration.Configuration; import org.mockserver.integration.ClientAndServer; +import org.mockserver.logging.MockServerLogger; import org.mockserver.matchers.Times; import org.mockserver.model.HttpRequest; import org.mockserver.model.HttpResponse; +import org.mockserver.socket.tls.KeyStoreFactory; import org.mockserver.verify.VerificationTimes; /** @@ -87,6 +98,7 @@ public class TestHTTPClient { private static RESTClient restClient; public static class DefaultTLSConfigurer implements TLSConfigurer { + public static int count = 0; public DefaultTLSConfigurer() { @@ -95,6 +107,7 @@ public DefaultTLSConfigurer() { } public static class TLSConfigurerMissingNoArgCtor implements TLSConfigurer { + TLSConfigurerMissingNoArgCtor(String str) {} } @@ -395,6 +408,101 @@ public void testLoadTLSConfigurerNotImplementTLSConfigurer() { .hasMessageContaining("does not implement TLSConfigurer"); } + /** A TLSConfigurer that relies on the default (built-in) JSSE verifier. */ + public static class BuiltInHostnameVerifierTLSConfigurer implements TLSConfigurer { + + @Override + public SSLContext sslContext() { + return mockServerSSLContext(); + } + } + + /** A TLSConfigurer that overrides hostnameVerifier() to return a custom verifier. 
*/ + public static class CustomHostnameVerifierTLSConfigurer implements TLSConfigurer { + + @Override + public SSLContext sslContext() { + return mockServerSSLContext(); + } + + @Override + public HostnameVerifier hostnameVerifier() { + return NoopHostnameVerifier.INSTANCE; + } + } + + private static SSLContext mockServerSSLContext() { + try { + KeyStore keyStore = + new KeyStoreFactory(Configuration.configuration(), new MockServerLogger()) + .loadOrCreateKeyStore(); + TrustManagerFactory tmf = + TrustManagerFactory.getInstance(TrustManagerFactory.getDefaultAlgorithm()); + tmf.init(keyStore); + SSLContext sslContext = SSLContext.getInstance("TLSv1.2"); + sslContext.init(null, tmf.getTrustManagers(), null); + return sslContext; + } catch (Exception e) { + throw new RuntimeException("Failed to create SSLContext", e); + } + } + + @Test + public void testTLSConfigurerHostnameVerifier(@TempDir Path temp) throws IOException { + + // Start a dedicated MockServer with a certificate that does NOT include + // 127.0.0.1 or localhost in its SANs. 
+ Configuration tlsConfig = Configuration.configuration(); + tlsConfig.proactivelyInitialiseTLS(true); + tlsConfig.preventCertificateDynamicUpdate(true); + tlsConfig.sslCertificateDomainName("example.com"); + tlsConfig.sslSubjectAlternativeNameIps(Sets.newHashSet("1.2.3.4")); + tlsConfig.sslSubjectAlternativeNameDomains(Sets.newHashSet("example.com")); + tlsConfig.directoryToSaveDynamicSSLCertificate(temp.toFile().getAbsolutePath()); + + int tlsPort = PORT + 1; + try (ClientAndServer server = startClientAndServer(tlsConfig, tlsPort)) { + + String path = "tls/hostname-verifier/path"; + HttpRequest mockRequest = + request() + .withPath("/" + path) + .withMethod(HttpMethod.HEAD.name().toUpperCase(Locale.ROOT)); + HttpResponse mockResponse = response().withStatusCode(200).withBody("TLS response"); + server.when(mockRequest).respond(mockResponse); + + // With no custom hostnameVerifier (null), the BUILTIN policy is used automatically, + // so the JSSE built-in verifier rejects the connection because the SANs don't match + try (HTTPClient builtInVerifierClient = + HTTPClient.builder( + Map.of( + HTTPClient.REST_TLS_CONFIGURER, + BuiltInHostnameVerifierTLSConfigurer.class.getName())) + .uri(String.format("https://127.0.0.1:%d", tlsPort)) + .withAuthSession(AuthSession.EMPTY) + .build()) { + assertThatThrownBy(() -> builtInVerifierClient.head(path, Map.of(), (unused) -> {})) + .rootCause() + .isInstanceOf(CertificateException.class) + .hasMessage("No subject alternative names matching IP address 127.0.0.1 found"); + } + + // With a custom hostnameVerifier (NoopHostnameVerifier), the CLIENT policy is used + // automatically, so hostname verification is bypassed and the request succeeds + try (HTTPClient customVerifierClient = + HTTPClient.builder( + Map.of( + HTTPClient.REST_TLS_CONFIGURER, + CustomHostnameVerifierTLSConfigurer.class.getName())) + .uri(String.format("https://127.0.0.1:%d", tlsPort)) + .withAuthSession(AuthSession.EMPTY) + .build()) { + assertThatCode(() -> 
customVerifierClient.head(path, Map.of(), (unused) -> {})) + .doesNotThrowAnyException(); + } + } + } + @Test public void testSocketTimeout() throws IOException { long socketTimeoutMs = 2000L; @@ -613,6 +721,7 @@ private static Item doExecuteRequest( } public static class Item implements RESTRequest, RESTResponse { + private Long id; private String data; diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index 18bc02e3c023..87ceb9012bd6 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -36,6 +36,7 @@ awaitility = "4.3.0" awssdk-bom = "2.42.33" azuresdk-bom = "1.3.6" awssdk-s3accessgrants = "2.4.1" +bouncycastle = "1.82" bson-ver = "4.11.5" caffeine = "2.9.3" calcite = "1.41.0" @@ -110,6 +111,9 @@ awssdk-bom = { module = "software.amazon.awssdk:bom", version.ref = "awssdk-bom" awssdk-s3accessgrants = { module = "software.amazon.s3.accessgrants:aws-s3-accessgrants-java-plugin", version.ref = "awssdk-s3accessgrants" } azuresdk-bom = { module = "com.azure:azure-sdk-bom", version.ref = "azuresdk-bom" } bson = { module = "org.mongodb:bson", version.ref = "bson-ver"} +bouncycastle-bcpkix = { module = "org.bouncycastle:bcpkix-jdk18on", version.ref = "bouncycastle" } +bouncycastle-bcprov = { module = "org.bouncycastle:bcprov-jdk18on", version.ref = "bouncycastle" } +bouncycastle-bcutil = { module = "org.bouncycastle:bcutil-jdk18on", version.ref = "bouncycastle" } caffeine = { module = "com.github.ben-manes.caffeine:caffeine", version.ref = "caffeine" } calcite-core = { module = "org.apache.calcite:calcite-core", version.ref = "calcite" } calcite-druid = { module = "org.apache.calcite:calcite-druid", version.ref = "calcite" } From bbd121220a79baa18f305a3666530db32065f614 Mon Sep 17 00:00:00 2001 From: Yuming Wang Date: Tue, 21 Apr 2026 09:46:04 +0800 Subject: [PATCH 078/197] Add .factorypath to .gitignore (#16067) --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 1f5a8efe37f5..98ccfc356d8b 
100644 --- a/.gitignore +++ b/.gitignore @@ -58,6 +58,7 @@ coverage.xml # vscode/eclipse files .classpath +.factorypath .project .settings bin/ From d4db77be070a780cb54b84a6024d6e4ba095bc1e Mon Sep 17 00:00:00 2001 From: drexler-sky Date: Tue, 21 Apr 2026 03:38:48 -0700 Subject: [PATCH 079/197] Spark: Replace deprecated registerTempTable with createOrReplaceTempView (#16063) --- .../org/apache/iceberg/spark/actions/TestCreateActions.java | 2 +- .../org/apache/iceberg/spark/actions/TestCreateActions.java | 2 +- .../org/apache/iceberg/spark/actions/TestCreateActions.java | 2 +- .../org/apache/iceberg/spark/actions/TestCreateActions.java | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestCreateActions.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestCreateActions.java index dd751499df30..9b0fecdaae41 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestCreateActions.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestCreateActions.java @@ -667,7 +667,7 @@ public void schemaEvolutionTestWithSparkSQL() throws Exception { "CAST(id AS FLOAT) col1", "CAST(id AS STRING) col2", "CAST(id AS INT) col3") - .registerTempTable("tempdata"); + .createOrReplaceTempView("tempdata"); sql("INSERT INTO TABLE %s SELECT * FROM tempdata", tblName); List expectedBeforeAddColumn = sql("SELECT * FROM %s ORDER BY col0", tblName); List expectedAfterAddColumn = diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestCreateActions.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestCreateActions.java index eb89b0a23274..50afb53e0539 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestCreateActions.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestCreateActions.java @@ -662,7 +662,7 @@ public void 
schemaEvolutionTestWithSparkSQL() throws Exception { "CAST(id AS FLOAT) col1", "CAST(id AS STRING) col2", "CAST(id AS INT) col3") - .registerTempTable("tempdata"); + .createOrReplaceTempView("tempdata"); sql("INSERT INTO TABLE %s SELECT * FROM tempdata", tblName); List expectedBeforeAddColumn = sql("SELECT * FROM %s ORDER BY col0", tblName); List expectedAfterAddColumn = diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/actions/TestCreateActions.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/actions/TestCreateActions.java index eb89b0a23274..50afb53e0539 100644 --- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/actions/TestCreateActions.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/actions/TestCreateActions.java @@ -662,7 +662,7 @@ public void schemaEvolutionTestWithSparkSQL() throws Exception { "CAST(id AS FLOAT) col1", "CAST(id AS STRING) col2", "CAST(id AS INT) col3") - .registerTempTable("tempdata"); + .createOrReplaceTempView("tempdata"); sql("INSERT INTO TABLE %s SELECT * FROM tempdata", tblName); List expectedBeforeAddColumn = sql("SELECT * FROM %s ORDER BY col0", tblName); List expectedAfterAddColumn = diff --git a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/actions/TestCreateActions.java b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/actions/TestCreateActions.java index eb89b0a23274..50afb53e0539 100644 --- a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/actions/TestCreateActions.java +++ b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/actions/TestCreateActions.java @@ -662,7 +662,7 @@ public void schemaEvolutionTestWithSparkSQL() throws Exception { "CAST(id AS FLOAT) col1", "CAST(id AS STRING) col2", "CAST(id AS INT) col3") - .registerTempTable("tempdata"); + .createOrReplaceTempView("tempdata"); sql("INSERT INTO TABLE %s SELECT * FROM tempdata", tblName); List expectedBeforeAddColumn = sql("SELECT * FROM %s ORDER BY col0", tblName); List 
expectedAfterAddColumn = From 1f0579fb642603887b39a6d55ba79d57be540879 Mon Sep 17 00:00:00 2001 From: Mervyn Lobo Date: Tue, 21 Apr 2026 21:30:24 +0530 Subject: [PATCH 080/197] AWS: Add proxy system property and environment variable configuration for HTTP clients (#15506) --- .../aws/ApacheHttpClientConfigurations.java | 33 +++++++++++++++-- .../iceberg/aws/HttpClientProperties.java | 24 ++++++++++++ ...UrlConnectionHttpClientConfigurations.java | 33 +++++++++++++++-- .../aws/TestHttpClientConfigurations.java | 37 +++++++++++++++++++ docs/docs/aws.md | 10 +++-- 5 files changed, 127 insertions(+), 10 deletions(-) diff --git a/aws/src/main/java/org/apache/iceberg/aws/ApacheHttpClientConfigurations.java b/aws/src/main/java/org/apache/iceberg/aws/ApacheHttpClientConfigurations.java index 3445928d1551..30065c8db510 100644 --- a/aws/src/main/java/org/apache/iceberg/aws/ApacheHttpClientConfigurations.java +++ b/aws/src/main/java/org/apache/iceberg/aws/ApacheHttpClientConfigurations.java @@ -41,6 +41,8 @@ class ApacheHttpClientConfigurations extends BaseHttpClientConfigurations { private Boolean tcpKeepAliveEnabled; private Boolean useIdleConnectionReaperEnabled; private String proxyEndpoint; + private Boolean proxyUseSystemPropertyValues; + private Boolean proxyUseEnvironmentVariableValues; private ApacheHttpClientConfigurations() {} @@ -82,6 +84,12 @@ private void initialize(Map httpClientProperties) { this.proxyEndpoint = PropertyUtil.propertyAsString( httpClientProperties, HttpClientProperties.PROXY_ENDPOINT, null); + this.proxyUseSystemPropertyValues = + PropertyUtil.propertyAsNullableBoolean( + httpClientProperties, HttpClientProperties.PROXY_USE_SYSTEM_PROPERTY_VALUES); + this.proxyUseEnvironmentVariableValues = + PropertyUtil.propertyAsNullableBoolean( + httpClientProperties, HttpClientProperties.PROXY_USE_ENVIRONMENT_VARIABLE_VALUES); } @VisibleForTesting @@ -113,9 +121,26 @@ void configureApacheHttpClientBuilder(ApacheHttpClient.Builder apacheHttpClientB if 
(useIdleConnectionReaperEnabled != null) { apacheHttpClientBuilder.useIdleConnectionReaper(useIdleConnectionReaperEnabled); } - if (proxyEndpoint != null) { - apacheHttpClientBuilder.proxyConfiguration( - ProxyConfiguration.builder().endpoint(URI.create(proxyEndpoint)).build()); + configureProxy(apacheHttpClientBuilder); + } + + private void configureProxy(ApacheHttpClient.Builder apacheHttpClientBuilder) { + if (proxyEndpoint != null + || proxyUseSystemPropertyValues != null + || proxyUseEnvironmentVariableValues != null) { + ProxyConfiguration.Builder proxyBuilder = ProxyConfiguration.builder(); + + if (proxyEndpoint != null) { + proxyBuilder.endpoint(URI.create(proxyEndpoint)); + } + if (proxyUseSystemPropertyValues != null) { + proxyBuilder.useSystemPropertyValues(proxyUseSystemPropertyValues); + } + if (proxyUseEnvironmentVariableValues != null) { + proxyBuilder.useEnvironmentVariableValues(proxyUseEnvironmentVariableValues); + } + + apacheHttpClientBuilder.proxyConfiguration(proxyBuilder.build()); } } @@ -138,6 +163,8 @@ protected String generateHttpClientCacheKey() { keyComponents.put("tcpKeepAliveEnabled", tcpKeepAliveEnabled); keyComponents.put("useIdleConnectionReaperEnabled", useIdleConnectionReaperEnabled); keyComponents.put("proxyEndpoint", proxyEndpoint); + keyComponents.put("proxyUseSystemPropertyValues", proxyUseSystemPropertyValues); + keyComponents.put("proxyUseEnvironmentVariableValues", proxyUseEnvironmentVariableValues); return keyComponents.entrySet().stream() .map(entry -> entry.getKey() + "=" + Objects.toString(entry.getValue(), "null")) diff --git a/aws/src/main/java/org/apache/iceberg/aws/HttpClientProperties.java b/aws/src/main/java/org/apache/iceberg/aws/HttpClientProperties.java index 438ae5bb0431..870d8e23651c 100644 --- a/aws/src/main/java/org/apache/iceberg/aws/HttpClientProperties.java +++ b/aws/src/main/java/org/apache/iceberg/aws/HttpClientProperties.java @@ -61,6 +61,30 @@ public class HttpClientProperties implements Serializable 
{ */ public static final String PROXY_ENDPOINT = "http-client.proxy-endpoint"; + /** + * Used to enable reading proxy configuration from Java system properties (http.proxyHost, + * http.proxyPort, http.nonProxyHosts, etc.). Default is true. + * + *

<p>For more details, see + * https://sdk.amazonaws.com/java/api/latest/software/amazon/awssdk/http/urlconnection/ProxyConfiguration.html + * and + * https://sdk.amazonaws.com/java/api/latest/software/amazon/awssdk/http/apache/ProxyConfiguration.html + */ + public static final String PROXY_USE_SYSTEM_PROPERTY_VALUES = + "http-client.proxy-use-system-property-values"; + + /** + * Used to enable reading proxy configuration from environment variables (HTTP_PROXY, HTTPS_PROXY, + * NO_PROXY, etc.). Default is true. + * + *

<p>For more details, see + * https://sdk.amazonaws.com/java/api/latest/software/amazon/awssdk/http/urlconnection/ProxyConfiguration.html + * and + * https://sdk.amazonaws.com/java/api/latest/software/amazon/awssdk/http/apache/ProxyConfiguration.html + */ + public static final String PROXY_USE_ENVIRONMENT_VARIABLE_VALUES = + "http-client.proxy-use-environment-variable-values"; + /** * Used to configure the connection timeout in milliseconds for {@link * software.amazon.awssdk.http.urlconnection.UrlConnectionHttpClient.Builder}. This flag only diff --git a/aws/src/main/java/org/apache/iceberg/aws/UrlConnectionHttpClientConfigurations.java b/aws/src/main/java/org/apache/iceberg/aws/UrlConnectionHttpClientConfigurations.java index 273baa674804..fbd845852ca9 100644 --- a/aws/src/main/java/org/apache/iceberg/aws/UrlConnectionHttpClientConfigurations.java +++ b/aws/src/main/java/org/apache/iceberg/aws/UrlConnectionHttpClientConfigurations.java @@ -35,6 +35,8 @@ class UrlConnectionHttpClientConfigurations extends BaseHttpClientConfigurations private Long httpClientUrlConnectionConnectionTimeoutMs; private Long httpClientUrlConnectionSocketTimeoutMs; private String proxyEndpoint; + private Boolean proxyUseSystemPropertyValues; + private Boolean proxyUseEnvironmentVariableValues; private UrlConnectionHttpClientConfigurations() {} @@ -56,6 +58,12 @@ private void initialize(Map httpClientProperties) { this.proxyEndpoint = PropertyUtil.propertyAsString( httpClientProperties, HttpClientProperties.PROXY_ENDPOINT, null); + this.proxyUseSystemPropertyValues = + PropertyUtil.propertyAsNullableBoolean( + httpClientProperties, HttpClientProperties.PROXY_USE_SYSTEM_PROPERTY_VALUES); + this.proxyUseEnvironmentVariableValues = + PropertyUtil.propertyAsNullableBoolean( + httpClientProperties, HttpClientProperties.PROXY_USE_ENVIRONMENT_VARIABLE_VALUES); } @VisibleForTesting @@ -69,9 +77,26 @@ void configureUrlConnectionHttpClientBuilder( urlConnectionHttpClientBuilder.socketTimeout(
Duration.ofMillis(httpClientUrlConnectionSocketTimeoutMs)); } - if (proxyEndpoint != null) { - urlConnectionHttpClientBuilder.proxyConfiguration( - ProxyConfiguration.builder().endpoint(URI.create(proxyEndpoint)).build()); + configureProxy(urlConnectionHttpClientBuilder); + } + + private void configureProxy(UrlConnectionHttpClient.Builder urlConnectionHttpClientBuilder) { + if (proxyEndpoint != null + || proxyUseSystemPropertyValues != null + || proxyUseEnvironmentVariableValues != null) { + ProxyConfiguration.Builder proxyBuilder = ProxyConfiguration.builder(); + + if (proxyEndpoint != null) { + proxyBuilder.endpoint(URI.create(proxyEndpoint)); + } + if (proxyUseSystemPropertyValues != null) { + proxyBuilder.useSystemPropertyValues(proxyUseSystemPropertyValues); + } + if (proxyUseEnvironmentVariableValues != null) { + proxyBuilder.useEnvironmentVariablesValues(proxyUseEnvironmentVariableValues); + } + + urlConnectionHttpClientBuilder.proxyConfiguration(proxyBuilder.build()); } } @@ -87,6 +112,8 @@ protected String generateHttpClientCacheKey() { keyComponents.put("connectionTimeoutMs", httpClientUrlConnectionConnectionTimeoutMs); keyComponents.put("socketTimeoutMs", httpClientUrlConnectionSocketTimeoutMs); keyComponents.put("proxyEndpoint", proxyEndpoint); + keyComponents.put("proxyUseSystemPropertyValues", proxyUseSystemPropertyValues); + keyComponents.put("proxyUseEnvironmentVariableValues", proxyUseEnvironmentVariableValues); return keyComponents.entrySet().stream() .map(entry -> entry.getKey() + "=" + Objects.toString(entry.getValue(), "null")) diff --git a/aws/src/test/java/org/apache/iceberg/aws/TestHttpClientConfigurations.java b/aws/src/test/java/org/apache/iceberg/aws/TestHttpClientConfigurations.java index 0f96ac0f6c82..da73a5c1b5a5 100644 --- a/aws/src/test/java/org/apache/iceberg/aws/TestHttpClientConfigurations.java +++ b/aws/src/test/java/org/apache/iceberg/aws/TestHttpClientConfigurations.java @@ -22,6 +22,8 @@ import java.util.Map; import 
org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; import org.mockito.Mockito; import software.amazon.awssdk.http.apache.ApacheHttpClient; import software.amazon.awssdk.http.apache.ProxyConfiguration; @@ -137,4 +139,39 @@ public void testApacheDefaultConfigurations() { Mockito.verify(spyApacheHttpClientBuilder, Mockito.never()) .proxyConfiguration(Mockito.any(ProxyConfiguration.class)); } + + @ParameterizedTest + @ValueSource( + strings = { + HttpClientProperties.PROXY_USE_SYSTEM_PROPERTY_VALUES, + HttpClientProperties.PROXY_USE_ENVIRONMENT_VARIABLE_VALUES + }) + public void testApacheProxyFlagTriggersProxyConfig(String propertyKey) { + Map properties = Maps.newHashMap(); + properties.put(propertyKey, "false"); + ApacheHttpClient.Builder spy = Mockito.spy(ApacheHttpClient.builder()); + + ApacheHttpClientConfigurations.create(properties).configureApacheHttpClientBuilder(spy); + + Mockito.verify(spy).proxyConfiguration(Mockito.any(ProxyConfiguration.class)); + } + + @ParameterizedTest + @ValueSource( + strings = { + HttpClientProperties.PROXY_USE_SYSTEM_PROPERTY_VALUES, + HttpClientProperties.PROXY_USE_ENVIRONMENT_VARIABLE_VALUES + }) + public void testUrlConnectionProxyFlagTriggersProxyConfig(String propertyKey) { + Map properties = Maps.newHashMap(); + properties.put(propertyKey, "false"); + UrlConnectionHttpClient.Builder spy = Mockito.spy(UrlConnectionHttpClient.builder()); + + UrlConnectionHttpClientConfigurations.create(properties) + .configureUrlConnectionHttpClientBuilder(spy); + + Mockito.verify(spy) + .proxyConfiguration( + Mockito.any(software.amazon.awssdk.http.urlconnection.ProxyConfiguration.class)); + } } diff --git a/docs/docs/aws.md b/docs/docs/aws.md index 1fe867401296..587de402b069 100644 --- a/docs/docs/aws.md +++ b/docs/docs/aws.md @@ -705,10 +705,12 @@ For more details of configuration, see 
sections [URL Connection HTTP Client Conf Configurations for the HTTP client can be set via catalog properties. Below is an overview of available configurations: -| Property | Default | Description | -|----------------------------|---------|------------------------------------------------------------------------------------------------------------| -| http-client.type | apache | Types of HTTP Client.
`urlconnection`: URL Connection HTTP Client
`apache`: Apache HTTP Client | -| http-client.proxy-endpoint | null | An optional proxy endpoint to use for the HTTP client. | +| Property | Default | Description | +|---------------------------------------------------|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| http-client.type | apache | Types of HTTP Client.
`urlconnection`: URL Connection HTTP Client
`apache`: Apache HTTP Client | +| http-client.proxy-endpoint | null | An optional proxy endpoint to use for the HTTP client. | +| http-client.proxy-use-system-property-values | null, enabled by default | An optional `true/false` setting that controls whether proxy configuration is read from Java system properties (`http.proxyHost`, `http.proxyPort`, `http.nonProxyHosts`, etc.). | +| http-client.proxy-use-environment-variable-values | null, enabled by default | An optional `true/false` setting that controls whether proxy configuration is read from environment variables (`HTTP_PROXY`, `HTTPS_PROXY`, `NO_PROXY`, etc.). | #### URL Connection HTTP Client Configurations From a675f88b0d8b1bc5b9f37bc8db304bd694cf5f18 Mon Sep 17 00:00:00 2001 From: kumarpritam863 <148938310+kumarpritam863@users.noreply.github.com> Date: Tue, 21 Apr 2026 23:38:50 +0530 Subject: [PATCH 081/197] Kafka Connect: Do not fail if no partitions assigned (#15955) --------- Co-authored-by: Pritam Kumar Mishra Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- .../connect/channel/CommitterImpl.java | 72 +++++++++++-------- .../iceberg/connect/channel/Coordinator.java | 32 +++++---- 2 files changed, 62 insertions(+), 42 deletions(-) diff --git a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/channel/CommitterImpl.java b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/channel/CommitterImpl.java index 04602a66a5e1..7b2d4a25363d 100644 --- a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/channel/CommitterImpl.java +++ b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/channel/CommitterImpl.java @@ -30,7 +30,6 @@ import org.apache.kafka.clients.admin.ConsumerGroupDescription; import org.apache.kafka.clients.admin.MemberDescription; import org.apache.kafka.common.TopicPartition; -import org.apache.kafka.connect.errors.ConnectException; import org.apache.kafka.connect.sink.SinkRecord; import 
org.apache.kafka.connect.sink.SinkTaskContext; import org.slf4j.Logger; @@ -48,6 +47,7 @@ public class CommitterImpl implements Committer { private KafkaClientFactory clientFactory; private Collection membersWhenWorkerIsCoordinator; private final AtomicBoolean isInitialized = new AtomicBoolean(false); + private String taskId; private void initialize( Catalog icebergCatalog, @@ -58,6 +58,7 @@ private void initialize( this.config = icebergSinkConfig; this.context = sinkTaskContext; this.clientFactory = new KafkaClientFactory(config.kafkaProps()); + this.taskId = config.connectorName() + "-" + config.taskId(); } } @@ -92,16 +93,38 @@ boolean hasLeaderPartition(Collection currentAssignedPartitions) @VisibleForTesting boolean containsFirstPartition( Collection members, Collection partitions) { - // there should only be one task assigned partition 0 of the first topic, - // so elect that one the leader - TopicPartition firstTopicPartition = - members.stream() - .flatMap(member -> member.assignment().topicPartitions().stream()) - .min(new TopicPartitionComparator()) - .orElseThrow( - () -> new ConnectException("No partitions assigned, cannot determine leader")); - - return partitions.contains(firstTopicPartition); + // Determine the first partition across all members to elect the leader + TopicPartition firstTopicPartition = findFirstTopicPartition(members); + + if (firstTopicPartition == null) { + LOG.warn( + "Committer {} found no partitions assigned across all members, cannot determine leader", + taskId); + return false; + } + + boolean containsFirst = partitions.contains(firstTopicPartition); + if (containsFirst) { + LOG.info( + "Committer {} contains the first partition {}, this task is the leader", + taskId, + firstTopicPartition); + } else { + LOG.debug( + "Committer {} does not contain the first partition {}, not the leader", + taskId, + firstTopicPartition); + } + + return containsFirst; + } + + @VisibleForTesting + TopicPartition 
findFirstTopicPartition(Collection members) { + return members.stream() + .flatMap(member -> member.assignment().topicPartitions().stream()) + .min(new TopicPartitionComparator()) + .orElse(null); } @Override @@ -122,7 +145,7 @@ public void open( Collection addedPartitions) { initialize(icebergCatalog, icebergSinkConfig, sinkTaskContext); if (hasLeaderPartition(addedPartitions)) { - LOG.info("Committer received leader partition. Starting Coordinator."); + LOG.info("Committer {} received leader partition. Starting Coordinator.", taskId); startCoordinator(); } } @@ -141,31 +164,25 @@ public void close(Collection closedPartitions) { // Defensive: close called without prior initialization (should not happen). if (!isInitialized.get()) { - LOG.warn("Close unexpectedly called without partition assignment"); + LOG.warn("Close unexpectedly called on committer {} without partition assignment", taskId); return; } // Empty partitions → task was stopped explicitly. Stop coordinator if running. if (closedPartitions.isEmpty()) { - LOG.info("Task stopped. Closing coordinator."); + LOG.info("Committer {} stopped. Closing coordinator.", taskId); stopCoordinator(); return; } // Normal close: if leader partition is lost, stop coordinator. if (hasLeaderPartition(closedPartitions)) { - LOG.info( - "Committer {}-{} lost leader partition. Stopping coordinator.", - config.connectorName(), - config.taskId()); + LOG.info("Committer {} lost leader partition. Stopping coordinator.", taskId); stopCoordinator(); } // Reset offsets to last committed to avoid data loss. 
- LOG.info( - "Seeking to last committed offsets for worker {}-{}.", - config.connectorName(), - config.taskId()); + LOG.info("Seeking to last committed offsets for worker {}.", taskId); KafkaUtils.seekToLastCommittedOffsets(context); } @@ -181,9 +198,7 @@ public void save(Collection sinkRecords) { private void processControlEvents() { if (coordinatorThread != null && coordinatorThread.isTerminated()) { throw new NotRunningException( - String.format( - "Coordinator unexpectedly terminated on committer %s-%s", - config.connectorName(), config.taskId())); + String.format("Coordinator unexpectedly terminated on committer %s", taskId)); } if (worker != null) { worker.process(); @@ -192,7 +207,7 @@ private void processControlEvents() { private void startWorker() { if (null == this.worker) { - LOG.info("Starting commit worker {}-{}", config.connectorName(), config.taskId()); + LOG.info("Starting commit worker {}", taskId); SinkWriter sinkWriter = new SinkWriter(catalog, config); worker = new Worker(config, clientFactory, sinkWriter, context); worker.start(); @@ -201,10 +216,7 @@ private void startWorker() { private void startCoordinator() { if (null == this.coordinatorThread) { - LOG.info( - "Task {}-{} elected leader, starting commit coordinator", - config.connectorName(), - config.taskId()); + LOG.info("Task {} elected leader, starting commit coordinator", taskId); Coordinator coordinator = new Coordinator(catalog, config, membersWhenWorkerIsCoordinator, clientFactory, context); coordinatorThread = new CoordinatorThread(coordinator); diff --git a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/channel/Coordinator.java b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/channel/Coordinator.java index 068e1e1f6e9c..c986f8afc2eb 100644 --- a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/channel/Coordinator.java +++ b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/channel/Coordinator.java 
@@ -81,6 +81,7 @@ class Coordinator extends Channel { private final ExecutorService exec; private final CommitState commitState; private volatile boolean terminated; + private final String taskId; Coordinator( Catalog catalog, @@ -110,6 +111,7 @@ class Coordinator extends Channel { .setNameFormat("iceberg-committer" + "-%d") .build()); this.commitState = new CommitState(config); + this.taskId = config.connectorName() + "-" + config.taskId(); } void process() { @@ -119,7 +121,7 @@ void process() { Event event = new Event(config.connectGroupId(), new StartCommit(commitState.currentCommitId())); send(event); - LOG.info("Commit {} initiated", commitState.currentCommitId()); + LOG.info("Coordinator {} initiated commit {}", taskId, commitState.currentCommitId()); } consumeAvailable(POLL_DURATION); @@ -149,7 +151,11 @@ private void commit(boolean partialCommit) { try { doCommit(partialCommit); } catch (Exception e) { - LOG.warn("Commit failed, will try again next cycle", e); + LOG.warn( + "Coordinator {} failed to commit for commit {}, will try again next cycle", + taskId, + commitState.currentCommitId(), + e); } finally { commitState.endCurrentCommit(); } @@ -163,10 +169,9 @@ private void doCommit(boolean partialCommit) { .executeWith(exec) .stopOnFailure() .run( - entry -> { - commitToTable( - entry.getKey(), entry.getValue(), controlTopicOffsets(), validThroughTs); - }); + entry -> + commitToTable( + entry.getKey(), entry.getValue(), controlTopicOffsets(), validThroughTs)); // we should only get here if all tables committed successfully... 
commitConsumerOffsets(); @@ -179,7 +184,8 @@ private void doCommit(boolean partialCommit) { send(event); LOG.info( - "Commit {} complete, committed to {} table(s), valid-through {}", + "Coordinator {} completed commit {}, committed to {} table(s), valid-through {}", + taskId, commitState.currentCommitId(), commitMap.size(), validThroughTs); @@ -256,13 +262,14 @@ private void commitToTable( .collect(Collectors.toList()); if (terminated) { - throw new ConnectException("Coordinator is terminated, commit aborted"); + throw new ConnectException( + String.format("Coordinator %s is terminated, commit aborted", taskId)); } if (dataFiles.isEmpty() && deleteFiles.isEmpty()) { - LOG.info("Nothing to commit to table {}, skipping", tableIdentifier); + LOG.info( + "Coordinator {} found nothing to commit to table {}, skipping", taskId, tableIdentifier); } else { - String taskId = String.format("%s-%s", config.connectorName(), config.taskId()); if (deleteFiles.isEmpty()) { AppendFiles appendOp = table.newAppend().validateWith(offsetValidator(tableIdentifier, committedOffsets)); @@ -303,7 +310,8 @@ private void commitToTable( send(event); LOG.info( - "Commit complete to table {}, snapshot {}, commit ID {}, valid-through {}", + "Coordinator {} completed commit to table {}, snapshot {}, commit ID {}, valid-through {}", + taskId, tableIdentifier, snapshotId, commitState.currentCommitId(), @@ -372,7 +380,7 @@ private Map parseOffsets(String value) { return Map.of(); } - TypeReference> typeRef = new TypeReference>() {}; + TypeReference> typeRef = new TypeReference<>() {}; try { return MAPPER.readValue(value, typeRef); } catch (IOException e) { From 5dae5fc856e228cd8e1b69ba14909d0067006650 Mon Sep 17 00:00:00 2001 From: alpbeysir <94457197+alpbeysir@users.noreply.github.com> Date: Tue, 21 Apr 2026 22:59:33 +0300 Subject: [PATCH 082/197] Core: Use Stream overload for reading response in HTTPClient (#15648) --- .../org/apache/iceberg/rest/HTTPClient.java | 22 +++++-------------- 1 file 
changed, 6 insertions(+), 16 deletions(-) diff --git a/core/src/main/java/org/apache/iceberg/rest/HTTPClient.java b/core/src/main/java/org/apache/iceberg/rest/HTTPClient.java index c359404ec6be..46d9177b9571 100644 --- a/core/src/main/java/org/apache/iceberg/rest/HTTPClient.java +++ b/core/src/main/java/org/apache/iceberg/rest/HTTPClient.java @@ -18,7 +18,6 @@ */ package org.apache.iceberg.rest; -import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectReader; import java.io.IOException; @@ -342,32 +341,23 @@ protected T execute( return null; } - String responseBody = extractResponseBodyAsString(response); - if (!isSuccessful(response)) { // The provided error handler is expected to throw, but a RESTException is thrown if not. + String responseBody = extractResponseBodyAsString(response); throwFailure(response, responseBody, errorHandler); } - if (responseBody == null) { + if (response.getEntity() == null) { throw new RESTException( "Invalid (null) response body for request (expected %s): method=%s, path=%s, status=%d", responseType.getSimpleName(), req.method(), req.path(), response.getCode()); } - try { - ObjectReader reader = objectReaderCache.computeIfAbsent(responseType, mapper::readerFor); - if (parserContext != null && !parserContext.isEmpty()) { - reader = reader.with(parserContext.toInjectableValues()); - } - return reader.readValue(responseBody); - } catch (JsonProcessingException e) { - throw new RESTException( - e, - "Received a success response code of %d, but failed to parse response body into %s", - response.getCode(), - responseType.getSimpleName()); + ObjectReader reader = objectReaderCache.computeIfAbsent(responseType, mapper::readerFor); + if (parserContext != null && !parserContext.isEmpty()) { + reader = reader.with(parserContext.toInjectableValues()); } + return reader.readValue(response.getEntity().getContent()); } catch (IOException e) { throw 
new RESTException(e, "Error occurred while processing %s request", req.method()); } From 017ba2f6fc78c12f1aff81b9f8f8a28513a5703f Mon Sep 17 00:00:00 2001 From: Yuya Ebihara Date: Wed, 22 Apr 2026 22:33:48 +0900 Subject: [PATCH 083/197] Spark: Fix RoaringBitmap version in runtime-deps.txt (#16076) --- spark/v4.1/spark-runtime/runtime-deps.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spark/v4.1/spark-runtime/runtime-deps.txt b/spark/v4.1/spark-runtime/runtime-deps.txt index e275e24372af..9a087517cbb0 100644 --- a/spark/v4.1/spark-runtime/runtime-deps.txt +++ b/spark/v4.1/spark-runtime/runtime-deps.txt @@ -36,5 +36,5 @@ org.eclipse.microprofile.openapi:microprofile-openapi-api:4.1.1 org.locationtech.jts:jts-core:1.20.0 org.projectnessie.nessie:nessie-client:0.107.4 org.projectnessie.nessie:nessie-model:0.107.4 -org.roaringbitmap:RoaringBitmap:1.6.13 +org.roaringbitmap:RoaringBitmap:1.6.14 org.threeten:threeten-extra:1.7.1 From 4a31e519b8782d1c5a98f2ad00edbe278cedc4a6 Mon Sep 17 00:00:00 2001 From: Anupam Yadav Date: Wed, 22 Apr 2026 06:52:29 -0700 Subject: [PATCH 084/197] Core: Use Idiomatic ThreadLocal cleanup in CommitMetadata (#15284) (#16031) Replace COMMIT_PROPERTIES.set(ImmutableMap.of()) with COMMIT_PROPERTIES.remove() in the finally block of withCommitProperties(). remove() is the recommended cleanup pattern per the ThreadLocal javadoc. 
Co-authored-by: Anupam Yadav --- .../src/main/java/org/apache/iceberg/spark/CommitMetadata.java | 2 +- .../src/main/java/org/apache/iceberg/spark/CommitMetadata.java | 2 +- .../src/main/java/org/apache/iceberg/spark/CommitMetadata.java | 2 +- .../src/main/java/org/apache/iceberg/spark/CommitMetadata.java | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/CommitMetadata.java b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/CommitMetadata.java index ea400a779235..cb9da3edc678 100644 --- a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/CommitMetadata.java +++ b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/CommitMetadata.java @@ -56,7 +56,7 @@ public static R withCommitProperties( ExceptionUtil.castAndThrow(e, exClass); return null; } finally { - COMMIT_PROPERTIES.set(ImmutableMap.of()); + COMMIT_PROPERTIES.remove(); } } diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/CommitMetadata.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/CommitMetadata.java index ea400a779235..cb9da3edc678 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/CommitMetadata.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/CommitMetadata.java @@ -56,7 +56,7 @@ public static R withCommitProperties( ExceptionUtil.castAndThrow(e, exClass); return null; } finally { - COMMIT_PROPERTIES.set(ImmutableMap.of()); + COMMIT_PROPERTIES.remove(); } } diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/CommitMetadata.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/CommitMetadata.java index ea400a779235..cb9da3edc678 100644 --- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/CommitMetadata.java +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/CommitMetadata.java @@ -56,7 +56,7 @@ public static R withCommitProperties( ExceptionUtil.castAndThrow(e, exClass); return null; } finally { - 
COMMIT_PROPERTIES.set(ImmutableMap.of()); + COMMIT_PROPERTIES.remove(); } } diff --git a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/CommitMetadata.java b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/CommitMetadata.java index ea400a779235..cb9da3edc678 100644 --- a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/CommitMetadata.java +++ b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/CommitMetadata.java @@ -56,7 +56,7 @@ public static R withCommitProperties( ExceptionUtil.castAndThrow(e, exClass); return null; } finally { - COMMIT_PROPERTIES.set(ImmutableMap.of()); + COMMIT_PROPERTIES.remove(); } } From 5f80bc3a38d582fe352969a9b2ae037de048c843 Mon Sep 17 00:00:00 2001 From: Yingjian Wu <151791653+yingjianwu98@users.noreply.github.com> Date: Wed, 22 Apr 2026 10:13:55 -0700 Subject: [PATCH 085/197] Spark: fix delete from branch for canDeleteWhere where it does not resolve to the correct branch (#15512) --- .../iceberg/spark/extensions/TestDelete.java | 56 +++++++++++++++++++ .../iceberg/spark/source/SparkTable.java | 20 +++++-- 2 files changed, 71 insertions(+), 5 deletions(-) diff --git a/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestDelete.java b/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestDelete.java index 7e0f6207edc9..9e9d751691be 100644 --- a/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestDelete.java +++ b/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestDelete.java @@ -1424,6 +1424,62 @@ public void testDeleteToCustomWapBranchWithoutWhereClause() throws NoSuchTableEx }); } + @TestTemplate + public void testDeleteToWapBranchCanDeleteWhereScansWapBranch() throws NoSuchTableException { + assumeThat(branch).as("WAP branch only works for table identifier without branch").isNull(); + + createAndInitPartitionedTable(); + sql( + "ALTER TABLE %s SET TBLPROPERTIES ('%s' = 'true')", + 
tableName, TableProperties.WRITE_AUDIT_PUBLISH_ENABLED); + + append(tableName, new Employee(1, "hr")); + + spark.conf().set(SparkSQLProperties.WAP_BRANCH, "wap"); + try { + append(tableName, new Employee(0, "hr"), new Employee(1, "hr"), new Employee(2, "hr")); + + sql("DELETE FROM %s WHERE id = 1", tableName); + + assertThat(sql("SELECT id, dep FROM %s.branch_wap ORDER BY id", tableName)) + .as("DELETE should remove the matching rows from the WAP branch") + .containsExactly(row(0, "hr"), row(2, "hr")); + assertThat(sql("SELECT id, dep FROM %s.branch_main", tableName)) + .as("Main branch must not be modified by a WAP-targeted DELETE") + .containsExactly(row(1, "hr")); + } finally { + spark.conf().unset(SparkSQLProperties.WAP_BRANCH); + } + } + + @TestTemplate + public void testMetadataDeleteToWapBranchCommitsToWapBranch() throws NoSuchTableException { + assumeThat(branch).as("WAP branch only works for table identifier without branch").isNull(); + + createAndInitPartitionedTable(); + sql( + "ALTER TABLE %s SET TBLPROPERTIES ('%s' = 'true')", + tableName, TableProperties.WRITE_AUDIT_PUBLISH_ENABLED); + + append(tableName, new Employee(1, "hr"), new Employee(5, "eng")); + + spark.conf().set(SparkSQLProperties.WAP_BRANCH, "wap"); + try { + append(tableName, new Employee(0, "hr"), new Employee(2, "eng")); + + sql("DELETE FROM %s WHERE dep = 'hr'", tableName); + + assertThat(sql("SELECT id, dep FROM %s.branch_wap ORDER BY id", tableName)) + .as("Metadata delete should remove the hr partition on the WAP branch") + .containsExactly(row(2, "eng"), row(5, "eng")); + assertThat(sql("SELECT id, dep FROM %s.branch_main ORDER BY id", tableName)) + .as("Metadata delete must not commit to main when WAP is set") + .containsExactly(row(1, "hr"), row(5, "eng")); + } finally { + spark.conf().unset(SparkSQLProperties.WAP_BRANCH); + } + } + @TestTemplate public void testDeleteWithFilterOnNestedColumn() { createAndInitNestedColumnsTable(); diff --git 
a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkTable.java b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkTable.java index 07db8c4ed3fe..80a40d72c8d1 100644 --- a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkTable.java +++ b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkTable.java @@ -49,6 +49,7 @@ import org.apache.iceberg.spark.CommitMetadata; import org.apache.iceberg.spark.Spark3Util; import org.apache.iceberg.spark.SparkReadConf; +import org.apache.iceberg.spark.SparkTableUtil; import org.apache.iceberg.spark.SparkUtil; import org.apache.iceberg.spark.SparkV2Filters; import org.apache.iceberg.spark.TimeTravel; @@ -208,11 +209,14 @@ public boolean canDeleteWhere(Predicate[] predicates) { } } - return canDeleteUsingMetadata(deleteExpr); + String scanBranch = + SparkTableUtil.determineReadBranch( + spark(), table(), branch, CaseInsensitiveStringMap.empty()); + return canDeleteUsingMetadata(deleteExpr, scanBranch); } // a metadata delete is possible iff matching files can be deleted entirely - private boolean canDeleteUsingMetadata(Expression deleteExpr) { + private boolean canDeleteUsingMetadata(Expression deleteExpr, String scanBranch) { boolean caseSensitive = SparkUtil.caseSensitive(spark()); if (ExpressionUtil.selectsPartitions(deleteExpr, table(), caseSensitive)) { @@ -227,7 +231,9 @@ private boolean canDeleteUsingMetadata(Expression deleteExpr) { .includeColumnStats() .ignoreResiduals(); - if (snapshot != null) { + if (scanBranch != null) { + scan = scan.useRef(scanBranch); + } else if (snapshot != null) { scan = scan.useSnapshot(snapshot.snapshotId()); } @@ -269,8 +275,12 @@ public void deleteWhere(Predicate[] predicates) { .set("spark.app.id", spark().sparkContext().applicationId()) .deleteFromRowFilter(deleteExpr); - if (branch != null) { - deleteFiles.toBranch(branch); + String writeBranch = + SparkTableUtil.determineWriteBranch( + spark(), table(), branch, 
CaseInsensitiveStringMap.empty()); + + if (writeBranch != null) { + deleteFiles.toBranch(writeBranch); } if (!CommitMetadata.commitProperties().isEmpty()) { From 844a0abc3b6f7ec3164c4d60b26973eb686a27db Mon Sep 17 00:00:00 2001 From: "seokyun.ha" Date: Thu, 23 Apr 2026 03:45:50 +0900 Subject: [PATCH 086/197] Kafka Connect: Support VARIANT when record convert (#15283) * feat: Implement support for VARIANT type in RecordConverter with conversion methods for nested structures --------- Co-authored-by: Brandon Stanley --- .../iceberg/connect/data/RecordConverter.java | 244 ++++++++++++ .../connect/data/TestRecordConverter.java | 367 ++++++++++++++++++ 2 files changed, 611 insertions(+) diff --git a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/RecordConverter.java b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/RecordConverter.java index 1a57a6444870..51f64a9d4b05 100644 --- a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/RecordConverter.java +++ b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/RecordConverter.java @@ -22,23 +22,29 @@ import java.io.IOException; import java.io.UncheckedIOException; import java.math.BigDecimal; +import java.math.BigInteger; import java.math.RoundingMode; import java.nio.ByteBuffer; import java.nio.charset.StandardCharsets; +import java.time.Instant; import java.time.LocalDate; import java.time.LocalDateTime; import java.time.LocalTime; import java.time.OffsetDateTime; import java.time.ZoneOffset; +import java.time.ZonedDateTime; import java.time.format.DateTimeFormatter; import java.time.format.DateTimeFormatterBuilder; import java.time.format.DateTimeParseException; import java.time.temporal.Temporal; import java.util.Base64; +import java.util.Collection; +import java.util.Collections; import java.util.Date; import java.util.List; import java.util.Locale; import java.util.Map; +import java.util.Set; import java.util.UUID; 
import java.util.stream.Collectors; import org.apache.iceberg.FileFormat; @@ -53,6 +59,7 @@ import org.apache.iceberg.mapping.NameMappingParser; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; import org.apache.iceberg.types.Type; import org.apache.iceberg.types.Type.PrimitiveType; import org.apache.iceberg.types.Types.DecimalType; @@ -64,6 +71,13 @@ import org.apache.iceberg.util.ByteBuffers; import org.apache.iceberg.util.DateTimeUtil; import org.apache.iceberg.util.UUIDUtil; +import org.apache.iceberg.variants.ShreddedObject; +import org.apache.iceberg.variants.ValueArray; +import org.apache.iceberg.variants.Variant; +import org.apache.iceberg.variants.VariantMetadata; +import org.apache.iceberg.variants.VariantValue; +import org.apache.iceberg.variants.Variants; +import org.apache.kafka.connect.data.Field; import org.apache.kafka.connect.data.Struct; import org.apache.kafka.connect.errors.ConnectException; @@ -142,6 +156,8 @@ private Object convertValue( return convertTimeValue(value); case TIMESTAMP: return convertTimestampValue(value, (TimestampType) type); + case VARIANT: + return convertVariantValue(value); } throw new UnsupportedOperationException("Unsupported type: " + type.typeId()); } @@ -464,6 +480,234 @@ protected Temporal convertTimestampValue(Object value, TimestampType type) { return convertLocalDateTime(value); } + protected Variant convertVariantValue(Object value) { + if (value instanceof Variant variant) { + return variant; + } + + List sortedFieldNames = + collectFieldNames(value).stream().sorted().collect(Collectors.toList()); + VariantMetadata metadata = Variants.metadata(sortedFieldNames); + return Variant.of(metadata, objectToVariantValue(value, metadata, null)); + } + + /** + * Recursively collects field names from collections, maps, and structs. 
Returns an empty set for + * null, scalar values, and empty maps, lists, or structs. Map keys must be strings; non-string + * keys cause IllegalArgumentException. + */ + private static Set collectFieldNames(Object value) { + if (value == null) { + return Collections.emptySet(); + } + if (value instanceof Collection collection) { + if (collection.isEmpty()) { + return Collections.emptySet(); + } + Set names = Sets.newHashSet(); + collection.forEach(element -> names.addAll(collectFieldNames(element))); + return names; + } else if (value instanceof Map map) { + if (map.isEmpty()) { + return Collections.emptySet(); + } + Set names = Sets.newHashSet(); + map.forEach( + (key, val) -> { + if (key instanceof String keyStr) { + names.add(keyStr); + names.addAll(collectFieldNames(val)); + } else { + throw new IllegalArgumentException( + "Cannot convert map to variant: keys must be non-null strings, was: " + + (key == null ? "null" : key.getClass().getName())); + } + }); + return names; + } else if (value instanceof Struct struct) { + List fields = struct.schema().fields(); + if (fields.isEmpty()) { + return Collections.emptySet(); + } + Set names = Sets.newHashSet(); + fields.forEach( + field -> { + names.add(field.name()); + names.addAll(collectFieldNames(struct.get(field))); + }); + return names; + } + return Collections.emptySet(); + } + + /** + * Recursively converts a Java object to a VariantValue using the given shared metadata for all + * nested maps. Handles primitives, List (array), and Map (object); map keys become field names. 
+ */ + private static VariantValue objectToVariantValue( + Object value, VariantMetadata metadata, org.apache.kafka.connect.data.Schema schema) { + if (value == null) { + return Variants.ofNull(); + } + VariantValue primitive = primitiveToVariantValue(value, schema); + if (primitive != null) { + return primitive; + } + if (value instanceof Collection collection) { + ValueArray array = Variants.array(); + org.apache.kafka.connect.data.Schema elementSchema = + schema != null ? schema.valueSchema() : null; + for (Object element : collection) { + array.add(objectToVariantValue(element, metadata, elementSchema)); + } + return array; + } + if (value instanceof Map map) { + return mapToVariantValue(map, metadata, schema); + } + if (value instanceof Struct struct) { + ShreddedObject object = Variants.object(metadata); + for (Field field : struct.schema().fields()) { + object.put(field.name(), objectToVariantValue(struct.get(field), metadata, field.schema())); + } + return object; + } + throw new IllegalArgumentException("Cannot convert to variant: " + value.getClass().getName()); + } + + /** Converts a Map to VariantValue; throw IllegalArgumentException if the key is not a string. */ + private static VariantValue mapToVariantValue( + Map map, VariantMetadata metadata, org.apache.kafka.connect.data.Schema schema) { + ShreddedObject object = Variants.object(metadata); + org.apache.kafka.connect.data.Schema mapValueSchema = + schema != null ? schema.valueSchema() : null; + map.forEach( + (key, val) -> { + if (key instanceof String keyStr) { + object.put(keyStr, objectToVariantValue(val, metadata, mapValueSchema)); + } else { + throw new IllegalArgumentException( + "Cannot convert map to variant: keys must be non-null strings, was: " + + (key == null ? "null" : key.getClass().getName())); + } + }); + return object; + } + + /** + * Converts a primitive or primitive-like value to VariantValue; returns null if not supported. 
+ * The optional schema is used to disambiguate java.util.Date which Kafka Connect uses for Date, + * Time, and Timestamp logical types. + */ + private static VariantValue primitiveToVariantValue( + Object value, org.apache.kafka.connect.data.Schema schema) { + if (value instanceof Boolean booleanValue) { + return Variants.of(booleanValue); + } + VariantValue temporal = temporalObjectToVariantValue(value, schema); + if (temporal != null) { + return temporal; + } + if (value instanceof Number number) { + return numberToVariantValue(number); + } + if (value instanceof String stringValue) { + return Variants.of(stringValue); + } + if (value instanceof ByteBuffer byteBuffer) { + return Variants.of(byteBuffer); + } + if (value instanceof byte[] byteArray) { + return Variants.of(ByteBuffer.wrap(byteArray)); + } + if (value instanceof UUID uuid) { + return Variants.ofUUID(uuid); + } + return null; + } + + /** + * Converts java.time values and java.util.Date (with Connect logical type from the optional + * schema) to VariantValue; returns null if the value is not a supported temporal representation. 
+ */ + private static VariantValue temporalObjectToVariantValue( + Object value, org.apache.kafka.connect.data.Schema schema) { + if (value instanceof Instant instant) { + return Variants.ofTimestamptz(DateTimeUtil.microsFromInstant(instant)); + } + if (value instanceof OffsetDateTime offsetDateTime) { + return Variants.ofTimestamptz(DateTimeUtil.microsFromTimestamptz(offsetDateTime)); + } + if (value instanceof ZonedDateTime zonedDateTime) { + return Variants.ofTimestamptz( + DateTimeUtil.microsFromTimestamptz(zonedDateTime.toOffsetDateTime())); + } + if (value instanceof LocalDateTime localDateTime) { + return Variants.ofTimestampntz(DateTimeUtil.microsFromTimestamp(localDateTime)); + } + if (value instanceof LocalDate localDate) { + return Variants.ofDate(DateTimeUtil.daysFromDate(localDate)); + } + if (value instanceof LocalTime localTime) { + return Variants.ofTime(DateTimeUtil.microsFromTime(localTime)); + } + if (value instanceof Date date) { + String logicalName = schema != null ? schema.name() : null; + // Connect represents Timestamp, Time, and Date logical types as java.util.Date at runtime; + // normalize to Instant once, then interpret using the schema logical type name. 
+ Instant connectInstant = date.toInstant(); + if (org.apache.kafka.connect.data.Timestamp.LOGICAL_NAME.equals(logicalName)) { + return Variants.ofTimestamptz(DateTimeUtil.microsFromInstant(connectInstant)); + } + if (org.apache.kafka.connect.data.Time.LOGICAL_NAME.equals(logicalName)) { + LocalTime utcTime = connectInstant.atZone(ZoneOffset.UTC).toLocalTime(); + return Variants.ofTime(DateTimeUtil.microsFromTime(utcTime)); + } + if (org.apache.kafka.connect.data.Date.LOGICAL_NAME.equals(logicalName)) { + return Variants.ofDate(DateTimeUtil.daysFromInstant(connectInstant)); + } + throw new IllegalArgumentException( + "Cannot convert java.util.Date to variant without a recognized logical type schema" + + " (expected Timestamp, Time, or Date but got: " + + logicalName + + ")"); + } + return null; + } + + /** + * Converts a Number to VariantValue; throw IllegalArgumentException if the value is not a + * supported number representation. + */ + private static VariantValue numberToVariantValue(Number number) { + if (number instanceof BigDecimal bigDecimal) { + return Variants.of(bigDecimal); + } + if (number instanceof BigInteger bigInteger) { + return Variants.of(new BigDecimal(bigInteger)); + } + if (number instanceof Integer integer) { + return Variants.of(integer); + } + if (number instanceof Long longValue) { + return Variants.of(longValue); + } + if (number instanceof Float floatValue) { + return Variants.of(floatValue); + } + if (number instanceof Double doubleValue) { + return Variants.of(doubleValue); + } + if (number instanceof Byte byteValue) { + return Variants.of(byteValue); + } + if (number instanceof Short shortValue) { + return Variants.of(shortValue); + } + throw new IllegalArgumentException( + "Cannot convert Number to variant (unknown type): " + number.getClass().getName()); + } + @SuppressWarnings("JavaUtilDate") private OffsetDateTime convertOffsetDateTime(Object value) { if (value instanceof Number) { diff --git 
a/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/TestRecordConverter.java b/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/TestRecordConverter.java index 45d07f69591b..56a9b6e100ac 100644 --- a/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/TestRecordConverter.java +++ b/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/TestRecordConverter.java @@ -32,6 +32,7 @@ import java.time.LocalTime; import java.time.OffsetDateTime; import java.time.ZoneOffset; +import java.time.ZonedDateTime; import java.time.temporal.Temporal; import java.util.Base64; import java.util.Collection; @@ -74,7 +75,12 @@ import org.apache.iceberg.types.Types.TimeType; import org.apache.iceberg.types.Types.TimestampType; import org.apache.iceberg.types.Types.UUIDType; +import org.apache.iceberg.types.Types.VariantType; +import org.apache.iceberg.util.DateTimeUtil; import org.apache.iceberg.util.UUIDUtil; +import org.apache.iceberg.variants.PhysicalType; +import org.apache.iceberg.variants.Variant; +import org.apache.iceberg.variants.VariantValue; import org.apache.kafka.connect.data.Decimal; import org.apache.kafka.connect.data.Schema; import org.apache.kafka.connect.data.SchemaBuilder; @@ -152,6 +158,9 @@ public class TestRecordConverter { NestedField.required( 100, "stma", MapType.ofRequired(101, 102, StringType.get(), ID_SCHEMA.asStruct()))); + private static final org.apache.iceberg.Schema VARIANT_SCHEMA = + new org.apache.iceberg.Schema(NestedField.required(1, "v", VariantType.get())); + private static final Schema CONNECT_SCHEMA = SchemaBuilder.struct() .field("i", Schema.INT32_SCHEMA) @@ -881,6 +890,364 @@ public void testEvolveTypeDetectionStructNested() { assertThat(updateMap.get("st.ff").type()).isInstanceOf(DoubleType.class); } + private RecordConverter variantConverter() { + Table table = mock(Table.class); + when(table.schema()).thenReturn(VARIANT_SCHEMA); + return new 
RecordConverter(table, config); + } + + @Test + public void testConvertVariantValueFromNull() { + Variant variant = variantConverter().convertVariantValue(null); + assertThat(variant).isNotNull(); + assertThat(variant.value().type()).isEqualTo(PhysicalType.NULL); + } + + @Test + public void testConvertVariantValuePassThrough() { + Variant original = variantConverter().convertVariantValue("hello"); + assertThat(variantConverter().convertVariantValue(original)).isSameAs(original); + } + + @Test + public void testConvertVariantValueFromPrimitiveString() { + Variant variant = variantConverter().convertVariantValue("hello"); + assertThat(variant).isNotNull(); + assertThat(variant.metadata()).isNotNull(); + assertThat(variant.metadata().dictionarySize()).isEqualTo(0); + assertThat(variant.value().type()).isEqualTo(PhysicalType.STRING); + assertThat(variant.value().asPrimitive().get()).isEqualTo("hello"); + } + + @Test + public void testConvertVariantValueFromPrimitiveNumber() { + Variant variant = variantConverter().convertVariantValue(123); + assertThat(variant).isNotNull(); + assertThat(variant.metadata().dictionarySize()).isEqualTo(0); + assertThat(variant.value().type()).isEqualTo(PhysicalType.INT32); + assertThat(variant.value().asPrimitive().get()).isEqualTo(123); + } + + @Test + public void testConvertVariantValueFromBoolean() { + Variant variant = variantConverter().convertVariantValue(true); + assertThat(variant).isNotNull(); + assertThat(variant.value().type()).isEqualTo(PhysicalType.BOOLEAN_TRUE); + assertThat(variant.value().asPrimitive().get()).isEqualTo(true); + } + + @Test + public void testConvertVariantValueFromInstant() { + Instant instant = Instant.parse("2025-04-04T12:34:56.789Z"); + Variant variant = variantConverter().convertVariantValue(instant); + assertThat(variant).isNotNull(); + assertThat(variant.metadata().dictionarySize()).isEqualTo(0); + assertThat(variant.value().type()).isEqualTo(PhysicalType.TIMESTAMPTZ); + 
assertThat(variant.value().asPrimitive().get()) + .isEqualTo(DateTimeUtil.microsFromInstant(instant)); + } + + @Test + public void testConvertVariantValueFromOffsetDateTime() { + OffsetDateTime odt = OffsetDateTime.parse("2025-04-04T12:34:56.789+09:00"); + Variant variant = variantConverter().convertVariantValue(odt); + assertThat(variant).isNotNull(); + assertThat(variant.metadata().dictionarySize()).isEqualTo(0); + assertThat(variant.value().type()).isEqualTo(PhysicalType.TIMESTAMPTZ); + assertThat(variant.value().asPrimitive().get()) + .isEqualTo(DateTimeUtil.microsFromTimestamptz(odt)); + } + + @Test + public void testConvertVariantValueFromZonedDateTime() { + ZonedDateTime zdt = ZonedDateTime.parse("2025-04-04T12:34:56.789-05:00[America/New_York]"); + Variant variant = variantConverter().convertVariantValue(zdt); + assertThat(variant).isNotNull(); + assertThat(variant.metadata().dictionarySize()).isEqualTo(0); + assertThat(variant.value().type()).isEqualTo(PhysicalType.TIMESTAMPTZ); + assertThat(variant.value().asPrimitive().get()) + .isEqualTo(DateTimeUtil.microsFromTimestamptz(zdt.toOffsetDateTime())); + } + + @Test + public void testConvertVariantValueFromLocalDateTime() { + LocalDateTime ldt = LocalDateTime.parse("2025-04-04T12:34:56.789"); + Variant variant = variantConverter().convertVariantValue(ldt); + assertThat(variant).isNotNull(); + assertThat(variant.metadata().dictionarySize()).isEqualTo(0); + assertThat(variant.value().type()).isEqualTo(PhysicalType.TIMESTAMPNTZ); + assertThat(variant.value().asPrimitive().get()) + .isEqualTo(DateTimeUtil.microsFromTimestamp(ldt)); + } + + @Test + public void testConvertVariantValueFromLocalDate() { + LocalDate date = LocalDate.of(2025, 4, 4); + Variant variant = variantConverter().convertVariantValue(date); + assertThat(variant).isNotNull(); + assertThat(variant.metadata().dictionarySize()).isEqualTo(0); + assertThat(variant.value().type()).isEqualTo(PhysicalType.DATE); + 
assertThat(variant.value().asPrimitive().get()).isEqualTo(DateTimeUtil.daysFromDate(date)); + } + + @Test + public void testConvertVariantValueFromLocalTime() { + LocalTime time = LocalTime.of(12, 34, 56, 789_000_000); + Variant variant = variantConverter().convertVariantValue(time); + assertThat(variant).isNotNull(); + assertThat(variant.metadata().dictionarySize()).isEqualTo(0); + assertThat(variant.value().type()).isEqualTo(PhysicalType.TIME); + assertThat(variant.value().asPrimitive().get()).isEqualTo(DateTimeUtil.microsFromTime(time)); + } + + @Test + public void testConvertVariantValueFromList() { + // array with heterogeneous element types (string, int, boolean, double, null, nested array/map, + // java.time primitives). Note: java.util.Date is not supported without Connect logical schema. + Instant instant = Instant.parse("2025-04-04T12:34:56.789Z"); + OffsetDateTime offsetTs = OffsetDateTime.parse("2025-04-04T12:34:56.789+09:00"); + ZonedDateTime zonedTs = ZonedDateTime.parse("2025-04-04T12:34:56.789-05:00[America/New_York]"); + LocalDateTime localTs = LocalDateTime.parse("2025-04-04T12:34:56.789"); + LocalDate localDate = LocalDate.of(2025, 4, 4); + LocalTime localTime = LocalTime.of(12, 34, 56, 789_000_000); + + List input = + Lists.newArrayList( + "a", + 1, + true, + 2.5, + null, + ImmutableList.of("a", "b"), + ImmutableMap.of("key1", "value1", "key2", "value2"), + instant, + offsetTs, + zonedTs, + localTs, + localDate, + localTime); + Variant variant = variantConverter().convertVariantValue(input); + + assertThat(variant).isNotNull(); + assertThat(variant.value().type()).isEqualTo(PhysicalType.ARRAY); + assertThat(variant.value().asArray().numElements()).isEqualTo(13); + + assertThat(variant.value().asArray().get(0).type()).isEqualTo(PhysicalType.STRING); + assertThat(variant.value().asArray().get(0).asPrimitive().get()).isEqualTo("a"); + + assertThat(variant.value().asArray().get(1).type()).isEqualTo(PhysicalType.INT32); + 
assertThat(variant.value().asArray().get(1).asPrimitive().get()).isEqualTo(1); + + assertThat(variant.value().asArray().get(2).type()).isEqualTo(PhysicalType.BOOLEAN_TRUE); + assertThat(variant.value().asArray().get(2).asPrimitive().get()).isEqualTo(true); + + assertThat(variant.value().asArray().get(3).type()).isEqualTo(PhysicalType.DOUBLE); + assertThat(variant.value().asArray().get(3).asPrimitive().get()).isEqualTo(2.5); + + assertThat(variant.value().asArray().get(4).type()).isEqualTo(PhysicalType.NULL); + + assertThat(variant.value().asArray().get(5).type()).isEqualTo(PhysicalType.ARRAY); + assertThat(variant.value().asArray().get(5).asArray().numElements()).isEqualTo(2); + assertThat(variant.value().asArray().get(5).asArray().get(0).asPrimitive().get()) + .isEqualTo("a"); + assertThat(variant.value().asArray().get(5).asArray().get(1).asPrimitive().get()) + .isEqualTo("b"); + + assertThat(variant.value().asArray().get(6).type()).isEqualTo(PhysicalType.OBJECT); + assertThat(variant.value().asArray().get(6).asObject().numFields()).isEqualTo(2); + assertThat(variant.value().asArray().get(6).asObject().get("key1").asPrimitive().get()) + .isEqualTo("value1"); + assertThat(variant.value().asArray().get(6).asObject().get("key2").asPrimitive().get()) + .isEqualTo("value2"); + + assertThat(variant.value().asArray().get(7).type()).isEqualTo(PhysicalType.TIMESTAMPTZ); + assertThat(variant.value().asArray().get(7).asPrimitive().get()) + .isEqualTo(DateTimeUtil.microsFromInstant(instant)); + + assertThat(variant.value().asArray().get(8).type()).isEqualTo(PhysicalType.TIMESTAMPTZ); + assertThat(variant.value().asArray().get(8).asPrimitive().get()) + .isEqualTo(DateTimeUtil.microsFromTimestamptz(offsetTs)); + + assertThat(variant.value().asArray().get(9).type()).isEqualTo(PhysicalType.TIMESTAMPTZ); + assertThat(variant.value().asArray().get(9).asPrimitive().get()) + .isEqualTo(DateTimeUtil.microsFromTimestamptz(zonedTs.toOffsetDateTime())); + + 
assertThat(variant.value().asArray().get(10).type()).isEqualTo(PhysicalType.TIMESTAMPNTZ); + assertThat(variant.value().asArray().get(10).asPrimitive().get()) + .isEqualTo(DateTimeUtil.microsFromTimestamp(localTs)); + + assertThat(variant.value().asArray().get(11).type()).isEqualTo(PhysicalType.DATE); + assertThat(variant.value().asArray().get(11).asPrimitive().get()) + .isEqualTo(DateTimeUtil.daysFromDate(localDate)); + + assertThat(variant.value().asArray().get(12).type()).isEqualTo(PhysicalType.TIME); + assertThat(variant.value().asArray().get(12).asPrimitive().get()) + .isEqualTo(DateTimeUtil.microsFromTime(localTime)); + } + + @Test + public void testConvertVariantValueFromMap() { + // heterogeneous top-level values, nested map, java.time primitives; + // metadata shares one sorted dictionary for the whole tree + Instant instant = Instant.parse("2025-04-04T12:34:56.789Z"); + OffsetDateTime offsetTs = OffsetDateTime.parse("2025-04-04T12:34:56.789+09:00"); + ZonedDateTime zonedTs = ZonedDateTime.parse("2025-04-04T12:34:56.789-05:00[America/New_York]"); + LocalDateTime localTs = LocalDateTime.parse("2025-04-04T12:34:56.789"); + LocalDate localDate = LocalDate.of(2025, 4, 4); + LocalTime localTime = LocalTime.of(12, 34, 56, 789_000_000); + + Map input = Maps.newLinkedHashMap(); + input.put("s", "text"); + input.put("i", 1); + input.put("bool", true); + input.put("d", 2.5); + input.put("n", null); + input.put("hello", ImmutableMap.of("world", 1)); + input.put("tags", ImmutableList.of("a", "b")); + input.put("instant", instant); + input.put("odt", offsetTs); + input.put("zdt", zonedTs); + input.put("ldt", localTs); + input.put("ldate", localDate); + input.put("ltime", localTime); + + Variant variant = variantConverter().convertVariantValue(input); + + assertThat(variant).isNotNull(); + assertThat(variant.metadata().dictionarySize()).isEqualTo(14); + assertThat(variant.metadata().get(0)).isEqualTo("bool"); + assertThat(variant.metadata().get(1)).isEqualTo("d"); + 
assertThat(variant.metadata().get(2)).isEqualTo("hello"); + assertThat(variant.metadata().get(3)).isEqualTo("i"); + assertThat(variant.metadata().get(4)).isEqualTo("instant"); + assertThat(variant.metadata().get(5)).isEqualTo("ldate"); + assertThat(variant.metadata().get(6)).isEqualTo("ldt"); + assertThat(variant.metadata().get(7)).isEqualTo("ltime"); + assertThat(variant.metadata().get(8)).isEqualTo("n"); + assertThat(variant.metadata().get(9)).isEqualTo("odt"); + assertThat(variant.metadata().get(10)).isEqualTo("s"); + assertThat(variant.metadata().get(11)).isEqualTo("tags"); + assertThat(variant.metadata().get(12)).isEqualTo("world"); + assertThat(variant.metadata().get(13)).isEqualTo("zdt"); + + assertThat(variant.value().type()).isEqualTo(PhysicalType.OBJECT); + assertThat(variant.value().asObject().numFields()).isEqualTo(13); + + assertThat(variant.value().asObject().get("bool").type()).isEqualTo(PhysicalType.BOOLEAN_TRUE); + assertThat(variant.value().asObject().get("bool").asPrimitive().get()).isEqualTo(true); + + assertThat(variant.value().asObject().get("d").type()).isEqualTo(PhysicalType.DOUBLE); + assertThat(variant.value().asObject().get("d").asPrimitive().get()).isEqualTo(2.5); + + assertThat(variant.value().asObject().get("i").type()).isEqualTo(PhysicalType.INT32); + assertThat(variant.value().asObject().get("i").asPrimitive().get()).isEqualTo(1); + + assertThat(variant.value().asObject().get("n").type()).isEqualTo(PhysicalType.NULL); + + assertThat(variant.value().asObject().get("s").type()).isEqualTo(PhysicalType.STRING); + assertThat(variant.value().asObject().get("s").asPrimitive().get()).isEqualTo("text"); + + VariantValue tags = variant.value().asObject().get("tags"); + assertThat(tags.type()).isEqualTo(PhysicalType.ARRAY); + assertThat(tags.asArray().numElements()).isEqualTo(2); + assertThat(tags.asArray().get(0).asPrimitive().get()).isEqualTo("a"); + assertThat(tags.asArray().get(1).asPrimitive().get()).isEqualTo("b"); + + 
assertThat(variant.value().asObject().get("instant").type()) + .isEqualTo(PhysicalType.TIMESTAMPTZ); + assertThat(variant.value().asObject().get("instant").asPrimitive().get()) + .isEqualTo(DateTimeUtil.microsFromInstant(instant)); + + assertThat(variant.value().asObject().get("odt").type()).isEqualTo(PhysicalType.TIMESTAMPTZ); + assertThat(variant.value().asObject().get("odt").asPrimitive().get()) + .isEqualTo(DateTimeUtil.microsFromTimestamptz(offsetTs)); + + assertThat(variant.value().asObject().get("zdt").type()).isEqualTo(PhysicalType.TIMESTAMPTZ); + assertThat(variant.value().asObject().get("zdt").asPrimitive().get()) + .isEqualTo(DateTimeUtil.microsFromTimestamptz(zonedTs.toOffsetDateTime())); + + assertThat(variant.value().asObject().get("ldt").type()).isEqualTo(PhysicalType.TIMESTAMPNTZ); + assertThat(variant.value().asObject().get("ldt").asPrimitive().get()) + .isEqualTo(DateTimeUtil.microsFromTimestamp(localTs)); + + assertThat(variant.value().asObject().get("ldate").type()).isEqualTo(PhysicalType.DATE); + assertThat(variant.value().asObject().get("ldate").asPrimitive().get()) + .isEqualTo(DateTimeUtil.daysFromDate(localDate)); + + assertThat(variant.value().asObject().get("ltime").type()).isEqualTo(PhysicalType.TIME); + assertThat(variant.value().asObject().get("ltime").asPrimitive().get()) + .isEqualTo(DateTimeUtil.microsFromTime(localTime)); + + VariantValue nested = variant.value().asObject().get("hello"); + assertThat(nested.type()).isEqualTo(PhysicalType.OBJECT); + assertThat(nested.asObject().get("world").asPrimitive().get()).isEqualTo(1); + } + + @Test + public void testConvertVariantValueFromStruct() { + // Nested Connect struct: primitives, array, and Timestamp / Time / Date (java.util.Date + + // logical types) + // 2025-04-04 12:34:56.789 UTC (aligned with java.time variant tests) + long tsMillis = 1743770096789L; + long timeMillis = 45296789L; + long dateMillis = 20182L * 86_400_000; + + Schema innerSchema = + SchemaBuilder.struct() + 
.field("i", Schema.INT32_SCHEMA) + .field("str", Schema.STRING_SCHEMA) + .field("tags", SchemaBuilder.array(Schema.STRING_SCHEMA).build()) + .field("ts", Timestamp.SCHEMA) + .field("t", Time.SCHEMA) + .field("d", org.apache.kafka.connect.data.Date.SCHEMA) + .build(); + Schema outerSchema = + SchemaBuilder.struct().field("inner", innerSchema).field("id", Schema.INT64_SCHEMA).build(); + Struct inner = + new Struct(innerSchema) + .put("i", 1) + .put("str", "world") + .put("tags", ImmutableList.of("a", "b")) + .put("ts", new Date(tsMillis)) + .put("t", new Date(timeMillis)) + .put("d", new Date(dateMillis)); + Struct outer = new Struct(outerSchema).put("inner", inner).put("id", 100L); + + Variant variant = variantConverter().convertVariantValue(outer); + + assertThat(variant).isNotNull(); + assertThat(variant.metadata().dictionarySize()).isEqualTo(8); + assertThat(variant.metadata().get(0)).isEqualTo("d"); + assertThat(variant.metadata().get(1)).isEqualTo("i"); + assertThat(variant.metadata().get(2)).isEqualTo("id"); + assertThat(variant.metadata().get(3)).isEqualTo("inner"); + assertThat(variant.metadata().get(4)).isEqualTo("str"); + assertThat(variant.metadata().get(5)).isEqualTo("t"); + assertThat(variant.metadata().get(6)).isEqualTo("tags"); + assertThat(variant.metadata().get(7)).isEqualTo("ts"); + + assertThat(variant.value().type()).isEqualTo(PhysicalType.OBJECT); + assertThat(variant.value().asObject().get("id").asPrimitive().get()).isEqualTo(100L); + + VariantValue innerVal = variant.value().asObject().get("inner"); + assertThat(innerVal.type()).isEqualTo(PhysicalType.OBJECT); + assertThat(innerVal.asObject().get("i").asPrimitive().get()).isEqualTo(1); + assertThat(innerVal.asObject().get("str").asPrimitive().get()).isEqualTo("world"); + assertThat(innerVal.asObject().get("tags").type()).isEqualTo(PhysicalType.ARRAY); + assertThat(innerVal.asObject().get("tags").asArray().numElements()).isEqualTo(2); + 
assertThat(innerVal.asObject().get("tags").asArray().get(0).asPrimitive().get()).isEqualTo("a"); + assertThat(innerVal.asObject().get("tags").asArray().get(1).asPrimitive().get()).isEqualTo("b"); + + assertThat(innerVal.asObject().get("ts").type()).isEqualTo(PhysicalType.TIMESTAMPTZ); + assertThat(innerVal.asObject().get("ts").asPrimitive().get()).isEqualTo(tsMillis * 1000); + + assertThat(innerVal.asObject().get("t").type()).isEqualTo(PhysicalType.TIME); + assertThat(innerVal.asObject().get("t").asPrimitive().get()).isEqualTo(timeMillis * 1000); + + assertThat(innerVal.asObject().get("d").type()).isEqualTo(PhysicalType.DATE); + assertThat(innerVal.asObject().get("d").asPrimitive().get()).isEqualTo(20182); + } + public static Map createMapData() { return ImmutableMap.builder() .put("i", 1) From c5a09634ef6236b35653c2233e0738a5de920e4f Mon Sep 17 00:00:00 2001 From: Steven Zhen Wu Date: Wed, 22 Apr 2026 21:56:42 -0700 Subject: [PATCH 087/197] REST Spec: Clarify identifier uniqueness across tables and views (#15691) * REST: Clarify that identifiers must be unique across all catalog object types Table and view identifiers share the same namespace scope, so a table and a view with the same name in the same namespace are not allowed. The rename and register-view endpoints already enforced this with "already exists as a table or view", but createTable, registerTable, and createView only guarded against same-type conflicts. This change makes all six write operations consistent by using the new CatalogObjectType schema, which enumerates the known object types (table, view) and states the uniqueness invariant explicitly. 
The 409 conflict descriptions are updated to: - "The identifier is already used by an existing catalog object (see `CatalogObjectType`)" - "The target identifier to rename to is already used by an existing catalog object (see `CatalogObjectType`)" Made-with: Cursor Model: claude-4.6-sonnet-medium-thinking * REST: Regenerate Python code for CatalogObjectType schema addition Made-with: Cursor Model: claude-4.6-sonnet-medium-thinking * Open API: Remove CatalogObjectType and clarify 409 conflict text Drop the unused CatalogObjectType schema and describe identifier conflicts in terms of existing tables or views. Made-with: Cursor Model: GPT-5.2 * update the error msg in the TableAlreadyExistsError and ViewAlreadyExistsError --- open-api/rest-catalog-open-api.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/open-api/rest-catalog-open-api.yaml b/open-api/rest-catalog-open-api.yaml index 2ef154f18f26..2435cd43f0e5 100644 --- a/open-api/rest-catalog-open-api.yaml +++ b/open-api/rest-catalog-open-api.yaml @@ -607,7 +607,7 @@ paths: NamespaceNotFound: $ref: '#/components/examples/NoSuchNamespaceError' 409: - description: Conflict - The table already exists + description: Conflict - The identifier already exists as a table or view content: application/json: schema: @@ -927,7 +927,7 @@ paths: NamespaceNotFound: $ref: '#/components/examples/NoSuchNamespaceError' 409: - description: Conflict - The table already exists + description: Conflict - The identifier already exists as a table or view content: application/json: schema: @@ -1593,7 +1593,7 @@ paths: NamespaceNotFound: $ref: '#/components/examples/NoSuchNamespaceError' 409: - description: Conflict - The view already exists + description: Conflict - The identifier already exists as a table or view content: application/json: schema: @@ -5225,7 +5225,7 @@ components: summary: The requested table identifier already exists value: { "error": { - "message": "The given table already exists", + 
"message": "The requested table identifier already exists", "type": "AlreadyExistsException", "code": 409 } @@ -5235,7 +5235,7 @@ components: summary: The requested view identifier already exists value: { "error": { - "message": "The given view already exists", + "message": "The requested view identifier already exists", "type": "AlreadyExistsException", "code": 409 } From 2e153ca04f2ed6cfbb329e6b9af055fa6e27bb73 Mon Sep 17 00:00:00 2001 From: Bharath Krishna Date: Thu, 23 Apr 2026 09:44:38 -0700 Subject: [PATCH 088/197] Spark 3.4, 3.5, 4.0: Include snapshotId and branch in SparkTable equals and hashCode (#15840) --- .../iceberg/spark/source/SparkTable.java | 17 +++--- .../iceberg/spark/source/TestSparkTable.java | 56 +++++++++++++++++++ .../iceberg/spark/source/SparkTable.java | 17 +++--- .../iceberg/spark/source/TestSparkTable.java | 56 +++++++++++++++++++ .../iceberg/spark/source/SparkTable.java | 17 +++--- .../iceberg/spark/source/TestSparkTable.java | 56 +++++++++++++++++++ 6 files changed, 198 insertions(+), 21 deletions(-) diff --git a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/source/SparkTable.java b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/source/SparkTable.java index 353566eb7f34..871ef9355200 100644 --- a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/source/SparkTable.java +++ b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/source/SparkTable.java @@ -23,6 +23,7 @@ import java.io.IOException; import java.util.Map; +import java.util.Objects; import java.util.Set; import org.apache.iceberg.BaseMetadataTable; import org.apache.iceberg.BaseTable; @@ -396,12 +397,13 @@ public void deleteWhere(Predicate[] predicates) { .set("spark.app.id", sparkSession().sparkContext().applicationId()) .deleteFromRowFilter(deleteExpr); + String writeBranch = branch; if (SparkTableUtil.wapEnabled(table())) { - branch = SparkTableUtil.determineWriteBranch(sparkSession(), branch); + writeBranch = 
SparkTableUtil.determineWriteBranch(sparkSession(), branch); } - if (branch != null) { - deleteFiles.toBranch(branch); + if (writeBranch != null) { + deleteFiles.toBranch(writeBranch); } if (!CommitMetadata.commitProperties().isEmpty()) { @@ -424,15 +426,16 @@ public boolean equals(Object other) { return false; } - // use only name in order to correctly invalidate Spark cache SparkTable that = (SparkTable) other; - return icebergTable.name().equals(that.icebergTable.name()); + return icebergTable.name().equals(that.icebergTable.name()) + && Objects.equals(table().uuid(), that.table().uuid()) + && Objects.equals(snapshotId, that.snapshotId) + && Objects.equals(branch, that.branch); } @Override public int hashCode() { - // use only name in order to correctly invalidate Spark cache - return icebergTable.name().hashCode(); + return Objects.hash(icebergTable.name(), table().uuid(), snapshotId, branch); } private static CaseInsensitiveStringMap addSnapshotId( diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkTable.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkTable.java index d14b1a52cf82..e3934faa60ce 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkTable.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkTable.java @@ -20,7 +20,9 @@ import static org.assertj.core.api.Assertions.assertThat; +import org.apache.iceberg.HistoryEntry; import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Table; import org.apache.iceberg.spark.CatalogTestBase; import org.apache.spark.sql.catalyst.analysis.NoSuchTableException; import org.apache.spark.sql.connector.catalog.CatalogManager; @@ -56,4 +58,58 @@ public void testTableEquality() throws NoSuchTableException { assertThat(table1).as("References must be different").isNotSameAs(table2); assertThat(table1).as("Tables must be equivalent").isEqualTo(table2); } + + @TestTemplate + 
public void testTableInequalityWithDifferentSnapshots() throws NoSuchTableException { + sql("INSERT INTO %s VALUES (1, 'a')", tableName); + sql("INSERT INTO %s VALUES (2, 'b')", tableName); + + CatalogManager catalogManager = spark.sessionState().catalogManager(); + TableCatalog catalog = (TableCatalog) catalogManager.catalog(catalogName); + Identifier identifier = Identifier.of(tableIdent.namespace().levels(), tableIdent.name()); + SparkTable table = (SparkTable) catalog.loadTable(identifier); + + Table icebergTable = validationCatalog.loadTable(tableIdent); + long[] snapshotIds = + icebergTable.history().stream().mapToLong(HistoryEntry::snapshotId).toArray(); + + SparkTable tableAtSnapshot1 = table.copyWithSnapshotId(snapshotIds[0]); + SparkTable tableAtSnapshot2 = table.copyWithSnapshotId(snapshotIds[1]); + + assertThat(tableAtSnapshot1) + .as("Tables at different snapshots must not be equal") + .isNotEqualTo(tableAtSnapshot2); + assertThat(tableAtSnapshot1.hashCode()) + .as("Hash codes should differ for different snapshots") + .isNotEqualTo(tableAtSnapshot2.hashCode()); + } + + @TestTemplate + public void testTableInequalityWithDifferentBranches() throws NoSuchTableException { + sql("INSERT INTO %s VALUES (1, 'a')", tableName); + + CatalogManager catalogManager = spark.sessionState().catalogManager(); + TableCatalog catalog = (TableCatalog) catalogManager.catalog(catalogName); + Identifier identifier = Identifier.of(tableIdent.namespace().levels(), tableIdent.name()); + + Table icebergTable = validationCatalog.loadTable(tableIdent); + icebergTable + .manageSnapshots() + .createBranch("testBranch", icebergTable.currentSnapshot().snapshotId()) + .commit(); + + // reload after branch creation so the table sees the new ref + SparkTable table = (SparkTable) catalog.loadTable(identifier); + table.table().refresh(); + + SparkTable tableOnMain = table.copyWithBranch("main"); + SparkTable tableOnBranch = table.copyWithBranch("testBranch"); + + assertThat(tableOnMain) + 
.as("Tables on different branches must not be equal") + .isNotEqualTo(tableOnBranch); + assertThat(tableOnMain.hashCode()) + .as("Hash codes should differ for different branches") + .isNotEqualTo(tableOnBranch.hashCode()); + } } diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkTable.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkTable.java index 353566eb7f34..871ef9355200 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkTable.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkTable.java @@ -23,6 +23,7 @@ import java.io.IOException; import java.util.Map; +import java.util.Objects; import java.util.Set; import org.apache.iceberg.BaseMetadataTable; import org.apache.iceberg.BaseTable; @@ -396,12 +397,13 @@ public void deleteWhere(Predicate[] predicates) { .set("spark.app.id", sparkSession().sparkContext().applicationId()) .deleteFromRowFilter(deleteExpr); + String writeBranch = branch; if (SparkTableUtil.wapEnabled(table())) { - branch = SparkTableUtil.determineWriteBranch(sparkSession(), branch); + writeBranch = SparkTableUtil.determineWriteBranch(sparkSession(), branch); } - if (branch != null) { - deleteFiles.toBranch(branch); + if (writeBranch != null) { + deleteFiles.toBranch(writeBranch); } if (!CommitMetadata.commitProperties().isEmpty()) { @@ -424,15 +426,16 @@ public boolean equals(Object other) { return false; } - // use only name in order to correctly invalidate Spark cache SparkTable that = (SparkTable) other; - return icebergTable.name().equals(that.icebergTable.name()); + return icebergTable.name().equals(that.icebergTable.name()) + && Objects.equals(table().uuid(), that.table().uuid()) + && Objects.equals(snapshotId, that.snapshotId) + && Objects.equals(branch, that.branch); } @Override public int hashCode() { - // use only name in order to correctly invalidate Spark cache - return icebergTable.name().hashCode(); + return 
Objects.hash(icebergTable.name(), table().uuid(), snapshotId, branch); } private static CaseInsensitiveStringMap addSnapshotId( diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkTable.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkTable.java index d14b1a52cf82..e3934faa60ce 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkTable.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkTable.java @@ -20,7 +20,9 @@ import static org.assertj.core.api.Assertions.assertThat; +import org.apache.iceberg.HistoryEntry; import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Table; import org.apache.iceberg.spark.CatalogTestBase; import org.apache.spark.sql.catalyst.analysis.NoSuchTableException; import org.apache.spark.sql.connector.catalog.CatalogManager; @@ -56,4 +58,58 @@ public void testTableEquality() throws NoSuchTableException { assertThat(table1).as("References must be different").isNotSameAs(table2); assertThat(table1).as("Tables must be equivalent").isEqualTo(table2); } + + @TestTemplate + public void testTableInequalityWithDifferentSnapshots() throws NoSuchTableException { + sql("INSERT INTO %s VALUES (1, 'a')", tableName); + sql("INSERT INTO %s VALUES (2, 'b')", tableName); + + CatalogManager catalogManager = spark.sessionState().catalogManager(); + TableCatalog catalog = (TableCatalog) catalogManager.catalog(catalogName); + Identifier identifier = Identifier.of(tableIdent.namespace().levels(), tableIdent.name()); + SparkTable table = (SparkTable) catalog.loadTable(identifier); + + Table icebergTable = validationCatalog.loadTable(tableIdent); + long[] snapshotIds = + icebergTable.history().stream().mapToLong(HistoryEntry::snapshotId).toArray(); + + SparkTable tableAtSnapshot1 = table.copyWithSnapshotId(snapshotIds[0]); + SparkTable tableAtSnapshot2 = table.copyWithSnapshotId(snapshotIds[1]); + + 
assertThat(tableAtSnapshot1) + .as("Tables at different snapshots must not be equal") + .isNotEqualTo(tableAtSnapshot2); + assertThat(tableAtSnapshot1.hashCode()) + .as("Hash codes should differ for different snapshots") + .isNotEqualTo(tableAtSnapshot2.hashCode()); + } + + @TestTemplate + public void testTableInequalityWithDifferentBranches() throws NoSuchTableException { + sql("INSERT INTO %s VALUES (1, 'a')", tableName); + + CatalogManager catalogManager = spark.sessionState().catalogManager(); + TableCatalog catalog = (TableCatalog) catalogManager.catalog(catalogName); + Identifier identifier = Identifier.of(tableIdent.namespace().levels(), tableIdent.name()); + + Table icebergTable = validationCatalog.loadTable(tableIdent); + icebergTable + .manageSnapshots() + .createBranch("testBranch", icebergTable.currentSnapshot().snapshotId()) + .commit(); + + // reload after branch creation so the table sees the new ref + SparkTable table = (SparkTable) catalog.loadTable(identifier); + table.table().refresh(); + + SparkTable tableOnMain = table.copyWithBranch("main"); + SparkTable tableOnBranch = table.copyWithBranch("testBranch"); + + assertThat(tableOnMain) + .as("Tables on different branches must not be equal") + .isNotEqualTo(tableOnBranch); + assertThat(tableOnMain.hashCode()) + .as("Hash codes should differ for different branches") + .isNotEqualTo(tableOnBranch.hashCode()); + } } diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkTable.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkTable.java index 1ee9e9b08074..6f0f992f1c20 100644 --- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkTable.java +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkTable.java @@ -23,6 +23,7 @@ import java.io.IOException; import java.util.Map; +import java.util.Objects; import java.util.Set; import org.apache.iceberg.BaseMetadataTable; import org.apache.iceberg.BaseTable; @@ -438,12 
+439,13 @@ public void deleteWhere(Predicate[] predicates) { .set("spark.app.id", sparkSession().sparkContext().applicationId()) .deleteFromRowFilter(deleteExpr); + String writeBranch = branch; if (SparkTableUtil.wapEnabled(table())) { - branch = SparkTableUtil.determineWriteBranch(sparkSession(), branch); + writeBranch = SparkTableUtil.determineWriteBranch(sparkSession(), branch); } - if (branch != null) { - deleteFiles.toBranch(branch); + if (writeBranch != null) { + deleteFiles.toBranch(writeBranch); } if (!CommitMetadata.commitProperties().isEmpty()) { @@ -466,15 +468,16 @@ public boolean equals(Object other) { return false; } - // use only name in order to correctly invalidate Spark cache SparkTable that = (SparkTable) other; - return icebergTable.name().equals(that.icebergTable.name()); + return icebergTable.name().equals(that.icebergTable.name()) + && Objects.equals(table().uuid(), that.table().uuid()) + && Objects.equals(snapshotId, that.snapshotId) + && Objects.equals(branch, that.branch); } @Override public int hashCode() { - // use only name in order to correctly invalidate Spark cache - return icebergTable.name().hashCode(); + return Objects.hash(icebergTable.name(), table().uuid(), snapshotId, branch); } private static CaseInsensitiveStringMap addSnapshotId( diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkTable.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkTable.java index d14b1a52cf82..e3934faa60ce 100644 --- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkTable.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkTable.java @@ -20,7 +20,9 @@ import static org.assertj.core.api.Assertions.assertThat; +import org.apache.iceberg.HistoryEntry; import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Table; import org.apache.iceberg.spark.CatalogTestBase; import 
org.apache.spark.sql.catalyst.analysis.NoSuchTableException; import org.apache.spark.sql.connector.catalog.CatalogManager; @@ -56,4 +58,58 @@ public void testTableEquality() throws NoSuchTableException { assertThat(table1).as("References must be different").isNotSameAs(table2); assertThat(table1).as("Tables must be equivalent").isEqualTo(table2); } + + @TestTemplate + public void testTableInequalityWithDifferentSnapshots() throws NoSuchTableException { + sql("INSERT INTO %s VALUES (1, 'a')", tableName); + sql("INSERT INTO %s VALUES (2, 'b')", tableName); + + CatalogManager catalogManager = spark.sessionState().catalogManager(); + TableCatalog catalog = (TableCatalog) catalogManager.catalog(catalogName); + Identifier identifier = Identifier.of(tableIdent.namespace().levels(), tableIdent.name()); + SparkTable table = (SparkTable) catalog.loadTable(identifier); + + Table icebergTable = validationCatalog.loadTable(tableIdent); + long[] snapshotIds = + icebergTable.history().stream().mapToLong(HistoryEntry::snapshotId).toArray(); + + SparkTable tableAtSnapshot1 = table.copyWithSnapshotId(snapshotIds[0]); + SparkTable tableAtSnapshot2 = table.copyWithSnapshotId(snapshotIds[1]); + + assertThat(tableAtSnapshot1) + .as("Tables at different snapshots must not be equal") + .isNotEqualTo(tableAtSnapshot2); + assertThat(tableAtSnapshot1.hashCode()) + .as("Hash codes should differ for different snapshots") + .isNotEqualTo(tableAtSnapshot2.hashCode()); + } + + @TestTemplate + public void testTableInequalityWithDifferentBranches() throws NoSuchTableException { + sql("INSERT INTO %s VALUES (1, 'a')", tableName); + + CatalogManager catalogManager = spark.sessionState().catalogManager(); + TableCatalog catalog = (TableCatalog) catalogManager.catalog(catalogName); + Identifier identifier = Identifier.of(tableIdent.namespace().levels(), tableIdent.name()); + + Table icebergTable = validationCatalog.loadTable(tableIdent); + icebergTable + .manageSnapshots() + .createBranch("testBranch", 
icebergTable.currentSnapshot().snapshotId()) + .commit(); + + // reload after branch creation so the table sees the new ref + SparkTable table = (SparkTable) catalog.loadTable(identifier); + table.table().refresh(); + + SparkTable tableOnMain = table.copyWithBranch("main"); + SparkTable tableOnBranch = table.copyWithBranch("testBranch"); + + assertThat(tableOnMain) + .as("Tables on different branches must not be equal") + .isNotEqualTo(tableOnBranch); + assertThat(tableOnMain.hashCode()) + .as("Hash codes should differ for different branches") + .isNotEqualTo(tableOnBranch.hashCode()); + } } From a4386b96971e306008d013e079d62dfef53ddcce Mon Sep 17 00:00:00 2001 From: Eduard Tudenhoefner Date: Thu, 23 Apr 2026 19:33:09 +0200 Subject: [PATCH 089/197] Core, Spark: Verify that TRUNCATE removes orphaned DVs (#16078) --- .../org/apache/iceberg/TestDeleteFiles.java | 71 +++++++++++++++++++ .../iceberg/spark/sql/TestDeleteFrom.java | 38 ++++++++++ 2 files changed, 109 insertions(+) diff --git a/core/src/test/java/org/apache/iceberg/TestDeleteFiles.java b/core/src/test/java/org/apache/iceberg/TestDeleteFiles.java index 68e5fa8b560e..d7cdd5c5d884 100644 --- a/core/src/test/java/org/apache/iceberg/TestDeleteFiles.java +++ b/core/src/test/java/org/apache/iceberg/TestDeleteFiles.java @@ -615,6 +615,10 @@ public void removingDataFileByExpressionAlsoRemovesDV() { .containsEntry(SnapshotSummary.REPLACED_MANIFESTS_COUNT, "2"); assertThat(deleteSnap.deleteManifests(table.io())).hasSize(1); + assertThat(deleteSnap.summary()) + .containsEntry(SnapshotSummary.REMOVED_DVS_PROP, "1") + .containsEntry(SnapshotSummary.REMOVED_DELETE_FILES_PROP, "1"); + validateDeleteManifest( deleteSnap.deleteManifests(table.io()).get(0), dataSeqs(1L, 1L), @@ -658,6 +662,10 @@ public void removingDataFileByPathAlsoRemovesDV() { .containsEntry(SnapshotSummary.REPLACED_MANIFESTS_COUNT, "2"); assertThat(deleteSnap.deleteManifests(table.io())).hasSize(1); + assertThat(deleteSnap.summary()) + 
.containsEntry(SnapshotSummary.REMOVED_DVS_PROP, "1") + .containsEntry(SnapshotSummary.REMOVED_DELETE_FILES_PROP, "1"); + validateDeleteManifest( deleteSnap.deleteManifests(table.io()).get(0), dataSeqs(1L, 1L), @@ -667,6 +675,69 @@ public void removingDataFileByPathAlsoRemovesDV() { statuses(ManifestEntry.Status.DELETED, ManifestEntry.Status.EXISTING)); } + @TestTemplate + public void removingDataFilesWhenTruncatingAlsoRemovesDVs() { + assumeThat(formatVersion).isGreaterThanOrEqualTo(3); + DeleteFile dv1 = + FileMetadata.deleteFileBuilder(SPEC) + .ofPositionDeletes() + .withPath("/path/to/data-1-deletes.puffin") + .withFileSizeInBytes(10) + .withPartitionPath("data_bucket=0") + .withRecordCount(5) + .withReferencedDataFile(DATA_FILE_BUCKET_0_IDS_0_2.location()) + .withContentOffset(4) + .withContentSizeInBytes(6) + .build(); + + DeleteFile dv2 = + FileMetadata.deleteFileBuilder(SPEC) + .ofPositionDeletes() + .withPath("/path/to/data-2-deletes.puffin") + .withFileSizeInBytes(10) + .withPartitionPath("data_bucket=0") + .withRecordCount(5) + .withReferencedDataFile(DATA_FILE_BUCKET_0_IDS_8_10.location()) + .withContentOffset(4) + .withContentSizeInBytes(6) + .build(); + + commit( + table, + table + .newRowDelta() + .addRows(DATA_FILE_BUCKET_0_IDS_0_2) + .addRows(DATA_FILE_BUCKET_0_IDS_8_10) + .addDeletes(dv1) + .addDeletes(dv2), + branch); + + Snapshot snapshot = latestSnapshot(table, branch); + assertThat(snapshot.sequenceNumber()).isEqualTo(1); + assertThat(table.ops().current().lastSequenceNumber()).isEqualTo(1); + + // deleting by row filter should also remove the orphaned dv1 from delete manifests. 
When a + // table is truncated via TRUNCATE, the row filter is sent as Expressions.alwaysTrue() + commit(table, table.newDelete().deleteFromRowFilter(Expressions.alwaysTrue()), branch); + + Snapshot deleteSnap = latestSnapshot(table, branch); + assertThat(deleteSnap.sequenceNumber()).isEqualTo(2); + assertThat(table.ops().current().lastSequenceNumber()).isEqualTo(2); + + assertThat(deleteSnap.deleteManifests(table.io())).hasSize(1); + assertThat(deleteSnap.summary()) + .containsEntry(SnapshotSummary.REMOVED_DVS_PROP, "2") + .containsEntry(SnapshotSummary.REMOVED_DELETE_FILES_PROP, "2"); + + validateDeleteManifest( + deleteSnap.deleteManifests(table.io()).get(0), + dataSeqs(1L, 1L), + fileSeqs(1L, 1L), + ids(deleteSnap.snapshotId(), deleteSnap.snapshotId()), + files(dv1, dv2), + statuses(Status.DELETED, Status.DELETED)); + } + private static ByteBuffer longToBuffer(long value) { return ByteBuffer.allocate(8).order(ByteOrder.LITTLE_ENDIAN).putLong(0, value); } diff --git a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestDeleteFrom.java b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestDeleteFrom.java index 536d568003cf..02c5ecd66b80 100644 --- a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestDeleteFrom.java +++ b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestDeleteFrom.java @@ -23,6 +23,7 @@ import java.util.List; import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.SnapshotSummary; import org.apache.iceberg.Table; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.Lists; @@ -185,4 +186,41 @@ public void testDeleteFromTablePartitionedByVarbinary() { ImmutableList.of(row(1L, new byte[] {-29, -68, -47})), sql("SELECT * FROM %s where data = X'e3bcd1'", tableName)); } + + @TestTemplate + public void truncateWithDVs() throws NoSuchTableException { + sql( + "CREATE TABLE %s (id bigint NOT 
NULL, data string) USING iceberg TBLPROPERTIES ('format-version'='3','write.delete.mode'='merge-on-read')", + tableName); + List records = + ImmutableList.of( + new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); + Dataset df = spark.createDataFrame(records, SimpleRecord.class); + df.coalesce(1).writeTo(tableName).append(); + + assertThat(sql("SELECT * FROM %s ORDER BY id", tableName)) + .containsExactly(row(1L, "a"), row(2L, "b"), row(3L, "c")); + + sql("DELETE FROM %s WHERE id = 1", tableName); + assertThat(validationCatalog.loadTable(tableIdent).currentSnapshot().summary()) + .containsEntry(SnapshotSummary.ADDED_DVS_PROP, "1") + .containsEntry(SnapshotSummary.ADDED_POS_DELETES_PROP, "1"); + + sql("DELETE FROM %s WHERE id = 2", tableName); + // DVs have been merged into single file + assertThat(validationCatalog.loadTable(tableIdent).currentSnapshot().summary()) + .containsEntry(SnapshotSummary.ADDED_DVS_PROP, "1") + .containsEntry(SnapshotSummary.REMOVED_DVS_PROP, "1") + .containsEntry(SnapshotSummary.ADDED_POS_DELETES_PROP, "2"); + + assertThat(sql("SELECT * FROM %s ORDER BY id", tableName)).containsExactly(row(3L, "c")); + + sql("TRUNCATE TABLE %s", tableName); + assertThat(validationCatalog.loadTable(tableIdent).currentSnapshot().summary()) + .containsEntry(SnapshotSummary.REMOVED_DVS_PROP, "1") + .containsEntry(SnapshotSummary.REMOVED_DELETE_FILES_PROP, "1") + .containsEntry(SnapshotSummary.REMOVED_POS_DELETES_PROP, "2"); + + assertThat(sql("SELECT * FROM %s ORDER BY id", tableName)).isEmpty(); + } } From 1d5463f687db2a9a80e1a35ac09f36b7e4e8cbeb Mon Sep 17 00:00:00 2001 From: Bharath Krishna Date: Thu, 23 Apr 2026 14:05:26 -0700 Subject: [PATCH 090/197] API: Implement notStartsWith bounds check in StrictMetricsEvaluator (#15883) --- .../expressions/StrictMetricsEvaluator.java | 34 +++- .../TestStrictMetricsEvaluator.java | 150 +++++++++++++++++- 2 files changed, 180 insertions(+), 4 deletions(-) diff --git 
a/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java b/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java index c225f21da8a8..5d981e7ed139 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java +++ b/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java @@ -22,6 +22,7 @@ import java.nio.ByteBuffer; import java.util.Collection; +import java.util.Comparator; import java.util.Map; import java.util.Set; import java.util.stream.Collectors; @@ -29,6 +30,7 @@ import org.apache.iceberg.DataFile; import org.apache.iceberg.Schema; import org.apache.iceberg.expressions.ExpressionVisitors.BoundExpressionVisitor; +import org.apache.iceberg.types.Comparators; import org.apache.iceberg.types.Conversions; import org.apache.iceberg.types.Types.StructType; import org.apache.iceberg.util.NaNUtil; @@ -467,8 +469,36 @@ public Boolean startsWith(BoundReference ref, Literal lit) { @Override public Boolean notStartsWith(BoundReference ref, Literal lit) { - // TODO: Handle cases that definitely cannot match, such as notStartsWith("x") when the bounds - // are ["a", "b"]. 
+ int id = ref.fieldId(); + if (isNestedColumn(id)) { + return ROWS_MIGHT_NOT_MATCH; + } + + if (containsNullsOnly(id)) { + return ROWS_MUST_MATCH; + } + + String prefix = (String) lit.value(); + Comparator comparator = Comparators.charSequences(); + + if (lowerBounds != null && lowerBounds.containsKey(id)) { + CharSequence lower = Conversions.fromByteBuffer(ref.type(), lowerBounds.get(id)); + // truncate lower bound so that its length is not greater than the length of prefix + int length = Math.min(prefix.length(), lower.length()); + if (comparator.compare(lower.subSequence(0, length), prefix) > 0) { + return ROWS_MUST_MATCH; + } + } + + if (upperBounds != null && upperBounds.containsKey(id)) { + CharSequence upper = Conversions.fromByteBuffer(ref.type(), upperBounds.get(id)); + // truncate upper bound so that its length is not greater than the length of prefix + int length = Math.min(prefix.length(), upper.length()); + if (comparator.compare(upper.subSequence(0, length), prefix) < 0) { + return ROWS_MUST_MATCH; + } + } + return ROWS_MIGHT_NOT_MATCH; } diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java b/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java index f34cd730df77..99800f5171ba 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java @@ -32,6 +32,7 @@ import static org.apache.iceberg.expressions.Expressions.notIn; import static org.apache.iceberg.expressions.Expressions.notNaN; import static org.apache.iceberg.expressions.Expressions.notNull; +import static org.apache.iceberg.expressions.Expressions.notStartsWith; import static org.apache.iceberg.expressions.Expressions.or; import static org.apache.iceberg.types.Conversions.toByteBuffer; import static org.apache.iceberg.types.Types.NestedField.optional; @@ -72,8 +73,8 @@ public class TestStrictMetricsEvaluator { 
"struct", Types.StructType.of( Types.NestedField.optional(16, "nested_col_no_stats", Types.IntegerType.get()), - Types.NestedField.optional( - 17, "nested_col_with_stats", Types.IntegerType.get())))); + Types.NestedField.optional(17, "nested_col_with_stats", Types.IntegerType.get()), + Types.NestedField.optional(18, "nested_string_col", Types.StringType.get())))); private static final int INT_MIN_VALUE = 30; private static final int INT_MAX_VALUE = 79; @@ -172,6 +173,40 @@ public class TestStrictMetricsEvaluator { // upper bounds ImmutableMap.of(5, toByteBuffer(StringType.get(), "bbb"))); + // String-focused file: required column 3 has no nulls and string bounds ["abc", "abd"] + private static final DataFile STRING_FILE = + new TestDataFile( + "string_file.avro", + Row.of(), + 50, + // any value counts, including nulls + ImmutableMap.of(3, 50L), + // null value counts + ImmutableMap.of(), + // nan value counts + null, + // lower bounds + ImmutableMap.of(3, toByteBuffer(StringType.get(), "abc")), + // upper bounds + ImmutableMap.of(3, toByteBuffer(StringType.get(), "abd"))); + + // String file with wider range: required column 3 has no nulls and bounds ["aa", "dC"] + private static final DataFile STRING_FILE_2 = + new TestDataFile( + "string_file_2.avro", + Row.of(), + 50, + // any value counts, including nulls + ImmutableMap.of(3, 50L), + // null value counts + ImmutableMap.of(), + // nan value counts + null, + // lower bounds + ImmutableMap.of(3, toByteBuffer(StringType.get(), "aa")), + // upper bounds + ImmutableMap.of(3, toByteBuffer(StringType.get(), "dC"))); + @Test public void testAllNulls() { boolean shouldRead = new StrictMetricsEvaluator(SCHEMA, notNull("all_nulls")).eval(FILE); @@ -684,4 +719,115 @@ SCHEMA, lessThanOrEqual("struct.nested_col_with_stats", INT_MAX_VALUE)) new StrictMetricsEvaluator(SCHEMA, notNull("struct.nested_col_with_stats")).eval(FILE); assertThat(shouldRead).as("notNull nested column should not match").isFalse(); } + + @Test + public 
void testNotStartsWithAllNulls() { + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, notStartsWith("all_nulls", "a")).eval(FILE); + assertThat(shouldRead).as("Should match: all null values satisfy notStartsWith").isTrue(); + } + + @Test + public void testNotStartsWithBoundsAbovePrefix() { + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "aaa")).eval(STRING_FILE); + assertThat(shouldRead).as("Should match: all values are above the prefix range").isTrue(); + } + + @Test + public void testNotStartsWithBoundsBelowPrefix() { + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "zzz")).eval(STRING_FILE); + assertThat(shouldRead).as("Should match: all values are below the prefix range").isTrue(); + } + + @Test + public void testNotStartsWithBoundsOverlapPrefix() { + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "ab")).eval(STRING_FILE); + assertThat(shouldRead).as("Should not match: bounds overlap the prefix range").isFalse(); + + shouldRead = + new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "abc")).eval(STRING_FILE); + assertThat(shouldRead).as("Should not match: lower bound starts with the prefix").isFalse(); + } + + @Test + public void testNotStartsWithWiderRange() { + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "e")).eval(STRING_FILE_2); + assertThat(shouldRead).as("Should match: all values are below the prefix").isTrue(); + + shouldRead = + new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "a")).eval(STRING_FILE_2); + assertThat(shouldRead).as("Should not match: lower bound starts with the prefix").isFalse(); + + shouldRead = + new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "c")).eval(STRING_FILE_2); + assertThat(shouldRead).as("Should not match: prefix is within the bounds range").isFalse(); + } + + @Test + public void testNotStartsWithNoStats() { + boolean shouldRead = + 
new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "a")).eval(FILE); + assertThat(shouldRead).as("Should not match: no bounds available for column").isFalse(); + } + + @Test + public void testNotStartsWithSomeNullsBoundsOutsidePrefix() { + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, notStartsWith("some_nulls", "zzz")).eval(FILE_2); + assertThat(shouldRead).as("Should match: all values are below the prefix").isTrue(); + + shouldRead = + new StrictMetricsEvaluator(SCHEMA, notStartsWith("some_nulls", "aaa")).eval(FILE_2); + assertThat(shouldRead).as("Should match: all values are above the prefix").isTrue(); + } + + @Test + public void testNotStartsWithPrefixLongerThanBounds() { + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "aaaaaaa")).eval(STRING_FILE); + assertThat(shouldRead).as("Should match: all values are above the long prefix").isTrue(); + + shouldRead = + new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "zzzzzzz")).eval(STRING_FILE); + assertThat(shouldRead).as("Should match: all values are below the long prefix").isTrue(); + + shouldRead = + new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "abcdef")).eval(STRING_FILE); + assertThat(shouldRead).as("Should not match: prefix overlaps with bound range").isFalse(); + } + + @Test + void testNotStartsWithEmptyPrefix() { + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, notStartsWith("required", "")).eval(STRING_FILE); + assertThat(shouldRead).as("Should not match: all strings start with empty prefix").isFalse(); + } + + @Test + void testNotStartsWithExactBoundMatch() { + // FILE_3 has column 5 (some_nulls) with exact bounds ["bbb", "bbb"] + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, notStartsWith("some_nulls", "bbb")).eval(FILE_3); + assertThat(shouldRead).as("Should not match: bounds exactly equal the prefix").isFalse(); + + shouldRead = + new StrictMetricsEvaluator(SCHEMA, notStartsWith("some_nulls", 
"zzz")).eval(FILE_3); + assertThat(shouldRead).as("Should match: all values are below the prefix").isTrue(); + + shouldRead = + new StrictMetricsEvaluator(SCHEMA, notStartsWith("some_nulls", "aaa")).eval(FILE_3); + assertThat(shouldRead).as("Should match: all values are above the prefix").isTrue(); + } + + @Test + public void testNotStartsWithNestedColumn() { + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, notStartsWith("struct.nested_string_col", "a")) + .eval(FILE); + assertThat(shouldRead).as("notStartsWith nested column should not match").isFalse(); + } } From 52092cd1109cba1e2f0c87959775e5f883a2cfd0 Mon Sep 17 00:00:00 2001 From: Anoop Johnson Date: Thu, 23 Apr 2026 14:17:44 -0700 Subject: [PATCH 091/197] Core: Add implementations of v4 TrackedFile interfaces (#15854) --- .../java/org/apache/iceberg/FileContent.java | 8 + .../org/apache/iceberg/DeletionVector.java | 3 + .../apache/iceberg/DeletionVectorStruct.java | 127 ++++++ .../java/org/apache/iceberg/ManifestInfo.java | 3 + .../apache/iceberg/ManifestInfoStruct.java | 227 +++++++++++ .../java/org/apache/iceberg/TrackedFile.java | 6 - .../org/apache/iceberg/TrackedFileStruct.java | 328 +++++++++++++++ .../java/org/apache/iceberg/Tracking.java | 9 + .../org/apache/iceberg/TrackingStruct.java | 241 +++++++++++ .../iceberg/TestDeletionVectorStruct.java | 118 ++++++ .../iceberg/TestManifestInfoStruct.java | 189 +++++++++ .../apache/iceberg/TestTrackedFileStruct.java | 376 ++++++++++++++++++ .../apache/iceberg/TestTrackingStruct.java | 220 ++++++++++ 13 files changed, 1849 insertions(+), 6 deletions(-) create mode 100644 core/src/main/java/org/apache/iceberg/DeletionVectorStruct.java create mode 100644 core/src/main/java/org/apache/iceberg/ManifestInfoStruct.java create mode 100644 core/src/main/java/org/apache/iceberg/TrackedFileStruct.java create mode 100644 core/src/main/java/org/apache/iceberg/TrackingStruct.java create mode 100644 
core/src/test/java/org/apache/iceberg/TestDeletionVectorStruct.java create mode 100644 core/src/test/java/org/apache/iceberg/TestManifestInfoStruct.java create mode 100644 core/src/test/java/org/apache/iceberg/TestTrackedFileStruct.java create mode 100644 core/src/test/java/org/apache/iceberg/TestTrackingStruct.java diff --git a/api/src/main/java/org/apache/iceberg/FileContent.java b/api/src/main/java/org/apache/iceberg/FileContent.java index 1ee1d290b767..9d5ab8ceeec9 100644 --- a/api/src/main/java/org/apache/iceberg/FileContent.java +++ b/api/src/main/java/org/apache/iceberg/FileContent.java @@ -18,6 +18,8 @@ */ package org.apache.iceberg; +import java.util.Locale; + /** Content type stored in a file. */ public enum FileContent { DATA(0), @@ -29,15 +31,21 @@ public enum FileContent { private static final FileContent[] VALUES = FileContent.values(); private final int id; + private final String lowerCaseName; FileContent(int id) { this.id = id; + this.lowerCaseName = name().toLowerCase(Locale.ROOT); } public int id() { return id; } + public String lowerCaseName() { + return lowerCaseName; + } + public static FileContent fromId(int id) { return VALUES[id]; } diff --git a/core/src/main/java/org/apache/iceberg/DeletionVector.java b/core/src/main/java/org/apache/iceberg/DeletionVector.java index 55bd38dc97be..0fc8f259f075 100644 --- a/core/src/main/java/org/apache/iceberg/DeletionVector.java +++ b/core/src/main/java/org/apache/iceberg/DeletionVector.java @@ -61,4 +61,7 @@ static Types.StructType schema() { /** Returns the number of set bits (deleted rows) in the vector. */ long cardinality(); + + /** Copies this deletion vector. 
*/ + DeletionVector copy(); } diff --git a/core/src/main/java/org/apache/iceberg/DeletionVectorStruct.java b/core/src/main/java/org/apache/iceberg/DeletionVectorStruct.java new file mode 100644 index 000000000000..389036ce237b --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/DeletionVectorStruct.java @@ -0,0 +1,127 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg; + +import java.io.Serializable; +import org.apache.iceberg.avro.SupportsIndexProjection; +import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; +import org.apache.iceberg.types.Types; + +/** Mutable {@link StructLike} implementation of {@link DeletionVector}. 
*/ +class DeletionVectorStruct extends SupportsIndexProjection implements DeletionVector, Serializable { + private static final Types.StructType BASE_TYPE = + Types.StructType.of( + DeletionVector.LOCATION, + DeletionVector.OFFSET, + DeletionVector.SIZE_IN_BYTES, + DeletionVector.CARDINALITY); + + private String location = null; + private long offset = -1L; + private long sizeInBytes = -1L; + private long cardinality = -1L; + + DeletionVectorStruct(Types.StructType type) { + super(BASE_TYPE, type); + } + + private DeletionVectorStruct(DeletionVectorStruct toCopy) { + super(toCopy); + this.location = toCopy.location; + this.offset = toCopy.offset; + this.sizeInBytes = toCopy.sizeInBytes; + this.cardinality = toCopy.cardinality; + } + + @Override + public String location() { + return location; + } + + @Override + public long offset() { + return offset; + } + + @Override + public long sizeInBytes() { + return sizeInBytes; + } + + @Override + public long cardinality() { + return cardinality; + } + + @Override + public DeletionVectorStruct copy() { + return new DeletionVectorStruct(this); + } + + @Override + protected T internalGet(int pos, Class javaClass) { + return javaClass.cast(getByPos(pos)); + } + + private Object getByPos(int pos) { + switch (pos) { + case 0: + return location; + case 1: + return offset; + case 2: + return sizeInBytes; + case 3: + return cardinality; + default: + throw new UnsupportedOperationException("Unknown field ordinal: " + pos); + } + } + + @Override + protected void internalSet(int pos, T value) { + switch (pos) { + case 0: + // always coerce to String for Serializable + this.location = value.toString(); + break; + case 1: + this.offset = (Long) value; + break; + case 2: + this.sizeInBytes = (Long) value; + break; + case 3: + this.cardinality = (Long) value; + break; + default: + // ignore the object, it must be from a newer version of the format + } + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this) 
+ .add("location", location) + .add("offset", offset) + .add("size_in_bytes", sizeInBytes) + .add("cardinality", cardinality) + .toString(); + } +} diff --git a/core/src/main/java/org/apache/iceberg/ManifestInfo.java b/core/src/main/java/org/apache/iceberg/ManifestInfo.java index a4651b0eadb0..e87287911426 100644 --- a/core/src/main/java/org/apache/iceberg/ManifestInfo.java +++ b/core/src/main/java/org/apache/iceberg/ManifestInfo.java @@ -110,4 +110,7 @@ static Types.StructType schema() { /** Returns the number of entries marked as deleted in the DV, or null if not present. */ Long dvCardinality(); + + /** Copies this manifest info. */ + ManifestInfo copy(); } diff --git a/core/src/main/java/org/apache/iceberg/ManifestInfoStruct.java b/core/src/main/java/org/apache/iceberg/ManifestInfoStruct.java new file mode 100644 index 000000000000..8f51df749e33 --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/ManifestInfoStruct.java @@ -0,0 +1,227 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg; + +import java.io.Serializable; +import java.nio.ByteBuffer; +import java.util.Arrays; +import org.apache.iceberg.avro.SupportsIndexProjection; +import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.util.ByteBuffers; + +/** Mutable {@link StructLike} implementation of {@link ManifestInfo}. */ +class ManifestInfoStruct extends SupportsIndexProjection implements ManifestInfo, Serializable { + private static final Types.StructType BASE_TYPE = + Types.StructType.of( + ManifestInfo.ADDED_FILES_COUNT, + ManifestInfo.EXISTING_FILES_COUNT, + ManifestInfo.DELETED_FILES_COUNT, + ManifestInfo.REPLACED_FILES_COUNT, + ManifestInfo.ADDED_ROWS_COUNT, + ManifestInfo.EXISTING_ROWS_COUNT, + ManifestInfo.DELETED_ROWS_COUNT, + ManifestInfo.REPLACED_ROWS_COUNT, + ManifestInfo.MIN_SEQUENCE_NUMBER, + ManifestInfo.DV, + ManifestInfo.DV_CARDINALITY); + + private int addedFilesCount = -1; + private int existingFilesCount = -1; + private int deletedFilesCount = -1; + private int replacedFilesCount = -1; + private long addedRowsCount = -1L; + private long existingRowsCount = -1L; + private long deletedRowsCount = -1L; + private long replacedRowsCount = -1L; + private long minSequenceNumber = -1L; + private byte[] dv = null; + private Long dvCardinality = null; + + ManifestInfoStruct(Types.StructType type) { + super(BASE_TYPE, type); + } + + private ManifestInfoStruct(ManifestInfoStruct toCopy) { + super(toCopy); + this.addedFilesCount = toCopy.addedFilesCount; + this.existingFilesCount = toCopy.existingFilesCount; + this.deletedFilesCount = toCopy.deletedFilesCount; + this.replacedFilesCount = toCopy.replacedFilesCount; + this.addedRowsCount = toCopy.addedRowsCount; + this.existingRowsCount = toCopy.existingRowsCount; + this.deletedRowsCount = toCopy.deletedRowsCount; + this.replacedRowsCount = toCopy.replacedRowsCount; + this.minSequenceNumber = toCopy.minSequenceNumber; + 
this.dv = toCopy.dv != null ? Arrays.copyOf(toCopy.dv, toCopy.dv.length) : null; + this.dvCardinality = toCopy.dvCardinality; + } + + @Override + public int addedFilesCount() { + return addedFilesCount; + } + + @Override + public int existingFilesCount() { + return existingFilesCount; + } + + @Override + public int deletedFilesCount() { + return deletedFilesCount; + } + + @Override + public int replacedFilesCount() { + return replacedFilesCount; + } + + @Override + public long addedRowsCount() { + return addedRowsCount; + } + + @Override + public long existingRowsCount() { + return existingRowsCount; + } + + @Override + public long deletedRowsCount() { + return deletedRowsCount; + } + + @Override + public long replacedRowsCount() { + return replacedRowsCount; + } + + @Override + public long minSequenceNumber() { + return minSequenceNumber; + } + + @Override + public ByteBuffer dv() { + return dv != null ? ByteBuffer.wrap(dv) : null; + } + + @Override + public Long dvCardinality() { + return dvCardinality; + } + + @Override + public ManifestInfoStruct copy() { + return new ManifestInfoStruct(this); + } + + @Override + protected T internalGet(int pos, Class javaClass) { + return javaClass.cast(getByPos(pos)); + } + + private Object getByPos(int pos) { + switch (pos) { + case 0: + return addedFilesCount; + case 1: + return existingFilesCount; + case 2: + return deletedFilesCount; + case 3: + return replacedFilesCount; + case 4: + return addedRowsCount; + case 5: + return existingRowsCount; + case 6: + return deletedRowsCount; + case 7: + return replacedRowsCount; + case 8: + return minSequenceNumber; + case 9: + return dv(); + case 10: + return dvCardinality; + default: + throw new UnsupportedOperationException("Unknown field ordinal: " + pos); + } + } + + @Override + protected void internalSet(int pos, T value) { + switch (pos) { + case 0: + this.addedFilesCount = (Integer) value; + break; + case 1: + this.existingFilesCount = (Integer) value; + break; + case 2: + 
this.deletedFilesCount = (Integer) value; + break; + case 3: + this.replacedFilesCount = (Integer) value; + break; + case 4: + this.addedRowsCount = (Long) value; + break; + case 5: + this.existingRowsCount = (Long) value; + break; + case 6: + this.deletedRowsCount = (Long) value; + break; + case 7: + this.replacedRowsCount = (Long) value; + break; + case 8: + this.minSequenceNumber = (Long) value; + break; + case 9: + this.dv = ByteBuffers.toByteArray((ByteBuffer) value); + break; + case 10: + this.dvCardinality = (Long) value; + break; + default: + // ignore the object, it must be from a newer version of the format + } + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this) + .add("added_files_count", addedFilesCount) + .add("existing_files_count", existingFilesCount) + .add("deleted_files_count", deletedFilesCount) + .add("replaced_files_count", replacedFilesCount) + .add("added_rows_count", addedRowsCount) + .add("existing_rows_count", existingRowsCount) + .add("deleted_rows_count", deletedRowsCount) + .add("replaced_rows_count", replacedRowsCount) + .add("min_sequence_number", minSequenceNumber) + .add("dv", dv == null ? "null" : "(binary)") + .add("dv_cardinality", dvCardinality == null ? "null" : dvCardinality) + .toString(); + } +} diff --git a/core/src/main/java/org/apache/iceberg/TrackedFile.java b/core/src/main/java/org/apache/iceberg/TrackedFile.java index d15f9e582cb6..d9ae100ac651 100644 --- a/core/src/main/java/org/apache/iceberg/TrackedFile.java +++ b/core/src/main/java/org/apache/iceberg/TrackedFile.java @@ -163,10 +163,4 @@ static Types.StructType schemaWithContentStats(Types.StructType contentStatsType default TrackedFile copyWithoutStats() { return copyWithStats(Collections.emptySet()); } - - /** Returns the manifest location this entry was read from, or null. */ - String manifestLocation(); - - /** Returns the ordinal position of this entry within the manifest. 
*/ - long manifestPos(); } diff --git a/core/src/main/java/org/apache/iceberg/TrackedFileStruct.java b/core/src/main/java/org/apache/iceberg/TrackedFileStruct.java new file mode 100644 index 000000000000..ba9fd362038a --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/TrackedFileStruct.java @@ -0,0 +1,328 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg; + +import java.io.Serializable; +import java.nio.ByteBuffer; +import java.util.Arrays; +import java.util.List; +import java.util.Set; +import org.apache.iceberg.avro.SupportsIndexProjection; +import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.util.ArrayUtil; +import org.apache.iceberg.util.ByteBuffers; + +/** Mutable {@link StructLike} implementation of {@link TrackedFile}. 
*/ +class TrackedFileStruct extends SupportsIndexProjection implements TrackedFile, Serializable { + private static final Types.StructType EMPTY_STRUCT_TYPE = Types.StructType.of(); + + private static final Types.StructType BASE_TYPE = + Types.StructType.of( + TrackedFile.TRACKING, + TrackedFile.CONTENT_TYPE, + TrackedFile.LOCATION, + TrackedFile.FILE_FORMAT, + TrackedFile.RECORD_COUNT, + TrackedFile.FILE_SIZE_IN_BYTES, + TrackedFile.SPEC_ID, + Types.NestedField.optional( + TrackedFile.CONTENT_STATS_ID, + TrackedFile.CONTENT_STATS_NAME, + EMPTY_STRUCT_TYPE, + TrackedFile.CONTENT_STATS_DOC), + TrackedFile.SORT_ORDER_ID, + TrackedFile.DELETION_VECTOR, + TrackedFile.MANIFEST_INFO, + TrackedFile.KEY_METADATA, + TrackedFile.SPLIT_OFFSETS, + TrackedFile.EQUALITY_IDS); + + private FileContent contentType = null; + private String location = null; + private FileFormat fileFormat = null; + private long recordCount = -1L; + private long fileSizeInBytes = -1L; + private Integer specId = null; + + // optional fields + private Tracking tracking = null; + private ContentStats contentStats = null; + private Integer sortOrderId = null; + private DeletionVector deletionVector = null; + private ManifestInfo manifestInfo = null; + private byte[] keyMetadata = null; + private long[] splitOffsets = null; + private int[] equalityIds = null; + + /** Used by internal readers to instantiate this class with a projection schema. */ + TrackedFileStruct(Types.StructType projection) { + super(BASE_TYPE, projection); + } + + /** No-projection constructor for direct construction. */ + TrackedFileStruct() { + super(BASE_TYPE.fields().size()); + } + + /** Constructor that accepts required fields. 
*/ + TrackedFileStruct( + Tracking tracking, + FileContent contentType, + String location, + FileFormat fileFormat, + long recordCount, + long fileSizeInBytes) { + super(BASE_TYPE.fields().size()); + this.tracking = tracking; + this.contentType = contentType; + this.location = location; + this.fileFormat = fileFormat; + this.recordCount = recordCount; + this.fileSizeInBytes = fileSizeInBytes; + } + + /** Copy constructor. */ + private TrackedFileStruct(TrackedFileStruct toCopy, boolean withStats, Set statsIds) { + super(toCopy); + this.contentType = toCopy.contentType; + this.location = toCopy.location; + this.fileFormat = toCopy.fileFormat; + this.recordCount = toCopy.recordCount; + this.fileSizeInBytes = toCopy.fileSizeInBytes; + this.specId = toCopy.specId; + + this.tracking = toCopy.tracking != null ? toCopy.tracking.copy() : null; + + this.sortOrderId = toCopy.sortOrderId; + this.deletionVector = toCopy.deletionVector != null ? toCopy.deletionVector.copy() : null; + + if (withStats && toCopy.contentStats != null) { + ContentStats filtered = BaseContentStats.buildFrom(toCopy.contentStats, statsIds).build(); + this.contentStats = filtered.fieldStats().isEmpty() ? null : filtered; + } else { + this.contentStats = null; + } + + this.manifestInfo = toCopy.manifestInfo != null ? toCopy.manifestInfo.copy() : null; + this.keyMetadata = + toCopy.keyMetadata != null + ? Arrays.copyOf(toCopy.keyMetadata, toCopy.keyMetadata.length) + : null; + this.splitOffsets = + toCopy.splitOffsets != null + ? Arrays.copyOf(toCopy.splitOffsets, toCopy.splitOffsets.length) + : null; + this.equalityIds = + toCopy.equalityIds != null + ? 
Arrays.copyOf(toCopy.equalityIds, toCopy.equalityIds.length) + : null; + } + + @Override + public Tracking tracking() { + return tracking; + } + + @Override + public FileContent contentType() { + return contentType; + } + + @Override + public String location() { + return location; + } + + @Override + public FileFormat fileFormat() { + return fileFormat; + } + + @Override + public long recordCount() { + return recordCount; + } + + @Override + public long fileSizeInBytes() { + return fileSizeInBytes; + } + + @Override + public Integer specId() { + return specId; + } + + @Override + public ContentStats contentStats() { + return contentStats; + } + + @Override + public Integer sortOrderId() { + return sortOrderId; + } + + @Override + public DeletionVector deletionVector() { + return deletionVector; + } + + @Override + public ManifestInfo manifestInfo() { + return manifestInfo; + } + + @Override + public ByteBuffer keyMetadata() { + return keyMetadata != null ? ByteBuffer.wrap(keyMetadata) : null; + } + + @Override + public List splitOffsets() { + return splitOffsets != null ? ArrayUtil.toUnmodifiableLongList(splitOffsets) : null; + } + + @Override + public List equalityIds() { + return equalityIds != null ? ArrayUtil.toUnmodifiableIntList(equalityIds) : null; + } + + @Override + public TrackedFile copy() { + return new TrackedFileStruct(this, true, null); + } + + @Override + public TrackedFile copyWithStats(Set requestedColumnIds) { + return new TrackedFileStruct(this, true, requestedColumnIds); + } + + @Override + protected T internalGet(int pos, Class javaClass) { + return javaClass.cast(getByPos(pos)); + } + + private Object getByPos(int pos) { + switch (pos) { + case 0: + return tracking; + case 1: + return contentType != null ? contentType.id() : null; + case 2: + return location; + case 3: + return fileFormat != null ? 
fileFormat.toString() : null; + case 4: + return recordCount; + case 5: + return fileSizeInBytes; + case 6: + return specId; + case 7: + return contentStats; + case 8: + return sortOrderId; + case 9: + return deletionVector; + case 10: + return manifestInfo; + case 11: + return keyMetadata(); + case 12: + return splitOffsets(); + case 13: + return equalityIds(); + default: + throw new UnsupportedOperationException("Unknown field ordinal: " + pos); + } + } + + @Override + protected void internalSet(int pos, T value) { + switch (pos) { + case 0: + this.tracking = (Tracking) value; + break; + case 1: + this.contentType = FileContent.fromId((Integer) value); + break; + case 2: + // always coerce to String for Serializable + this.location = value.toString(); + break; + case 3: + this.fileFormat = FileFormat.fromString(value.toString()); + break; + case 4: + this.recordCount = (Long) value; + break; + case 5: + this.fileSizeInBytes = (Long) value; + break; + case 6: + this.specId = (Integer) value; + break; + case 7: + this.contentStats = (ContentStats) value; + break; + case 8: + this.sortOrderId = (Integer) value; + break; + case 9: + this.deletionVector = (DeletionVector) value; + break; + case 10: + this.manifestInfo = (ManifestInfo) value; + break; + case 11: + this.keyMetadata = ByteBuffers.toByteArray((ByteBuffer) value); + break; + case 12: + this.splitOffsets = ArrayUtil.toLongArray((List) value); + break; + case 13: + this.equalityIds = ArrayUtil.toIntArray((List) value); + break; + default: + // ignore the object, it must be from a newer version of the format + } + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this) + .add("content", contentType != null ? 
contentType.lowerCaseName() : null) + .add("location", location) + .add("file_format", fileFormat) + .add("record_count", recordCount) + .add("file_size_in_bytes", fileSizeInBytes) + .add("spec_id", specId()) + .add("tracking", tracking) + .add("content_stats", contentStats) + .add("sort_order_id", sortOrderId) + .add("deletion_vector", deletionVector) + .add("manifest_info", manifestInfo) + .add("key_metadata", keyMetadata == null ? "null" : "(redacted)") + .add("split_offsets", splitOffsets == null ? "null" : splitOffsets()) + .add("equality_ids", equalityIds == null ? "null" : equalityIds()) + .toString(); + } +} diff --git a/core/src/main/java/org/apache/iceberg/Tracking.java b/core/src/main/java/org/apache/iceberg/Tracking.java index 46b14e549a35..8003ed82ea9c 100644 --- a/core/src/main/java/org/apache/iceberg/Tracking.java +++ b/core/src/main/java/org/apache/iceberg/Tracking.java @@ -106,4 +106,13 @@ default boolean isLive() { /** Returns the bitmap of positions replaced in this snapshot. */ ByteBuffer replacedPositions(); + + /** Returns the manifest location this entry was read from, or null. */ + String manifestLocation(); + + /** Returns the ordinal position of this entry within the manifest. */ + long manifestPos(); + + /** Copies this tracking information. */ + Tracking copy(); } diff --git a/core/src/main/java/org/apache/iceberg/TrackingStruct.java b/core/src/main/java/org/apache/iceberg/TrackingStruct.java new file mode 100644 index 000000000000..05f51360825b --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/TrackingStruct.java @@ -0,0 +1,241 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg; + +import java.io.Serializable; +import java.nio.ByteBuffer; +import java.util.Arrays; +import org.apache.iceberg.avro.SupportsIndexProjection; +import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.util.ByteBuffers; + +/** Mutable {@link StructLike} implementation of {@link Tracking}. */ +class TrackingStruct extends SupportsIndexProjection implements Tracking, Serializable { + private static final Types.StructType BASE_TYPE = + Types.StructType.of( + Tracking.STATUS, + Tracking.SNAPSHOT_ID, + Tracking.SEQUENCE_NUMBER, + Tracking.FILE_SEQUENCE_NUMBER, + Tracking.DV_SNAPSHOT_ID, + Tracking.FIRST_ROW_ID, + Tracking.DELETED_POSITIONS, + Tracking.REPLACED_POSITIONS, + MetadataColumns.ROW_POSITION); + + private EntryStatus status = null; + private Long snapshotId = null; + private Long dataSequenceNumber = null; + private Long fileSequenceNumber = null; + private Long dvSnapshotId = null; + private Long firstRowId = null; + private byte[] deletedPositions = null; + private byte[] replacedPositions = null; + + // set by manifest readers, not written to manifests + private String manifestLocation = null; + private long manifestPos = -1L; + + TrackingStruct(Types.StructType type) { + super(BASE_TYPE, type); + } + + TrackingStruct() { + super(BASE_TYPE.fields().size()); + } + + private TrackingStruct(TrackingStruct toCopy) { + super(toCopy); + this.status = toCopy.status; + this.snapshotId = toCopy.snapshotId; + this.dataSequenceNumber = 
toCopy.dataSequenceNumber; + this.fileSequenceNumber = toCopy.fileSequenceNumber; + this.dvSnapshotId = toCopy.dvSnapshotId; + this.firstRowId = toCopy.firstRowId; + this.deletedPositions = + toCopy.deletedPositions != null + ? Arrays.copyOf(toCopy.deletedPositions, toCopy.deletedPositions.length) + : null; + this.replacedPositions = + toCopy.replacedPositions != null + ? Arrays.copyOf(toCopy.replacedPositions, toCopy.replacedPositions.length) + : null; + this.manifestLocation = toCopy.manifestLocation; + this.manifestPos = toCopy.manifestPos; + } + + void inheritFrom(Tracking manifestTracking) { + if (manifestTracking != null) { + if (snapshotId == null) { + this.snapshotId = manifestTracking.snapshotId(); + } + + // both sequence numbers inherit from file sequence number because manifests + // do not distinguish between data and file sequence numbers + if (status == EntryStatus.ADDED) { + if (dataSequenceNumber == null) { + this.dataSequenceNumber = manifestTracking.fileSequenceNumber(); + } + + if (fileSequenceNumber == null) { + this.fileSequenceNumber = manifestTracking.fileSequenceNumber(); + } + } + } + } + + void setManifestLocation(String location) { + this.manifestLocation = location; + } + + @Override + public EntryStatus status() { + return status; + } + + @Override + public Long snapshotId() { + return snapshotId; + } + + @Override + public Long dataSequenceNumber() { + return dataSequenceNumber; + } + + @Override + public Long fileSequenceNumber() { + return fileSequenceNumber; + } + + @Override + public Long dvSnapshotId() { + return dvSnapshotId; + } + + @Override + public Long firstRowId() { + return firstRowId; + } + + @Override + public ByteBuffer deletedPositions() { + return deletedPositions != null ? ByteBuffer.wrap(deletedPositions) : null; + } + + @Override + public ByteBuffer replacedPositions() { + return replacedPositions != null ? 
ByteBuffer.wrap(replacedPositions) : null; + } + + @Override + public String manifestLocation() { + return manifestLocation; + } + + @Override + public long manifestPos() { + return manifestPos; + } + + @Override + public TrackingStruct copy() { + return new TrackingStruct(this); + } + + @Override + protected T internalGet(int pos, Class javaClass) { + return javaClass.cast(getByPos(pos)); + } + + private Object getByPos(int pos) { + switch (pos) { + case 0: + return status != null ? status.id() : null; + case 1: + return snapshotId(); + case 2: + return dataSequenceNumber(); + case 3: + return fileSequenceNumber(); + case 4: + return dvSnapshotId; + case 5: + return firstRowId; + case 6: + return deletedPositions(); + case 7: + return replacedPositions(); + case 8: + return manifestPos; + default: + throw new UnsupportedOperationException("Unknown field ordinal: " + pos); + } + } + + @Override + protected void internalSet(int pos, T value) { + switch (pos) { + case 0: + this.status = EntryStatus.fromId((Integer) value); + break; + case 1: + this.snapshotId = (Long) value; + break; + case 2: + this.dataSequenceNumber = (Long) value; + break; + case 3: + this.fileSequenceNumber = (Long) value; + break; + case 4: + this.dvSnapshotId = (Long) value; + break; + case 5: + this.firstRowId = (Long) value; + break; + case 6: + this.deletedPositions = ByteBuffers.toByteArray((ByteBuffer) value); + break; + case 7: + this.replacedPositions = ByteBuffers.toByteArray((ByteBuffer) value); + break; + case 8: + this.manifestPos = (long) value; + break; + default: + // ignore the object, it must be from a newer version of the format + } + } + + @Override + public String toString() { + return MoreObjects.toStringHelper(this) + .add("status", status) + .add("snapshot_id", snapshotId == null ? "null" : snapshotId) + .add("data_sequence_number", dataSequenceNumber == null ? "null" : dataSequenceNumber) + .add("file_sequence_number", fileSequenceNumber == null ? 
"null" : fileSequenceNumber) + .add("dv_snapshot_id", dvSnapshotId == null ? "null" : dvSnapshotId) + .add("first_row_id", firstRowId == null ? "null" : firstRowId) + .add("deleted_positions", deletedPositions == null ? "null" : "(binary)") + .add("replaced_positions", replacedPositions == null ? "null" : "(binary)") + .toString(); + } +} diff --git a/core/src/test/java/org/apache/iceberg/TestDeletionVectorStruct.java b/core/src/test/java/org/apache/iceberg/TestDeletionVectorStruct.java new file mode 100644 index 000000000000..5ab6b1f3586c --- /dev/null +++ b/core/src/test/java/org/apache/iceberg/TestDeletionVectorStruct.java @@ -0,0 +1,118 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.IOException; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.Test; + +class TestDeletionVectorStruct { + + @Test + void testFieldAccess() { + DeletionVectorStruct dv = new DeletionVectorStruct(DeletionVector.schema()); + + dv.set(0, "s3://bucket/data/dv.puffin"); + dv.set(1, 256L); + dv.set(2, 128L); + dv.set(3, 42L); + + assertThat(dv.location()).isEqualTo("s3://bucket/data/dv.puffin"); + assertThat(dv.offset()).isEqualTo(256L); + assertThat(dv.sizeInBytes()).isEqualTo(128L); + assertThat(dv.cardinality()).isEqualTo(42L); + } + + @Test + void testCopy() { + DeletionVectorStruct dv = new DeletionVectorStruct(DeletionVector.schema()); + + dv.set(0, "s3://bucket/data/dv.puffin"); + dv.set(1, 256L); + dv.set(2, 128L); + dv.set(3, 42L); + + DeletionVectorStruct copy = dv.copy(); + + assertThat(copy.location()).isEqualTo("s3://bucket/data/dv.puffin"); + assertThat(copy.offset()).isEqualTo(256L); + assertThat(copy.sizeInBytes()).isEqualTo(128L); + assertThat(copy.cardinality()).isEqualTo(42L); + } + + @Test + void testSize() { + DeletionVectorStruct dv = new DeletionVectorStruct(DeletionVector.schema()); + assertThat(dv.size()).isEqualTo(4); + } + + @Test + void testProjectedStructLike() { + // project only location (field ID 155) and cardinality (field ID 156) + Types.StructType projection = + Types.StructType.of(DeletionVector.LOCATION, DeletionVector.CARDINALITY); + + DeletionVectorStruct dv = new DeletionVectorStruct(projection); + assertThat(dv.size()).isEqualTo(2); + + // projected position 0 maps to internal position 0 (location) + // projected position 1 maps to internal position 3 (cardinality) + dv.set(0, "s3://bucket/data/dv.puffin"); + dv.set(1, 42L); + + assertThat(dv.location()).isEqualTo("s3://bucket/data/dv.puffin"); + assertThat(dv.cardinality()).isEqualTo(42L); + assertThat(dv.get(0, 
String.class)).isEqualTo("s3://bucket/data/dv.puffin"); + assertThat(dv.get(1, Long.class)).isEqualTo(42L); + } + + @Test + void testJavaSerializationRoundTrip() throws IOException, ClassNotFoundException { + DeletionVectorStruct dv = new DeletionVectorStruct(DeletionVector.schema()); + dv.set(0, "s3://bucket/data/dv.puffin"); + dv.set(1, 256L); + dv.set(2, 128L); + dv.set(3, 42L); + + DeletionVectorStruct deserialized = TestHelpers.roundTripSerialize(dv); + + assertThat(deserialized.location()).isEqualTo("s3://bucket/data/dv.puffin"); + assertThat(deserialized.offset()).isEqualTo(256L); + assertThat(deserialized.sizeInBytes()).isEqualTo(128L); + assertThat(deserialized.cardinality()).isEqualTo(42L); + } + + @Test + void testKryoSerializationRoundTrip() throws IOException { + DeletionVectorStruct dv = new DeletionVectorStruct(DeletionVector.schema()); + dv.set(0, "s3://bucket/data/dv.puffin"); + dv.set(1, 256L); + dv.set(2, 128L); + dv.set(3, 42L); + + DeletionVectorStruct deserialized = TestHelpers.KryoHelpers.roundTripSerialize(dv); + + assertThat(deserialized.location()).isEqualTo("s3://bucket/data/dv.puffin"); + assertThat(deserialized.offset()).isEqualTo(256L); + assertThat(deserialized.sizeInBytes()).isEqualTo(128L); + assertThat(deserialized.cardinality()).isEqualTo(42L); + } +} diff --git a/core/src/test/java/org/apache/iceberg/TestManifestInfoStruct.java b/core/src/test/java/org/apache/iceberg/TestManifestInfoStruct.java new file mode 100644 index 000000000000..23917de9cd40 --- /dev/null +++ b/core/src/test/java/org/apache/iceberg/TestManifestInfoStruct.java @@ -0,0 +1,189 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.IOException; +import java.nio.ByteBuffer; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.Test; + +class TestManifestInfoStruct { + + @Test + void testFieldAccess() { + ManifestInfoStruct info = new ManifestInfoStruct(ManifestInfo.schema()); + + info.set(0, 10); + info.set(1, 20); + info.set(2, 3); + info.set(3, 2); + info.set(4, 1000L); + info.set(5, 2000L); + info.set(6, 300L); + info.set(7, 200L); + info.set(8, 5L); + info.set(9, ByteBuffer.wrap(new byte[] {0xF})); + info.set(10, 1L); + + assertThat(info.addedFilesCount()).isEqualTo(10); + assertThat(info.existingFilesCount()).isEqualTo(20); + assertThat(info.deletedFilesCount()).isEqualTo(3); + assertThat(info.replacedFilesCount()).isEqualTo(2); + assertThat(info.addedRowsCount()).isEqualTo(1000L); + assertThat(info.existingRowsCount()).isEqualTo(2000L); + assertThat(info.deletedRowsCount()).isEqualTo(300L); + assertThat(info.replacedRowsCount()).isEqualTo(200L); + assertThat(info.minSequenceNumber()).isEqualTo(5L); + assertThat(info.dv()).isNotNull(); + assertThat(info.dvCardinality()).isEqualTo(1L); + } + + @Test + void testCopy() { + ManifestInfoStruct info = new ManifestInfoStruct(ManifestInfo.schema()); + + info.set(0, 10); + info.set(1, 20); + info.set(2, 3); + info.set(3, 2); + info.set(4, 
1000L); + info.set(5, 2000L); + info.set(6, 300L); + info.set(7, 200L); + info.set(8, 5L); + info.set(9, ByteBuffer.wrap(new byte[] {0xF})); + info.set(10, 1L); + + ManifestInfoStruct copy = info.copy(); + + assertThat(copy.addedFilesCount()).isEqualTo(10); + assertThat(copy.existingFilesCount()).isEqualTo(20); + assertThat(copy.deletedFilesCount()).isEqualTo(3); + assertThat(copy.replacedFilesCount()).isEqualTo(2); + assertThat(copy.addedRowsCount()).isEqualTo(1000L); + assertThat(copy.existingRowsCount()).isEqualTo(2000L); + assertThat(copy.deletedRowsCount()).isEqualTo(300L); + assertThat(copy.replacedRowsCount()).isEqualTo(200L); + assertThat(copy.minSequenceNumber()).isEqualTo(5L); + assertThat(copy.dvCardinality()).isEqualTo(1L); + + // verify deep copy of dv byte array + assertThat(copy.dv().array()).isNotSameAs(info.dv().array()); + } + + @Test + void testNullableFields() { + ManifestInfoStruct info = new ManifestInfoStruct(ManifestInfo.schema()); + + info.set(0, 0); + info.set(1, 0); + info.set(2, 0); + info.set(3, 0); + info.set(4, 0L); + info.set(5, 0L); + info.set(6, 0L); + info.set(7, 0L); + info.set(8, 0L); + + assertThat(info.dv()).isNull(); + assertThat(info.dvCardinality()).isNull(); + } + + @Test + void testProjectedStructLike() { + // project only added_files_count (field ID 504) and min_sequence_number (field ID 516) + Types.StructType projection = + Types.StructType.of(ManifestInfo.ADDED_FILES_COUNT, ManifestInfo.MIN_SEQUENCE_NUMBER); + + ManifestInfoStruct info = new ManifestInfoStruct(projection); + assertThat(info.size()).isEqualTo(2); + + // projected position 0 maps to internal position 0 (added_files_count) + // projected position 1 maps to internal position 8 (min_sequence_number) + info.set(0, 10); + info.set(1, 5L); + + assertThat(info.addedFilesCount()).isEqualTo(10); + assertThat(info.minSequenceNumber()).isEqualTo(5L); + assertThat(info.get(0, Integer.class)).isEqualTo(10); + assertThat(info.get(1, Long.class)).isEqualTo(5L); + } + 
+ @Test + void testJavaSerializationRoundTrip() throws IOException, ClassNotFoundException { + ManifestInfoStruct info = new ManifestInfoStruct(ManifestInfo.schema()); + info.set(0, 10); + info.set(1, 20); + info.set(2, 3); + info.set(3, 2); + info.set(4, 1000L); + info.set(5, 2000L); + info.set(6, 300L); + info.set(7, 200L); + info.set(8, 5L); + info.set(9, ByteBuffer.wrap(new byte[] {0xF})); + info.set(10, 1L); + + ManifestInfoStruct deserialized = TestHelpers.roundTripSerialize(info); + + assertThat(deserialized.addedFilesCount()).isEqualTo(10); + assertThat(deserialized.existingFilesCount()).isEqualTo(20); + assertThat(deserialized.deletedFilesCount()).isEqualTo(3); + assertThat(deserialized.replacedFilesCount()).isEqualTo(2); + assertThat(deserialized.addedRowsCount()).isEqualTo(1000L); + assertThat(deserialized.existingRowsCount()).isEqualTo(2000L); + assertThat(deserialized.deletedRowsCount()).isEqualTo(300L); + assertThat(deserialized.replacedRowsCount()).isEqualTo(200L); + assertThat(deserialized.minSequenceNumber()).isEqualTo(5L); + assertThat(deserialized.dv()).isEqualTo(ByteBuffer.wrap(new byte[] {0xF})); + assertThat(deserialized.dvCardinality()).isEqualTo(1L); + } + + @Test + void testKryoSerializationRoundTrip() throws IOException { + ManifestInfoStruct info = new ManifestInfoStruct(ManifestInfo.schema()); + info.set(0, 10); + info.set(1, 20); + info.set(2, 3); + info.set(3, 2); + info.set(4, 1000L); + info.set(5, 2000L); + info.set(6, 300L); + info.set(7, 200L); + info.set(8, 5L); + info.set(9, ByteBuffer.wrap(new byte[] {0xF})); + info.set(10, 1L); + + ManifestInfoStruct deserialized = TestHelpers.KryoHelpers.roundTripSerialize(info); + + assertThat(deserialized.addedFilesCount()).isEqualTo(10); + assertThat(deserialized.existingFilesCount()).isEqualTo(20); + assertThat(deserialized.deletedFilesCount()).isEqualTo(3); + assertThat(deserialized.replacedFilesCount()).isEqualTo(2); + assertThat(deserialized.addedRowsCount()).isEqualTo(1000L); + 
assertThat(deserialized.existingRowsCount()).isEqualTo(2000L); + assertThat(deserialized.deletedRowsCount()).isEqualTo(300L); + assertThat(deserialized.replacedRowsCount()).isEqualTo(200L); + assertThat(deserialized.minSequenceNumber()).isEqualTo(5L); + assertThat(deserialized.dv()).isEqualTo(ByteBuffer.wrap(new byte[] {0xF})); + assertThat(deserialized.dvCardinality()).isEqualTo(1L); + } +} diff --git a/core/src/test/java/org/apache/iceberg/TestTrackedFileStruct.java b/core/src/test/java/org/apache/iceberg/TestTrackedFileStruct.java new file mode 100644 index 000000000000..05013ae54e79 --- /dev/null +++ b/core/src/test/java/org/apache/iceberg/TestTrackedFileStruct.java @@ -0,0 +1,376 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.List; +import java.util.Set; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.Test; + +class TestTrackedFileStruct { + @Test + void testFieldAccess() { + TrackedFileStruct file = new TrackedFileStruct(); + TrackingStruct tracking = new TrackingStruct(); + DeletionVectorStruct dv = new DeletionVectorStruct(DeletionVector.schema()); + ManifestInfoStruct info = new ManifestInfoStruct(ManifestInfo.schema()); + + tracking.set(0, EntryStatus.ADDED.id()); + tracking.set(1, 42L); + + dv.set(0, "s3://bucket/dv.puffin"); + dv.set(1, 100L); + dv.set(2, 50L); + dv.set(3, 5L); + + info.set(0, 10); + info.set(1, 20); + info.set(2, 3); + info.set(3, 2); + info.set(4, 1000L); + info.set(5, 2000L); + info.set(6, 300L); + info.set(7, 200L); + info.set(8, 5L); + + file.set(0, tracking); + file.set(1, FileContent.EQUALITY_DELETES.id()); + file.set(2, "s3://bucket/data/eq-delete.avro"); + file.set(3, "avro"); + file.set(4, 50L); + file.set(5, 512L); + file.set(6, 1); + file.set(8, 5); + file.set(9, dv); + file.set(10, info); + file.set(11, ByteBuffer.wrap(new byte[] {1, 2, 3})); + file.set(12, ImmutableList.of(100L, 200L)); + file.set(13, ImmutableList.of(1, 2, 3)); + + assertThat(file.tracking()).isNotNull(); + assertThat(file.tracking().status()).isEqualTo(EntryStatus.ADDED); + assertThat(file.tracking().snapshotId()).isEqualTo(42L); + assertThat(file.contentType()).isEqualTo(FileContent.EQUALITY_DELETES); + assertThat(file.location()).isEqualTo("s3://bucket/data/eq-delete.avro"); + assertThat(file.fileFormat()).isEqualTo(FileFormat.AVRO); + assertThat(file.recordCount()).isEqualTo(50L); + assertThat(file.fileSizeInBytes()).isEqualTo(512L); 
+ assertThat(file.specId()).isEqualTo(1); + assertThat(file.sortOrderId()).isEqualTo(5); + assertThat(file.deletionVector()).isSameAs(dv); + assertThat(file.manifestInfo()).isSameAs(info); + assertThat(file.keyMetadata()).isEqualTo(ByteBuffer.wrap(new byte[] {1, 2, 3})); + assertThat(file.splitOffsets()).containsExactly(100L, 200L); + assertThat(file.equalityIds()).containsExactly(1, 2, 3); + } + + @Test + void testReaderSideFields() { + TrackedFileStruct file = new TrackedFileStruct(); + + TrackingStruct tracking = new TrackingStruct(); + tracking.set(0, EntryStatus.ADDED.id()); + tracking.setManifestLocation("s3://bucket/metadata/manifest.avro"); + tracking.set(8, 7L); + + file.set(0, tracking); + file.set(1, FileContent.DATA.id()); + file.set(2, "test"); + file.set(3, "parquet"); + file.set(4, 0L); + file.set(5, 0L); + + assertThat(file.tracking().manifestLocation()).isEqualTo("s3://bucket/metadata/manifest.avro"); + assertThat(file.tracking().manifestPos()).isEqualTo(7L); + } + + @Test + void testCopy() { + TrackedFileStruct file = createFullTrackedFile(); + + TrackedFile copy = file.copy(); + assertThat(copy).isInstanceOf(TrackedFileStruct.class); + + assertThat(copy.contentType()).isEqualTo(FileContent.DATA); + assertThat(copy.location()).isEqualTo("s3://bucket/data/file.parquet"); + assertThat(copy.fileFormat()).isEqualTo(FileFormat.PARQUET); + assertThat(copy.tracking().status()).isEqualTo(EntryStatus.ADDED); + assertThat(copy.tracking().snapshotId()).isEqualTo(42L); + assertThat(copy.deletionVector().location()).isEqualTo("s3://bucket/dv.puffin"); + assertThat(copy.specId()).isEqualTo(0); + assertThat(copy.sortOrderId()).isEqualTo(1); + assertThat(copy.recordCount()).isEqualTo(100L); + assertThat(copy.fileSizeInBytes()).isEqualTo(1024L); + assertThat(copy.keyMetadata()).isNotNull(); + assertThat(copy.splitOffsets()).containsExactly(50L); + assertThat(copy.equalityIds()).isNull(); + 
assertThat(copy.tracking().manifestLocation()).isEqualTo("s3://bucket/manifest.avro"); + assertThat(copy.tracking().manifestPos()).isEqualTo(3L); + } + + @Test + void testCopyWithoutStats() { + TrackedFileStruct file = createTrackedFileWithStats(); + assertThat(file.contentStats()).isNotNull(); + + TrackedFile copy = file.copyWithoutStats(); + + assertThat(copy.contentType()).isEqualTo(FileContent.DATA); + assertThat(copy.location()).isEqualTo("s3://bucket/data/file.parquet"); + assertThat(copy.contentStats()).isNull(); + } + + @Test + void testCopyWithStatsFilters() { + TrackedFileStruct file = createTrackedFileWithStats(); + Set keepFieldIds = ImmutableSet.of(1); + + TrackedFile copy = file.copyWithStats(keepFieldIds); + + assertThat(copy.contentStats()).isNotNull(); + ContentStats stats = copy.contentStats(); + assertThat(stats.fieldStats()).hasSize(1); + assertThat(stats.fieldStats().get(0).fieldId()).isEqualTo(1); + } + + @Test + void testCopyIsDeep() { + TrackedFileStruct file = createFullTrackedFile(); + + TrackedFile copy = file.copy(); + + // keyMetadata should be a deep copy + assertThat(copy.keyMetadata()).isNotSameAs(file.keyMetadata()); + } + + @Test + void testStructLikeSize() { + TrackedFileStruct file = new TrackedFileStruct(); + assertThat(file.size()).isEqualTo(14); + } + + @Test + void testStructLikeGetSet() { + TrackedFileStruct file = new TrackedFileStruct(); + + file.set(1, FileContent.DATA.id()); + assertThat(file.get(1, Integer.class)).isEqualTo(FileContent.DATA.id()); + + file.set(2, "test-location"); + assertThat(file.get(2, String.class)).isEqualTo("test-location"); + + file.set(4, 999L); + assertThat(file.get(4, Long.class)).isEqualTo(999L); + } + + @Test + void testProjectedStructLike() { + // project only location (field ID 100) and file_size_in_bytes (field ID 104) + Types.StructType projection = + Types.StructType.of(TrackedFile.LOCATION, TrackedFile.FILE_SIZE_IN_BYTES); + + TrackedFileStruct file = new TrackedFileStruct(projection); 
+ assertThat(file.size()).isEqualTo(2); + + // projected position 0 maps to internal position 2 (location) + // projected position 1 maps to internal position 5 (file_size_in_bytes) + file.set(0, "s3://bucket/file.parquet"); + file.set(1, 1024L); + + assertThat(file.location()).isEqualTo("s3://bucket/file.parquet"); + assertThat(file.fileSizeInBytes()).isEqualTo(1024L); + assertThat(file.get(0, String.class)).isEqualTo("s3://bucket/file.parquet"); + assertThat(file.get(1, Long.class)).isEqualTo(1024L); + } + + @Test + void testContentStatsReturnedWhenPresent() { + TrackedFileStruct file = createTrackedFileWithStats(); + assertThat(file.contentStats()).isNotNull(); + assertThat(file.contentStats().fieldStats()).hasSize(2); + } + + @Test + void testContentStatsNullWhenNotSet() { + TrackedFileStruct file = new TrackedFileStruct(); + file.set(1, FileContent.DATA.id()); + file.set(2, "test"); + file.set(3, "parquet"); + file.set(4, 0L); + file.set(5, 0L); + file.set(6, 0); + + assertThat(file.contentStats()).isNull(); + } + + @Test + void testAllFileContentTypesSupported() { + for (FileContent content : FileContent.values()) { + TrackedFileStruct file = new TrackedFileStruct(); + file.set(1, content.id()); + assertThat(file.contentType()).isEqualTo(content); + } + } + + @Test + void testJavaSerializationRoundTrip() throws IOException, ClassNotFoundException { + TrackedFileStruct file = createFullTrackedFile(); + + TrackedFileStruct deserialized = TestHelpers.roundTripSerialize(file); + + assertThat(deserialized.contentType()).isEqualTo(FileContent.DATA); + assertThat(deserialized.location()).isEqualTo("s3://bucket/data/file.parquet"); + assertThat(deserialized.fileFormat()).isEqualTo(FileFormat.PARQUET); + assertThat(deserialized.recordCount()).isEqualTo(100L); + assertThat(deserialized.fileSizeInBytes()).isEqualTo(1024L); + assertThat(deserialized.specId()).isEqualTo(0); + assertThat(deserialized.sortOrderId()).isEqualTo(1); + 
assertThat(deserialized.tracking().status()).isEqualTo(EntryStatus.ADDED); + assertThat(deserialized.tracking().snapshotId()).isEqualTo(42L); + assertThat(deserialized.deletionVector().location()).isEqualTo("s3://bucket/dv.puffin"); + assertThat(deserialized.keyMetadata()).isEqualTo(ByteBuffer.wrap(new byte[] {1, 2, 3})); + assertThat(deserialized.splitOffsets()).containsExactly(50L); + assertThat(deserialized.tracking().manifestPos()).isEqualTo(3L); + assertThat(deserialized.tracking().manifestLocation()).isEqualTo("s3://bucket/manifest.avro"); + } + + @Test + void testKryoSerializationRoundTrip() throws IOException { + TrackedFileStruct file = createFullTrackedFile(); + + TrackedFileStruct deserialized = TestHelpers.KryoHelpers.roundTripSerialize(file); + + assertThat(deserialized.contentType()).isEqualTo(FileContent.DATA); + assertThat(deserialized.location()).isEqualTo("s3://bucket/data/file.parquet"); + assertThat(deserialized.fileFormat()).isEqualTo(FileFormat.PARQUET); + assertThat(deserialized.recordCount()).isEqualTo(100L); + assertThat(deserialized.fileSizeInBytes()).isEqualTo(1024L); + assertThat(deserialized.specId()).isEqualTo(0); + assertThat(deserialized.sortOrderId()).isEqualTo(1); + assertThat(deserialized.tracking().status()).isEqualTo(EntryStatus.ADDED); + assertThat(deserialized.tracking().snapshotId()).isEqualTo(42L); + assertThat(deserialized.deletionVector().location()).isEqualTo("s3://bucket/dv.puffin"); + assertThat(deserialized.keyMetadata()).isEqualTo(ByteBuffer.wrap(new byte[] {1, 2, 3})); + assertThat(deserialized.splitOffsets()).containsExactly(50L); + assertThat(deserialized.tracking().manifestPos()).isEqualTo(3L); + assertThat(deserialized.tracking().manifestLocation()).isEqualTo("s3://bucket/manifest.avro"); + } + + static TrackedFileStruct createFullTrackedFile() { + TrackingStruct tracking = new TrackingStruct(); + tracking.set(0, EntryStatus.ADDED.id()); + tracking.set(1, 42L); + tracking.set(2, 10L); + 
tracking.setManifestLocation("s3://bucket/manifest.avro"); + tracking.set(8, 3L); + + DeletionVectorStruct dv = new DeletionVectorStruct(DeletionVector.schema()); + dv.set(0, "s3://bucket/dv.puffin"); + dv.set(1, 100L); + dv.set(2, 50L); + dv.set(3, 5L); + + TrackedFileStruct file = + new TrackedFileStruct( + tracking, + FileContent.DATA, + "s3://bucket/data/file.parquet", + FileFormat.PARQUET, + 100L, + 1024L); + file.set(6, 0); + file.set(8, 1); + file.set(9, dv); + file.set(11, ByteBuffer.wrap(new byte[] {1, 2, 3})); + file.set(12, ImmutableList.of(50L)); + + return file; + } + + @SuppressWarnings("unchecked") + static TrackedFileStruct createTrackedFileWithStats() { + Types.StructType statsStruct = + Types.StructType.of( + Types.NestedField.optional( + 10000, + "1", + Types.StructType.of( + Types.NestedField.optional(10001, "value_count", Types.LongType.get()), + Types.NestedField.optional(10002, "null_value_count", Types.LongType.get()), + Types.NestedField.optional(10003, "nan_value_count", Types.LongType.get()), + Types.NestedField.optional(10006, "lower_bound", Types.IntegerType.get()), + Types.NestedField.optional(10007, "upper_bound", Types.IntegerType.get()))), + Types.NestedField.optional( + 20000, + "2", + Types.StructType.of( + Types.NestedField.optional(20001, "value_count", Types.LongType.get()), + Types.NestedField.optional(20002, "null_value_count", Types.LongType.get()), + Types.NestedField.optional(20003, "nan_value_count", Types.LongType.get()), + Types.NestedField.optional(20006, "lower_bound", Types.FloatType.get()), + Types.NestedField.optional(20007, "upper_bound", Types.FloatType.get())))); + + List> fieldStatsList = + ImmutableList.of( + (FieldStats) + BaseFieldStats.builder() + .fieldId(1) + .type(Types.IntegerType.get()) + .valueCount(100L) + .nullValueCount(5L) + .lowerBound(1) + .upperBound(1000) + .build(), + (FieldStats) + BaseFieldStats.builder() + .fieldId(2) + .type(Types.FloatType.get()) + .valueCount(200L) + 
.nullValueCount(10L) + .nanValueCount(3L) + .lowerBound(1.0f) + .upperBound(100.0f) + .build()); + + BaseContentStats stats = + BaseContentStats.builder() + .withStatsStruct(statsStruct) + .withFieldStats(fieldStatsList) + .build(); + + TrackedFileStruct file = + new TrackedFileStruct( + null, + FileContent.DATA, + "s3://bucket/data/file.parquet", + FileFormat.PARQUET, + 100L, + 1024L); + file.set(6, 0); + file.set(7, stats); + + return file; + } +} diff --git a/core/src/test/java/org/apache/iceberg/TestTrackingStruct.java b/core/src/test/java/org/apache/iceberg/TestTrackingStruct.java new file mode 100644 index 000000000000..2a4ace59247f --- /dev/null +++ b/core/src/test/java/org/apache/iceberg/TestTrackingStruct.java @@ -0,0 +1,220 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.IOException; +import java.nio.ByteBuffer; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.EnumSource; + +class TestTrackingStruct { + + @Test + void testFieldAccess() { + TrackingStruct tracking = new TrackingStruct(Tracking.schema()); + + tracking.set(0, EntryStatus.ADDED.id()); + tracking.set(1, 42L); + tracking.set(2, 10L); + tracking.set(3, 11L); + tracking.set(4, 43L); + tracking.set(5, 1000L); + + assertThat(tracking.status()).isEqualTo(EntryStatus.ADDED); + assertThat(tracking.snapshotId()).isEqualTo(42L); + assertThat(tracking.dataSequenceNumber()).isEqualTo(10L); + assertThat(tracking.fileSequenceNumber()).isEqualTo(11L); + assertThat(tracking.dvSnapshotId()).isEqualTo(43L); + assertThat(tracking.firstRowId()).isEqualTo(1000L); + assertThat(tracking.deletedPositions()).isNull(); + assertThat(tracking.replacedPositions()).isNull(); + } + + @Test + void testCopy() { + TrackingStruct tracking = new TrackingStruct(Tracking.schema()); + + tracking.set(0, EntryStatus.ADDED.id()); + tracking.set(1, 42L); + tracking.set(2, 10L); + tracking.set(6, ByteBuffer.wrap(new byte[] {1, 2})); + + TrackingStruct copy = tracking.copy(); + + assertThat(copy.status()).isEqualTo(EntryStatus.ADDED); + assertThat(copy.snapshotId()).isEqualTo(42L); + assertThat(copy.dataSequenceNumber()).isEqualTo(10L); + assertThat(copy.deletedPositions()).isNotNull(); + + // verify deep copy of ByteBuffer + assertThat(copy.deletedPositions()).isNotSameAs(tracking.deletedPositions()); + } + + @ParameterizedTest + @EnumSource(EntryStatus.class) + void testAllStatuses(EntryStatus status) { + TrackingStruct tracking = new TrackingStruct(Tracking.schema()); + tracking.set(0, status.id()); + assertThat(tracking.status()).isEqualTo(status); + } + + @Test + void 
testIsLive() { + TrackingStruct tracking = new TrackingStruct(Tracking.schema()); + + tracking.set(0, EntryStatus.ADDED.id()); + assertThat(tracking.isLive()).isTrue(); + + tracking.set(0, EntryStatus.EXISTING.id()); + assertThat(tracking.isLive()).isTrue(); + + tracking.set(0, EntryStatus.DELETED.id()); + assertThat(tracking.isLive()).isFalse(); + + tracking.set(0, EntryStatus.REPLACED.id()); + assertThat(tracking.isLive()).isFalse(); + } + + @Test + void testInheritSnapshotId() { + TrackingStruct tracking = new TrackingStruct(Tracking.schema()); + tracking.set(0, EntryStatus.ADDED.id()); + tracking.inheritFrom(createManifestTracking(100L, 50L, 60L)); + + // snapshotId is null, should inherit from manifest + assertThat(tracking.snapshotId()).isEqualTo(100L); + } + + @Test + void testInheritSequenceNumberForAddedEntries() { + TrackingStruct tracking = new TrackingStruct(Tracking.schema()); + tracking.set(0, EntryStatus.ADDED.id()); + tracking.inheritFrom(createManifestTracking(100L, 50L, 60L)); + + // sequence numbers are null and status is ADDED, should inherit from file sequence number + assertThat(tracking.dataSequenceNumber()).isEqualTo(60L); + assertThat(tracking.fileSequenceNumber()).isEqualTo(60L); + } + + @Test + void testDoNotInheritSequenceNumberForExistingEntries() { + TrackingStruct tracking = new TrackingStruct(Tracking.schema()); + tracking.set(0, EntryStatus.EXISTING.id()); + tracking.set(2, 5L); + tracking.set(3, 6L); + tracking.inheritFrom(createManifestTracking(100L, 50L, 60L)); + + // sequence numbers are not inherited for EXISTING entries + assertThat(tracking.dataSequenceNumber()).isEqualTo(5L); + assertThat(tracking.fileSequenceNumber()).isEqualTo(6L); + } + + @Test + void testExplicitValuesOverrideInheritance() { + TrackingStruct tracking = new TrackingStruct(Tracking.schema()); + tracking.set(0, EntryStatus.ADDED.id()); + tracking.set(1, 200L); + tracking.set(2, 75L); + tracking.set(3, 76L); + 
tracking.inheritFrom(createManifestTracking(100L, 50L, 60L)); + + // explicit values should take precedence + assertThat(tracking.snapshotId()).isEqualTo(200L); + assertThat(tracking.dataSequenceNumber()).isEqualTo(75L); + assertThat(tracking.fileSequenceNumber()).isEqualTo(76L); + } + + @Test + void testNoDefaultingWithoutInheritance() { + TrackingStruct tracking = new TrackingStruct(Tracking.schema()); + tracking.set(0, EntryStatus.ADDED.id()); + + // no inheritance, nulls stay null + assertThat(tracking.snapshotId()).isNull(); + assertThat(tracking.dataSequenceNumber()).isNull(); + assertThat(tracking.fileSequenceNumber()).isNull(); + } + + // uses distinct data and file sequence numbers to verify that inheritance uses file sequence + // number + private static Tracking createManifestTracking( + long snapshotId, long dataSequenceNumber, long fileSequenceNumber) { + TrackingStruct tracking = new TrackingStruct(Tracking.schema()); + tracking.set(0, EntryStatus.ADDED.id()); + tracking.set(1, snapshotId); + tracking.set(2, dataSequenceNumber); + tracking.set(3, fileSequenceNumber); + return tracking; + } + + @Test + void testProjectedStructLike() { + // project only snapshot_id (field ID 1) and first_row_id (field ID 142) + Types.StructType projection = Types.StructType.of(Tracking.SNAPSHOT_ID, Tracking.FIRST_ROW_ID); + + TrackingStruct tracking = new TrackingStruct(projection); + assertThat(tracking.size()).isEqualTo(2); + + // projected position 0 maps to internal position 1 (snapshot_id) + // projected position 1 maps to internal position 5 (first_row_id) + tracking.set(0, 42L); + tracking.set(1, 1000L); + + assertThat(tracking.snapshotId()).isEqualTo(42L); + assertThat(tracking.firstRowId()).isEqualTo(1000L); + assertThat(tracking.get(0, Long.class)).isEqualTo(42L); + assertThat(tracking.get(1, Long.class)).isEqualTo(1000L); + } + + @Test + void testJavaSerializationRoundTrip() throws IOException, ClassNotFoundException { + TrackingStruct tracking = new 
TrackingStruct(Tracking.schema()); + tracking.set(0, EntryStatus.ADDED.id()); + tracking.set(1, 42L); + tracking.set(2, 10L); + tracking.set(6, ByteBuffer.wrap(new byte[] {1, 2})); + + TrackingStruct deserialized = TestHelpers.roundTripSerialize(tracking); + + assertThat(deserialized.status()).isEqualTo(EntryStatus.ADDED); + assertThat(deserialized.snapshotId()).isEqualTo(42L); + assertThat(deserialized.dataSequenceNumber()).isEqualTo(10L); + assertThat(deserialized.deletedPositions()).isEqualTo(ByteBuffer.wrap(new byte[] {1, 2})); + } + + @Test + void testKryoSerializationRoundTrip() throws IOException { + TrackingStruct tracking = new TrackingStruct(Tracking.schema()); + tracking.set(0, EntryStatus.ADDED.id()); + tracking.set(1, 42L); + tracking.set(2, 10L); + tracking.set(6, ByteBuffer.wrap(new byte[] {1, 2})); + + TrackingStruct deserialized = TestHelpers.KryoHelpers.roundTripSerialize(tracking); + + assertThat(deserialized.status()).isEqualTo(EntryStatus.ADDED); + assertThat(deserialized.snapshotId()).isEqualTo(42L); + assertThat(deserialized.dataSequenceNumber()).isEqualTo(10L); + assertThat(deserialized.deletedPositions()).isEqualTo(ByteBuffer.wrap(new byte[] {1, 2})); + } +} From 8f611675e343c845c9df6e56718f1492aeb143e0 Mon Sep 17 00:00:00 2001 From: Anoop Johnson Date: Thu, 23 Apr 2026 16:53:45 -0700 Subject: [PATCH 092/197] Validate manifest sequence numbers are equal during inheritance (#16091) Manifests do not distinguish between data and file sequence numbers. Add a check that they are equal when inheriting tracking metadata. 
--- .../org/apache/iceberg/TrackingStruct.java | 12 +++++-- .../apache/iceberg/TestTrackingStruct.java | 36 +++++++++++++------ 2 files changed, 35 insertions(+), 13 deletions(-) diff --git a/core/src/main/java/org/apache/iceberg/TrackingStruct.java b/core/src/main/java/org/apache/iceberg/TrackingStruct.java index 05f51360825b..a8624aad15c1 100644 --- a/core/src/main/java/org/apache/iceberg/TrackingStruct.java +++ b/core/src/main/java/org/apache/iceberg/TrackingStruct.java @@ -21,8 +21,10 @@ import java.io.Serializable; import java.nio.ByteBuffer; import java.util.Arrays; +import java.util.Objects; import org.apache.iceberg.avro.SupportsIndexProjection; import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.types.Types; import org.apache.iceberg.util.ByteBuffers; @@ -87,8 +89,14 @@ void inheritFrom(Tracking manifestTracking) { this.snapshotId = manifestTracking.snapshotId(); } - // both sequence numbers inherit from file sequence number because manifests - // do not distinguish between data and file sequence numbers + // manifests do not distinguish between data and file sequence numbers + Preconditions.checkArgument( + Objects.equals( + manifestTracking.dataSequenceNumber(), manifestTracking.fileSequenceNumber()), + "Manifest data and file sequence numbers must be equal, got %s and %s", + manifestTracking.dataSequenceNumber(), + manifestTracking.fileSequenceNumber()); + if (status == EntryStatus.ADDED) { if (dataSequenceNumber == null) { this.dataSequenceNumber = manifestTracking.fileSequenceNumber(); diff --git a/core/src/test/java/org/apache/iceberg/TestTrackingStruct.java b/core/src/test/java/org/apache/iceberg/TestTrackingStruct.java index 2a4ace59247f..5af41d0dcf02 100644 --- a/core/src/test/java/org/apache/iceberg/TestTrackingStruct.java +++ b/core/src/test/java/org/apache/iceberg/TestTrackingStruct.java @@ -19,6 +19,7 @@ package 
org.apache.iceberg; import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; import java.io.IOException; import java.nio.ByteBuffer; @@ -99,7 +100,7 @@ void testIsLive() { void testInheritSnapshotId() { TrackingStruct tracking = new TrackingStruct(Tracking.schema()); tracking.set(0, EntryStatus.ADDED.id()); - tracking.inheritFrom(createManifestTracking(100L, 50L, 60L)); + tracking.inheritFrom(createManifestTracking(100L, 60L)); // snapshotId is null, should inherit from manifest assertThat(tracking.snapshotId()).isEqualTo(100L); @@ -109,9 +110,9 @@ void testInheritSnapshotId() { void testInheritSequenceNumberForAddedEntries() { TrackingStruct tracking = new TrackingStruct(Tracking.schema()); tracking.set(0, EntryStatus.ADDED.id()); - tracking.inheritFrom(createManifestTracking(100L, 50L, 60L)); + tracking.inheritFrom(createManifestTracking(100L, 60L)); - // sequence numbers are null and status is ADDED, should inherit from file sequence number + // sequence numbers are null and status is ADDED, should inherit assertThat(tracking.dataSequenceNumber()).isEqualTo(60L); assertThat(tracking.fileSequenceNumber()).isEqualTo(60L); } @@ -122,7 +123,7 @@ void testDoNotInheritSequenceNumberForExistingEntries() { tracking.set(0, EntryStatus.EXISTING.id()); tracking.set(2, 5L); tracking.set(3, 6L); - tracking.inheritFrom(createManifestTracking(100L, 50L, 60L)); + tracking.inheritFrom(createManifestTracking(100L, 60L)); // sequence numbers are not inherited for EXISTING entries assertThat(tracking.dataSequenceNumber()).isEqualTo(5L); @@ -136,7 +137,7 @@ void testExplicitValuesOverrideInheritance() { tracking.set(1, 200L); tracking.set(2, 75L); tracking.set(3, 76L); - tracking.inheritFrom(createManifestTracking(100L, 50L, 60L)); + tracking.inheritFrom(createManifestTracking(100L, 60L)); // explicit values should take precedence assertThat(tracking.snapshotId()).isEqualTo(200L); @@ -144,6 +145,22 @@ void 
testExplicitValuesOverrideInheritance() { assertThat(tracking.fileSequenceNumber()).isEqualTo(76L); } + @Test + void testInheritFromRejectsUnequalSequenceNumbers() { + TrackingStruct tracking = new TrackingStruct(Tracking.schema()); + tracking.set(0, EntryStatus.ADDED.id()); + + TrackingStruct manifestTracking = new TrackingStruct(Tracking.schema()); + manifestTracking.set(0, EntryStatus.ADDED.id()); + manifestTracking.set(1, 100L); + manifestTracking.set(2, 50L); + manifestTracking.set(3, 60L); + + assertThatThrownBy(() -> tracking.inheritFrom(manifestTracking)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Manifest data and file sequence numbers must be equal, got 50 and 60"); + } + @Test void testNoDefaultingWithoutInheritance() { TrackingStruct tracking = new TrackingStruct(Tracking.schema()); @@ -155,15 +172,12 @@ void testNoDefaultingWithoutInheritance() { assertThat(tracking.fileSequenceNumber()).isNull(); } - // uses distinct data and file sequence numbers to verify that inheritance uses file sequence - // number - private static Tracking createManifestTracking( - long snapshotId, long dataSequenceNumber, long fileSequenceNumber) { + private static Tracking createManifestTracking(long snapshotId, long sequenceNumber) { TrackingStruct tracking = new TrackingStruct(Tracking.schema()); tracking.set(0, EntryStatus.ADDED.id()); tracking.set(1, snapshotId); - tracking.set(2, dataSequenceNumber); - tracking.set(3, fileSequenceNumber); + tracking.set(2, sequenceNumber); + tracking.set(3, sequenceNumber); return tracking; } From 0e4f447ee0d0d22effd49d7189540651af66a3c5 Mon Sep 17 00:00:00 2001 From: GuoYu <511955993@qq.com> Date: Fri, 24 Apr 2026 20:14:39 +0800 Subject: [PATCH 093/197] Data: Add TCK tests for metrics collection in BaseFormatModelTests (#15906) --- .../iceberg/data/BaseFormatModelTests.java | 581 +++++++++++++++++- .../apache/iceberg/data/DataGenerators.java | 12 + 2 files changed, 590 insertions(+), 3 deletions(-) diff --git 
a/data/src/test/java/org/apache/iceberg/data/BaseFormatModelTests.java b/data/src/test/java/org/apache/iceberg/data/BaseFormatModelTests.java index 28034933a8f3..8a47132975be 100644 --- a/data/src/test/java/org/apache/iceberg/data/BaseFormatModelTests.java +++ b/data/src/test/java/org/apache/iceberg/data/BaseFormatModelTests.java @@ -26,18 +26,27 @@ import static org.assertj.core.api.Assumptions.assumeThat; import static org.junit.jupiter.api.Assumptions.assumeFalse; +import java.io.File; import java.io.IOException; +import java.nio.ByteBuffer; import java.util.Arrays; +import java.util.Comparator; import java.util.List; import java.util.Map; import java.util.UUID; +import java.util.function.BiConsumer; import java.util.stream.IntStream; +import org.apache.iceberg.ContentFile; import org.apache.iceberg.DataFile; import org.apache.iceberg.DeleteFile; import org.apache.iceberg.FileFormat; +import org.apache.iceberg.MetricsConfig; +import org.apache.iceberg.MetricsModes; +import org.apache.iceberg.MetricsModes.MetricsMode; import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Schema; import org.apache.iceberg.TableProperties; +import org.apache.iceberg.TestTables; import org.apache.iceberg.deletes.EqualityDeleteWriter; import org.apache.iceberg.deletes.PositionDelete; import org.apache.iceberg.deletes.PositionDeleteWriter; @@ -56,9 +65,15 @@ import org.apache.iceberg.io.DeleteSchemaUtil; import org.apache.iceberg.io.InputFile; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.Comparators; +import org.apache.iceberg.types.Conversions; +import org.apache.iceberg.types.Type; import org.apache.iceberg.types.Types; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.io.TempDir; import 
org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.Arguments; import org.junit.jupiter.params.provider.FieldSource; @@ -77,6 +92,8 @@ protected boolean supportsBatchReads() { return false; } + @TempDir private File tableDir; + private static final FileFormat[] FILE_FORMATS = new FileFormat[] {FileFormat.AVRO, FileFormat.PARQUET, FileFormat.ORC}; @@ -92,13 +109,21 @@ protected boolean supportsBatchReads() { static final String FEATURE_CASE_SENSITIVE = "caseSensitive"; static final String FEATURE_SPLIT = "split"; static final String FEATURE_REUSE_CONTAINERS = "reuseContainers"; + static final String FEATURE_COLUMN_LEVEL_METRICS = "columnLevelMetrics"; + static final String FEATURE_COLUMN_METRICS_TRUNCATE_BINARY = "columnMetricsTruncateBinary"; private static final Map MISSING_FEATURES = Map.of( FileFormat.AVRO, - new String[] {FEATURE_FILTER, FEATURE_CASE_SENSITIVE, FEATURE_SPLIT}, + new String[] { + FEATURE_FILTER, + FEATURE_CASE_SENSITIVE, + FEATURE_SPLIT, + FEATURE_COLUMN_LEVEL_METRICS, + FEATURE_COLUMN_METRICS_TRUNCATE_BINARY + }, FileFormat.ORC, - new String[] {FEATURE_REUSE_CONTAINERS}); + new String[] {FEATURE_REUSE_CONTAINERS, FEATURE_COLUMN_METRICS_TRUNCATE_BINARY}); private InMemoryFileIO fileIO; private EncryptedOutputFile encryptedFile; @@ -123,6 +148,8 @@ void after() { if (fileIO != null) { fileIO.close(); } + + TestTables.clearTables(); } /** Write with engine type T, read with Generic Record */ @@ -609,6 +636,273 @@ void testReaderBuilderRecordsPerBatchNotSupported(FileFormat fileFormat) throws .isInstanceOf(UnsupportedOperationException.class); } + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testDataWriterMetricsCollection(FileFormat fileFormat) throws IOException { + DataGenerator dataGenerator = new DataGenerators.DefaultSchema(); + Schema schema = dataGenerator.schema(); + + List genericRecords = dataGenerator.generateRecords(); + DataFile dataFile = writeGenericRecords(fileFormat, schema, 
genericRecords); + + assertCounts(fileFormat, schema, genericRecords, dataFile); + assertBounds(fileFormat, schema, genericRecords, dataFile); + assertColumnSize(fileFormat, dataFile); + } + + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testDataWriterMetricsWithNoneMode(FileFormat fileFormat) throws IOException { + DataGenerator dataGenerator = new DataGenerators.DefaultSchema(); + Schema schema = dataGenerator.schema(); + + MetricsConfig noneConfig = config(schema, MetricsModes.None.get()); + List genericRecords = dataGenerator.generateRecords(); + DataFile dataFile = writeGenericRecords(fileFormat, schema, genericRecords, noneConfig); + + assertCountsNull(schema, dataFile); + assertBoundsNull(schema, dataFile); + assertColumnSizeEmpty(fileFormat, dataFile); + } + + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testDataWriterMetricsWithCountsMode(FileFormat fileFormat) throws IOException { + DataGenerator dataGenerator = new DataGenerators.DefaultSchema(); + Schema schema = dataGenerator.schema(); + MetricsConfig countsConfig = config(schema, MetricsModes.Counts.get()); + List genericRecords = dataGenerator.generateRecords(); + DataFile dataFile = writeGenericRecords(fileFormat, schema, genericRecords, countsConfig); + + // In the counts mode, valueCounts and nullValueCounts should be present, while lowerBounds and + // upperBounds should be null. 
+ assertCounts(fileFormat, schema, genericRecords, dataFile); + assertBoundsNull(schema, dataFile); + assertColumnSize(fileFormat, dataFile); + } + + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testDataWriterMetricsWithTruncateMode(FileFormat fileFormat) throws IOException { + int truncateLength = 5; + Schema schema = + new Schema( + Types.NestedField.required(1, "col_str", Types.StringType.get()), + Types.NestedField.required(2, "col_int", Types.IntegerType.get())); + + List records = Lists.newArrayList(); + records.add(GenericRecord.create(schema).copy("col_str", "abcdefghij", "col_int", 10)); + records.add(GenericRecord.create(schema).copy("col_str", "abcdezyxwv", "col_int", 20)); + records.add(GenericRecord.create(schema).copy("col_str", "abcdeAAAAA", "col_int", 5)); + + assertTruncateBoundsForFirstColumn( + fileFormat, + schema, + records, + truncateLength, + FEATURE_COLUMN_LEVEL_METRICS, + (lower, upper) -> { + // Lower bound: "abcdeAAAAA" truncated to "abcde" + CharSequence actualLower = Conversions.fromByteBuffer(Types.StringType.get(), lower); + assertThat(actualLower.toString()).hasSize(truncateLength); + assertThat(actualLower.toString()).isEqualTo("abcde"); + + // Upper bound: "abcdezyxwv" truncated and incremented to "abcdf" + CharSequence actualUpper = Conversions.fromByteBuffer(Types.StringType.get(), upper); + assertThat(actualUpper.toString()).hasSize(truncateLength); + assertThat(actualUpper.toString()).isEqualTo("abcdf"); + }); + } + + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testDataWriterMetricsWithTruncateModeForBinary(FileFormat fileFormat) throws IOException { + int truncateLength = 5; + Schema schema = + new Schema( + Types.NestedField.required(1, "col_bin", Types.BinaryType.get()), + Types.NestedField.required(2, "col_int", Types.IntegerType.get())); + + List records = Lists.newArrayList(); + records.add( + GenericRecord.create(schema) + .copy( + "col_bin", + ByteBuffer.wrap( + new byte[] {0x1, 0x2, 0x3, 0x4, 
0x5, 0x6, 0x7, 0x8, 0x9, 0x10, 0xA, 0xB}), + "col_int", + 10)); + + assertTruncateBoundsForFirstColumn( + fileFormat, + schema, + records, + truncateLength, + FEATURE_COLUMN_METRICS_TRUNCATE_BINARY, + (lower, upper) -> { + ByteBuffer actualLower = Conversions.fromByteBuffer(Types.BinaryType.get(), lower); + ByteBuffer actualUpper = Conversions.fromByteBuffer(Types.BinaryType.get(), upper); + + ByteBuffer expectedLower = ByteBuffer.wrap(new byte[] {0x1, 0x2, 0x3, 0x4, 0x5}); + ByteBuffer expectedUpper = ByteBuffer.wrap(new byte[] {0x1, 0x2, 0x3, 0x4, 0x6}); + + assertThat(actualLower).isEqualTo(expectedLower); + assertThat(actualUpper).isEqualTo(expectedUpper); + }); + } + + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testEqualityDeleteWriterMetricsCollection(FileFormat fileFormat) throws IOException { + DataGenerator dataGenerator = new DataGenerators.DefaultSchema(); + Schema schema = dataGenerator.schema(); + FileWriterBuilder, Object> writerBuilder = + FormatModelRegistry.equalityDeleteWriteBuilder(fileFormat, Record.class, encryptedFile); + + EqualityDeleteWriter writer = + writerBuilder + .schema(schema) + .spec(PartitionSpec.unpartitioned()) + .equalityFieldIds(1) + .build(); + + List genericRecords = dataGenerator.generateRecords(); + + try (writer) { + genericRecords.forEach(writer::write); + } + + DeleteFile deleteFile = writer.toDeleteFile(); + + assertCounts(fileFormat, schema, genericRecords, deleteFile); + assertBounds(fileFormat, schema, genericRecords, deleteFile); + assertColumnSize(fileFormat, deleteFile); + } + + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testPositionDeleteWriterMetricsSingleFile(FileFormat fileFormat) throws IOException { + // Single file reference: counts are removed but bounds are preserved. 
+ List> deletes = + ImmutableList.of( + PositionDelete.create().set("d-file-1.file", 0L), + PositionDelete.create().set("d-file-1.file", 5L), + PositionDelete.create().set("d-file-1.file", 3L)); + + DeleteFile deleteFile = writePositionDeletes(fileFormat, deletes); + assertPositionDeleteMetrics(fileFormat, deletes, deleteFile, true /* checkBounds */); + } + + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testPositionDeleteWriterMetricsMultipleFiles(FileFormat fileFormat) throws IOException { + // Multiple file references: both counts and bounds are removed. + List> deletes = + ImmutableList.of( + PositionDelete.create().set("d-file-1.file", 0L), + PositionDelete.create().set("d-file-1.file", 5L), + PositionDelete.create().set("d-file-2.file", 3L)); + + DeleteFile deleteFile = writePositionDeletes(fileFormat, deletes); + assertPositionDeleteMetrics(fileFormat, deletes, deleteFile, false /* checkBounds */); + } + + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testDataWriterMetricsWithPerColumnMode(FileFormat fileFormat) throws IOException { + DataGenerator dataGenerator = new DataGenerators.DefaultSchema(); + Schema schema = dataGenerator.schema(); + + // Default mode is "counts", col_b is overridden to "full", col_a is overridden to "none" + MetricsConfig perColumnConfig = + config( + schema, + MetricsModes.Counts.get(), + ImmutableMap.of("col_b", MetricsModes.Full.get(), "col_a", MetricsModes.None.get())); + + List genericRecords = dataGenerator.generateRecords(); + DataFile dataFile = writeGenericRecords(fileFormat, schema, genericRecords, perColumnConfig); + + // col_a: mode=none -> no valueCounts, nullValueCounts, bounds + Schema noneSchema = new Schema(schema.findField("col_a")); + assertCountsNull(noneSchema, dataFile); + assertBoundsNull(noneSchema, dataFile); + + // col_b: mode=full -> valueCounts, nullValueCounts, and bounds all present + Schema fullSchema = new Schema(schema.findField("col_b")); + assertCounts(fileFormat, fullSchema, 
genericRecords, dataFile); + assertBounds(fileFormat, fullSchema, genericRecords, dataFile); + + // col_c, col_d, col_e: mode=counts (default) -> valueCounts and nullValueCounts present, + // but no bounds + Schema countsSchema = + new Schema(schema.findField("col_c"), schema.findField("col_d"), schema.findField("col_e")); + assertCounts(fileFormat, countsSchema, genericRecords, dataFile); + assertBoundsNull(countsSchema, dataFile); + } + + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testDataWriterNanMetrics(FileFormat fileFormat) throws IOException { + Schema schema = new DataGenerators.FloatDoubleSchema().schema(); + + List records = Lists.newArrayList(); + records.add( + GenericRecord.create(schema).copy("col_float", Float.NaN, "col_double", Double.NaN)); + records.add( + GenericRecord.create(schema).copy("col_float", Float.NaN, "col_double", Double.NaN)); + records.add(GenericRecord.create(schema).copy("col_float", 1.0F, "col_double", 10.0D)); + records.add(GenericRecord.create(schema).copy("col_float", 5.0F, "col_double", 50.0D)); + records.add(GenericRecord.create(schema).copy("col_float", 3.0F, "col_double", 30.0D)); + + DataFile dataFile = writeGenericRecords(fileFormat, schema, records); + + assertCounts(fileFormat, schema, records, dataFile); + assertBounds(fileFormat, schema, records, dataFile); + assertNanCounts(fileFormat, schema, records, dataFile); + } + + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testDataWriterNanSortingOrder(FileFormat fileFormat) throws IOException { + Schema schema = new DataGenerators.FloatDoubleSchema().schema(); + + List records = Lists.newArrayList(); + records.add( + GenericRecord.create(schema).copy("col_float", Float.NaN, "col_double", Double.NaN)); + records.add( + GenericRecord.create(schema) + .copy("col_float", Float.NEGATIVE_INFINITY, "col_double", Double.NEGATIVE_INFINITY)); + records.add(GenericRecord.create(schema).copy("col_float", -1.0F, "col_double", -1.0D)); + 
records.add(GenericRecord.create(schema).copy("col_float", -0.0F, "col_double", -0.0D)); + records.add(GenericRecord.create(schema).copy("col_float", 0.0F, "col_double", 0.0D)); + records.add(GenericRecord.create(schema).copy("col_float", 1.0F, "col_double", 1.0D)); + records.add( + GenericRecord.create(schema) + .copy("col_float", Float.POSITIVE_INFINITY, "col_double", Double.POSITIVE_INFINITY)); + records.add( + GenericRecord.create(schema).copy("col_float", Float.NaN, "col_double", Double.NaN)); + + DataFile dataFile = writeGenericRecords(fileFormat, schema, records); + + // Bounds should exclude NaN: float/double lower = -Infinity, upper = +Infinity + assertBounds(fileFormat, schema, records, dataFile); + assertNanCounts(fileFormat, schema, records, dataFile); + } + + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testDataWriterNegativeZeroBounds(FileFormat fileFormat) throws IOException { + Schema schema = new DataGenerators.FloatDoubleSchema().schema(); + + List records = Lists.newArrayList(); + records.add(GenericRecord.create(schema).copy("col_float", -0.0F, "col_double", -0.0D)); + records.add(GenericRecord.create(schema).copy("col_float", 0.0F, "col_double", 0.0D)); + + DataFile dataFile = writeGenericRecords(fileFormat, schema, records); + assertBounds(fileFormat, schema, records, dataFile); + } + private void readAndAssertGenericRecords( FileFormat fileFormat, Schema schema, List expected) throws IOException { InputFile inputFile = encryptedFile.encryptingOutputFile().toInputFile(); @@ -619,14 +913,25 @@ private void readAndAssertGenericRecords( .build()) { readRecords = ImmutableList.copyOf(reader); } + DataTestHelpers.assertEquals(schema.asStruct(), expected, readRecords); } - private void writeGenericRecords(FileFormat fileFormat, Schema schema, List records) + private DataFile writeGenericRecords(FileFormat fileFormat, Schema schema, List records) + throws IOException { + return writeGenericRecords(fileFormat, schema, records, null); + } + 
+ private DataFile writeGenericRecords( + FileFormat fileFormat, Schema schema, List records, MetricsConfig metricsConfig) throws IOException { FileWriterBuilder, Object> writerBuilder = FormatModelRegistry.dataWriteBuilder(fileFormat, Record.class, encryptedFile); + if (metricsConfig != null) { + writerBuilder.metricsConfig(metricsConfig); + } + DataWriter writer = writerBuilder.schema(schema).spec(PartitionSpec.unpartitioned()).build(); @@ -638,6 +943,8 @@ private void writeGenericRecords(FileFormat fileFormat, Schema schema, List projectRecords(List records, Schema projectedSchema) { @@ -662,6 +969,26 @@ private static void assumeSupports(FileFormat fileFormat, String feature) { assumeThat(MISSING_FEATURES.getOrDefault(fileFormat, new String[] {})).doesNotContain(feature); } + /** + * Returns whether the given file format supports the specified feature. + * + *

The check is based on {@link #MISSING_FEATURES}. Features not listed as missing for a format + * are treated as supported. + * + *

Prefer this method over {@link #assumeSupports(FileFormat, String)} when only part of a test + * should be skipped conditionally. Unlike {@code assumeSupports}, this method does not abort the + * entire test via an assumption failure; it returns {@code false} so callers can skip only + * feature-specific assertions while still validating shared behavior. + * + * @param fileFormat the file format under test + * @param feature the feature name + * @return {@code true} if the feature is supported by the format; {@code false} otherwise + */ + private static boolean supportsFeature(FileFormat fileFormat, String feature) { + String[] missing = MISSING_FEATURES.getOrDefault(fileFormat, new String[] {}); + return !Arrays.asList(missing).contains(feature); + } + private DataFile writeRecordsForSplit(FileFormat fileFormat, Schema schema, List records) throws IOException { @@ -700,4 +1027,252 @@ private static String splitSizeProperty(FileFormat fileFormat) { "No split size property defined for format: " + fileFormat); }; } + + private static void assertCounts( + FileFormat fileFormat, Schema schema, List genericRecords, ContentFile file) { + if (!supportsFeature(fileFormat, FEATURE_COLUMN_LEVEL_METRICS)) { + return; + } + + Map valueCounts = file.valueCounts(); + Map nullValueCounts = file.nullValueCounts(); + for (Types.NestedField field : schema.columns()) { + if (field.type().isPrimitiveType()) { + assertThat(valueCounts).containsKey(field.fieldId()); + assertThat(nullValueCounts).containsKey(field.fieldId()); + + long nullCount = + genericRecords.stream().filter(r -> r.getField(field.name()) == null).count(); + + assertThat(valueCounts.get(field.fieldId())).isEqualTo(genericRecords.size()); + assertThat(nullValueCounts.get(field.fieldId())).isEqualTo(nullCount); + } + } + } + + private static void assertBounds( + FileFormat fileFormat, Schema schema, List genericRecords, ContentFile file) { + if (!supportsFeature(fileFormat, FEATURE_COLUMN_LEVEL_METRICS)) { + return; + } 
+ + Map lowerBounds = file.lowerBounds(); + Map upperBounds = file.upperBounds(); + for (Types.NestedField field : schema.columns()) { + if (field.type().isPrimitiveType()) { + assertThat(lowerBounds).containsKey(field.fieldId()); + assertThat(upperBounds).containsKey(field.fieldId()); + + ByteBuffer lowerBuffer = lowerBounds.get(field.fieldId()); + ByteBuffer upperBuffer = upperBounds.get(field.fieldId()); + + Comparator cmp = Comparators.forType(field.type().asPrimitiveType()); + + Object[] minMax = computeMinMax(genericRecords, field, cmp); + Object expectedMin = minMax[0]; + Object expectedMax = minMax[1]; + + if (expectedMin != null) { + assertThat(lowerBuffer).isNotNull(); + Object actualLower = Conversions.fromByteBuffer(field.type(), lowerBuffer); + assertThat(cmp.compare(actualLower, expectedMin)).isEqualTo(0); + } + + if (expectedMax != null) { + assertThat(upperBuffer).isNotNull(); + Object actualUpper = Conversions.fromByteBuffer(field.type(), upperBuffer); + assertThat(cmp.compare(actualUpper, expectedMax)).isEqualTo(0); + } + } + } + } + + private static Object[] computeMinMax( + List records, Types.NestedField field, Comparator cmp) { + Object min = null; + Object max = null; + for (Record record : records) { + Object value = record.getField(field.name()); + if (value == null) { + continue; + } + + if (value instanceof Float && ((Float) value).isNaN()) { + continue; + } + + if (value instanceof Double && ((Double) value).isNaN()) { + continue; + } + + if (min == null || cmp.compare(value, min) < 0) { + min = value; + } + + if (max == null || cmp.compare(value, max) > 0) { + max = value; + } + } + + return new Object[] {min, max}; + } + + private static void assertBoundsNull(Schema schema, ContentFile file) { + Map lowerBounds = file.lowerBounds(); + Map upperBounds = file.upperBounds(); + for (Types.NestedField field : schema.columns()) { + if (field.type().isPrimitiveType()) { + assertThat(lowerBounds == null || lowerBounds.get(field.fieldId()) == 
null).isTrue(); + assertThat(upperBounds == null || upperBounds.get(field.fieldId()) == null).isTrue(); + } + } + } + + private static void assertColumnSize(FileFormat fileFormat, ContentFile file) { + if (!supportsFeature(fileFormat, FEATURE_COLUMN_LEVEL_METRICS)) { + return; + } + + assertThat(file.columnSizes()).isNotNull().isNotEmpty(); + } + + private static void assertColumnSizeEmpty(FileFormat fileFormat, ContentFile file) { + if (!supportsFeature(fileFormat, FEATURE_COLUMN_LEVEL_METRICS)) { + return; + } + + assertThat(file.columnSizes()).isEmpty(); + } + + private static void assertCountsNull(Schema schema, ContentFile file) { + Map valueCounts = file.valueCounts(); + Map nullValueCounts = file.nullValueCounts(); + for (Types.NestedField field : schema.columns()) { + if (field.type().isPrimitiveType()) { + assertThat(valueCounts == null || valueCounts.get(field.fieldId()) == null).isTrue(); + assertThat(nullValueCounts == null || nullValueCounts.get(field.fieldId()) == null) + .isTrue(); + } + } + } + + private static void assertNanCounts( + FileFormat fileFormat, Schema schema, List records, ContentFile file) { + if (!supportsFeature(fileFormat, FEATURE_COLUMN_LEVEL_METRICS)) { + return; + } + + Map nanValueCounts = file.nanValueCounts(); + assertThat(nanValueCounts).isNotNull(); + + for (Types.NestedField field : schema.columns()) { + if (field.type().typeId() == Type.TypeID.FLOAT + || field.type().typeId() == Type.TypeID.DOUBLE) { + long expectedNanCount = + records.stream() + .map(r -> r.getField(field.name())) + .filter( + v -> + (v instanceof Float && ((Float) v).isNaN()) + || (v instanceof Double && ((Double) v).isNaN())) + .count(); + assertThat(nanValueCounts.get(field.fieldId())).isEqualTo(expectedNanCount); + } + } + } + + private DeleteFile writePositionDeletes(FileFormat fileFormat, List> deletes) + throws IOException { + FileWriterBuilder, ?> writerBuilder = + FormatModelRegistry.positionDeleteWriteBuilder(fileFormat, encryptedFile); + + 
PositionDeleteWriter writer = writerBuilder.spec(PartitionSpec.unpartitioned()).build(); + try (writer) { + deletes.forEach(writer::write); + } + + return writer.toDeleteFile(); + } + + private void assertPositionDeleteMetrics( + FileFormat fileFormat, + List> deletes, + DeleteFile deleteFile, + boolean checkBounds) { + Schema positionDeleteSchema = DeleteSchemaUtil.pathPosSchema(); + + assertThat(deleteFile).isNotNull(); + assertThat(deleteFile.recordCount()).isEqualTo(deletes.size()); + assertCountsNull(positionDeleteSchema, deleteFile); + + assumeSupports(fileFormat, FEATURE_COLUMN_LEVEL_METRICS); + + if (checkBounds) { + // Single file reference: bounds are preserved + List genericRecords = + deletes.stream() + .map( + d -> + GenericRecord.create(positionDeleteSchema) + .copy( + DELETE_FILE_PATH.name(), d.path(), + DELETE_FILE_POS.name(), d.pos())) + .toList(); + assertBounds(fileFormat, positionDeleteSchema, genericRecords, deleteFile); + } else { + // Multiple file references: bounds are also removed + assertBoundsNull(positionDeleteSchema, deleteFile); + } + } + + private MetricsConfig config(Schema schema, MetricsMode defaultMode) { + return config(schema, defaultMode, ImmutableMap.of()); + } + + private MetricsConfig config( + Schema schema, MetricsMode defaultMode, Map columnModes) { + ImmutableMap.Builder properties = ImmutableMap.builder(); + properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, defaultMode.toString()); + columnModes.forEach( + (column, mode) -> + properties.put( + TableProperties.METRICS_MODE_COLUMN_CONF_PREFIX + column, mode.toString())); + + TestTables.TestTable table = + TestTables.create( + tableDir, "test", schema, PartitionSpec.unpartitioned(), 3, properties.build()); + + return MetricsConfig.forTable(table); + } + + private void assertTruncateBoundsForFirstColumn( + FileFormat fileFormat, + Schema schema, + List records, + int truncateLength, + String requiredFeature, + BiConsumer boundsAssertion) + throws IOException { + 
MetricsConfig truncateConfig = config(schema, MetricsModes.Truncate.withLength(truncateLength)); + + DataFile dataFile = writeGenericRecords(fileFormat, schema, records, truncateConfig); + assertCounts(fileFormat, schema, records, dataFile); + + if (!supportsFeature(fileFormat, requiredFeature)) { + return; + } + + Map lowerBounds = dataFile.lowerBounds(); + Map upperBounds = dataFile.upperBounds(); + + assertThat(lowerBounds).containsKey(1); + assertThat(upperBounds).containsKey(1); + + boundsAssertion.accept(lowerBounds.get(1), upperBounds.get(1)); + + Schema intSchema = new Schema(schema.findField("col_int")); + assertBounds(fileFormat, intSchema, records, dataFile); + + assertThat(dataFile.columnSizes()).isNotNull().isNotEmpty(); + } } diff --git a/data/src/test/java/org/apache/iceberg/data/DataGenerators.java b/data/src/test/java/org/apache/iceberg/data/DataGenerators.java index 325a8b191b07..390c0949cb72 100644 --- a/data/src/test/java/org/apache/iceberg/data/DataGenerators.java +++ b/data/src/test/java/org/apache/iceberg/data/DataGenerators.java @@ -64,4 +64,16 @@ public Schema schema() { return schema; } } + + static class FloatDoubleSchema implements DataGenerator { + private final Schema schema = + new Schema( + Types.NestedField.required(1, "col_float", Types.FloatType.get()), + Types.NestedField.required(2, "col_double", Types.DoubleType.get())); + + @Override + public Schema schema() { + return schema; + } + } } From f475ccb5c579b8a1d4842507b12e854161dea7af Mon Sep 17 00:00:00 2001 From: Denys Kuzmenko Date: Fri, 24 Apr 2026 16:49:05 +0300 Subject: [PATCH 094/197] ORC: Fix connection leak in OrcIterable (#16086) --- .../org/apache/iceberg/orc/OrcIterable.java | 2 + .../orc/TestOrcIterableResourceCleanup.java | 133 ++++++++++++++++++ 2 files changed, 135 insertions(+) create mode 100644 orc/src/test/java/org/apache/iceberg/orc/TestOrcIterableResourceCleanup.java diff --git a/orc/src/main/java/org/apache/iceberg/orc/OrcIterable.java 
b/orc/src/main/java/org/apache/iceberg/orc/OrcIterable.java index 119b3c54f278..0f65f1b65d9c 100644 --- a/orc/src/main/java/org/apache/iceberg/orc/OrcIterable.java +++ b/orc/src/main/java/org/apache/iceberg/orc/OrcIterable.java @@ -103,6 +103,8 @@ public CloseableIterator iterator() { VectorizedRowBatchIterator rowBatchIterator = newOrcIterator(file, readOrcSchema, start, length, orcFileReader, sarg, recordsPerBatch); + addCloseable(rowBatchIterator); + if (batchReaderFunction != null) { OrcBatchReader batchReader = (OrcBatchReader) batchReaderFunction.apply(readOrcSchema); return CloseableIterator.transform( diff --git a/orc/src/test/java/org/apache/iceberg/orc/TestOrcIterableResourceCleanup.java b/orc/src/test/java/org/apache/iceberg/orc/TestOrcIterableResourceCleanup.java new file mode 100644 index 000000000000..6e819af5574b --- /dev/null +++ b/orc/src/test/java/org/apache/iceberg/orc/TestOrcIterableResourceCleanup.java @@ -0,0 +1,133 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.orc; + +import java.io.File; +import java.io.IOException; +import java.util.List; +import org.apache.iceberg.Files; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.data.GenericRecord; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.data.orc.GenericOrcReader; +import org.apache.iceberg.data.orc.GenericOrcWriter; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.io.CloseableIterator; +import org.apache.iceberg.io.DataWriter; +import org.apache.iceberg.io.InputFile; +import org.apache.iceberg.io.OutputFile; +import org.apache.iceberg.io.SeekableInputStream; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; +import org.mockito.Mockito; + +public class TestOrcIterableResourceCleanup { + + private static final Schema SCHEMA = + new Schema( + Types.NestedField.required(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get())); + + @TempDir private File temp; + + @Test + public void testClosingIterableClosesAllStreams() throws IOException { + List inputStreams = Lists.newArrayList(); + InputFile inputFile = spyOnStreams(writeTestOrcFile(), inputStreams); + + try (CloseableIterable iterable = newOrcIterable(inputFile)) { + try (CloseableIterator iterator = iterable.iterator()) { + drain(iterator); + } + } + + verifyAllStreamsClosed(inputStreams); + } + + @Test + public void testClosingIterableClosesIteratorResources() throws IOException { + List inputStreams = Lists.newArrayList(); + InputFile inputFile = spyOnStreams(writeTestOrcFile(), inputStreams); + + // Without addCloseable(rowBatchIterator) in OrcIterable, the VectorizedRowBatchIterator + // and its RecordReader are never closed, 
leaking ORC input streams / file handles. + for (int round = 0; round < 5; round++) { + try (CloseableIterable iterable = newOrcIterable(inputFile)) { + drain(iterable.iterator()); + } + } + + verifyAllStreamsClosed(inputStreams); + } + + private static void drain(CloseableIterator iterator) { + while (iterator.hasNext()) { + iterator.next(); + } + } + + private InputFile writeTestOrcFile() throws IOException { + OutputFile outputFile = Files.localOutput(File.createTempFile("test", ".orc", temp)); + try (DataWriter writer = + ORC.writeData(outputFile) + .schema(SCHEMA) + .createWriterFunc(GenericOrcWriter::buildWriter) + .overwrite() + .withSpec(PartitionSpec.unpartitioned()) + .build()) { + GenericRecord record = GenericRecord.create(SCHEMA); + for (int i = 0; i < 10; i++) { + writer.write(record.copy(ImmutableMap.of("id", (long) i, "data", "val" + i))); + } + } + + return outputFile.toInputFile(); + } + + private static CloseableIterable newOrcIterable(InputFile input) { + return ORC.read(input) + .project(SCHEMA) + .createReaderFunc(fileSchema -> GenericOrcReader.buildReader(SCHEMA, fileSchema)) + .build(); + } + + private static void verifyAllStreamsClosed(List streams) throws IOException { + for (SeekableInputStream stream : streams) { + Mockito.verify(stream, Mockito.times(1)).close(); + } + } + + private static InputFile spyOnStreams(InputFile delegate, List streams) { + InputFile inputFile = Mockito.spy(delegate); + Mockito.doAnswer( + invocation -> { + SeekableInputStream real = (SeekableInputStream) invocation.callRealMethod(); + SeekableInputStream inputStream = Mockito.spy(real); + streams.add(inputStream); + return inputStream; + }) + .when(inputFile) + .newStream(); + return inputFile; + } +} From 57b7c2bd564e5dfa8ccb7cb9e6386e7e942bac0f Mon Sep 17 00:00:00 2001 From: Bharath Krishna Date: Fri, 24 Apr 2026 09:16:06 -0700 Subject: [PATCH 095/197] API: Use column bounds to evaluate startsWith in StrictMetricsEvaluator (#15902) --- 
.../expressions/StrictMetricsEvaluator.java | 36 ++++++++ .../TestStrictMetricsEvaluator.java | 91 +++++++++++++++++++ 2 files changed, 127 insertions(+) diff --git a/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java b/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java index 5d981e7ed139..f57ba8bc2793 100644 --- a/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java +++ b/api/src/main/java/org/apache/iceberg/expressions/StrictMetricsEvaluator.java @@ -464,6 +464,42 @@ public Boolean notIn(BoundReference ref, Set literalSet) { @Override public Boolean startsWith(BoundReference ref, Literal lit) { + int id = ref.fieldId(); + if (isNestedColumn(id)) { + return ROWS_MIGHT_NOT_MATCH; + } + + if (canContainNulls(id)) { + return ROWS_MIGHT_NOT_MATCH; + } + + if (lowerBounds != null + && lowerBounds.containsKey(id) + && upperBounds != null + && upperBounds.containsKey(id)) { + String prefix = (String) lit.value(); + Comparator comparator = Comparators.charSequences(); + CharSequence lower = Conversions.fromByteBuffer(ref.type(), lowerBounds.get(id)); + CharSequence upper = Conversions.fromByteBuffer(ref.type(), upperBounds.get(id)); + + // if lower is shorter than the prefix then lower doesn't start with the prefix + if (lower.length() < prefix.length()) { + return ROWS_MIGHT_NOT_MATCH; + } + + if (comparator.compare(lower.subSequence(0, prefix.length()), prefix) == 0) { + // if upper is shorter than the prefix then upper can't start with the prefix + if (upper.length() < prefix.length()) { + return ROWS_MIGHT_NOT_MATCH; + } + + if (comparator.compare(upper.subSequence(0, prefix.length()), prefix) == 0) { + // both bounds start with the prefix, so all rows must start with the prefix + return ROWS_MUST_MATCH; + } + } + } + return ROWS_MIGHT_NOT_MATCH; } diff --git a/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java 
b/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java index 99800f5171ba..b55f4efb1726 100644 --- a/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java +++ b/api/src/test/java/org/apache/iceberg/expressions/TestStrictMetricsEvaluator.java @@ -34,6 +34,7 @@ import static org.apache.iceberg.expressions.Expressions.notNull; import static org.apache.iceberg.expressions.Expressions.notStartsWith; import static org.apache.iceberg.expressions.Expressions.or; +import static org.apache.iceberg.expressions.Expressions.startsWith; import static org.apache.iceberg.types.Conversions.toByteBuffer; import static org.apache.iceberg.types.Types.NestedField.optional; import static org.apache.iceberg.types.Types.NestedField.required; @@ -774,6 +775,57 @@ public void testNotStartsWithNoStats() { assertThat(shouldRead).as("Should not match: no bounds available for column").isFalse(); } + @Test + void testStartsWithBothBoundsMatchPrefix() { + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, startsWith("required", "ab")).eval(STRING_FILE); + assertThat(shouldRead).as("Should match: both bounds start with the prefix").isTrue(); + } + + @Test + void testStartsWithSingleCharPrefixBothBoundsMatch() { + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, startsWith("required", "a")).eval(STRING_FILE); + assertThat(shouldRead) + .as("Should match: both bounds start with the single char prefix") + .isTrue(); + } + + @Test + void testStartsWithOnlyLowerBoundMatchesPrefix() { + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, startsWith("required", "abc")).eval(STRING_FILE); + assertThat(shouldRead) + .as("Should not match: upper bound does not start with the prefix") + .isFalse(); + } + + @Test + void testStartsWithBoundsDoNotMatchPrefix() { + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, startsWith("required", "zzz")).eval(STRING_FILE); + assertThat(shouldRead).as("Should not match: no bounds start 
with the prefix").isFalse(); + } + + @Test + void testStartsWithWiderRange() { + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, startsWith("required", "a")).eval(STRING_FILE_2); + assertThat(shouldRead) + .as("Should not match: upper bound does not start with the prefix") + .isFalse(); + + shouldRead = + new StrictMetricsEvaluator(SCHEMA, startsWith("required", "e")).eval(STRING_FILE_2); + assertThat(shouldRead).as("Should not match: no bounds start with the prefix").isFalse(); + } + + @Test + void testStartsWithNoStats() { + boolean shouldRead = new StrictMetricsEvaluator(SCHEMA, startsWith("required", "a")).eval(FILE); + assertThat(shouldRead).as("Should not match: no bounds available for column").isFalse(); + } + @Test public void testNotStartsWithSomeNullsBoundsOutsidePrefix() { boolean shouldRead = @@ -830,4 +882,43 @@ public void testNotStartsWithNestedColumn() { .eval(FILE); assertThat(shouldRead).as("notStartsWith nested column should not match").isFalse(); } + + @Test + void testStartsWithAllNulls() { + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, startsWith("all_nulls", "a")).eval(FILE); + assertThat(shouldRead) + .as("Should not match: all null values do not satisfy startsWith") + .isFalse(); + } + + @Test + void testStartsWithSomeNulls() { + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, startsWith("some_nulls", "b")).eval(FILE_2); + assertThat(shouldRead) + .as("Should not match: some nulls means not all rows can satisfy startsWith") + .isFalse(); + } + + @Test + void testStartsWithPrefixLongerThanBounds() { + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, startsWith("required", "abcdef")).eval(STRING_FILE); + assertThat(shouldRead).as("Should not match: prefix is longer than the bounds").isFalse(); + } + + @Test + void testStartsWithEmptyPrefix() { + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, startsWith("required", "")).eval(STRING_FILE); + assertThat(shouldRead).as("Should match: all 
strings start with empty prefix").isTrue(); + } + + @Test + void testStartsWithNestedColumn() { + boolean shouldRead = + new StrictMetricsEvaluator(SCHEMA, startsWith("struct.nested_string_col", "a")).eval(FILE); + assertThat(shouldRead).as("Should not match: nested column is not supported").isFalse(); + } } From 8c217a35b0eb78f04b4127e38eee0683bb6f1931 Mon Sep 17 00:00:00 2001 From: Chase Zhang Date: Sat, 25 Apr 2026 04:28:41 +0800 Subject: [PATCH 096/197] Flink: Fix watermark value which should be min timestamp minus one (#15884) --- .../WatermarkExtractorRecordEmitter.java | 7 +- ...stIcebergSourceWithWatermarkExtractor.java | 62 +++++++ .../TestWatermarkExtractorRecordEmitter.java | 164 ++++++++++++++++++ .../WatermarkExtractorRecordEmitter.java | 7 +- ...stIcebergSourceWithWatermarkExtractor.java | 62 +++++++ .../TestWatermarkExtractorRecordEmitter.java | 164 ++++++++++++++++++ .../WatermarkExtractorRecordEmitter.java | 7 +- ...stIcebergSourceWithWatermarkExtractor.java | 62 +++++++ .../TestWatermarkExtractorRecordEmitter.java | 164 ++++++++++++++++++ 9 files changed, 693 insertions(+), 6 deletions(-) create mode 100644 flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestWatermarkExtractorRecordEmitter.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestWatermarkExtractorRecordEmitter.java create mode 100644 flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestWatermarkExtractorRecordEmitter.java diff --git a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/reader/WatermarkExtractorRecordEmitter.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/reader/WatermarkExtractorRecordEmitter.java index 02ef57d344b1..3af9957875e8 100644 --- a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/reader/WatermarkExtractorRecordEmitter.java +++ 
b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/reader/WatermarkExtractorRecordEmitter.java @@ -34,7 +34,7 @@ class WatermarkExtractorRecordEmitter implements SerializableRecordEmitter private static final Logger LOG = LoggerFactory.getLogger(WatermarkExtractorRecordEmitter.class); private final SplitWatermarkExtractor timeExtractor; private String lastSplitId = null; - private long watermark; + private long watermark = Long.MIN_VALUE; WatermarkExtractorRecordEmitter(SplitWatermarkExtractor timeExtractor) { this.timeExtractor = timeExtractor; @@ -44,7 +44,10 @@ class WatermarkExtractorRecordEmitter implements SerializableRecordEmitter public void emitRecord( RecordAndPosition element, SourceOutput output, IcebergSourceSplit split) { if (!split.splitId().equals(lastSplitId)) { - long newWatermark = timeExtractor.extractWatermark(split); + long extracted = timeExtractor.extractWatermark(split); + // Subtract 1 because watermark W means all records with eventTime <= W have arrived; + // records in this split have eventTime == extracted, so watermark must be extracted - 1. + long newWatermark = extracted > Long.MIN_VALUE ? extracted - 1 : Long.MIN_VALUE; if (newWatermark < watermark) { LOG.info( "Received a new split with lower watermark. 
Previous watermark = {}, current watermark = {}, previous split = {}, current split = {}", diff --git a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceWithWatermarkExtractor.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceWithWatermarkExtractor.java index 70889f4f76aa..fff9b96b3b7c 100644 --- a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceWithWatermarkExtractor.java +++ b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceWithWatermarkExtractor.java @@ -225,6 +225,68 @@ public void apply( 3))); } + /** + * Integration test verifying that records with eventTime equal to the minimum timestamp of their + * split are correctly included in windows. The {@link + * org.apache.iceberg.flink.source.reader.WatermarkExtractorRecordEmitter} emits the watermark as + * {@code minSplitTs - 1}, so records at exactly {@code minSplitTs} are on-time rather than late. + * + *

The test writes 3 records at epoch (t=0). The split's column-stats lower-bound is 0, so the + * extracted watermark is 0ms and the emitted watermark is -1ms. Records at t=0 are strictly after + * that watermark and therefore belong to the [0, 5min) window. A later split is then appended to + * advance the watermark past the window boundary and trigger its evaluation. + */ + @Test + public void testWindowingWithRecordsAtSplitMinTimestamp() throws Exception { + GenericAppenderHelper dataAppender = appender(); + + // File 1: 3 records at exactly t=0 (epoch). Extracted watermark = 0ms, emitted = -1ms. + List batch = + ImmutableList.of( + generateRecord(0, "file_1-recordTs_0_a"), + generateRecord(0, "file_1-recordTs_0_b"), + generateRecord(0, "file_1-recordTs_0_c")); + dataAppender.appendToTable(batch); + + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + env.setParallelism(1); + + DataStream stream = + env.fromSource( + source(), + WatermarkStrategy.noWatermarks() + .withTimestampAssigner(new RowDataTimestampAssigner()), + SOURCE_NAME, + TypeInformation.of(RowData.class)); + + stream + .windowAll(TumblingEventTimeWindows.of(Time.minutes(5))) + .apply( + new AllWindowFunction() { + @Override + public void apply( + TimeWindow window, Iterable values, Collector out) { + AtomicInteger count = new AtomicInteger(0); + values.forEach(a -> count.incrementAndGet()); + out.collect(row(window.getStart(), count.get())); + WINDOWS.put(window.getStart(), count.get()); + } + }); + + WINDOWS.clear(); + env.executeAsync("Iceberg Source Min Timestamp Windowing Test"); + + // Append a file with much later timestamps to advance the watermark past [0, 5min) + dataAppender.appendToTable( + dataAppender.writeFile(ImmutableList.of(generateRecord(1500, "last-record")))); + + // The [0, 5min) window should fire with all 3 records written at epoch + Awaitility.await() + .pollInterval(Duration.ofMillis(10)) + .atMost(30, TimeUnit.SECONDS) + .until(() -> 
Integer.valueOf(3).equals(WINDOWS.get(0L))); + } + /** * This is an integration test for watermark handling and throttling. Integration testing the * following: diff --git a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestWatermarkExtractorRecordEmitter.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestWatermarkExtractorRecordEmitter.java new file mode 100644 index 000000000000..1cb7be03c6a7 --- /dev/null +++ b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestWatermarkExtractorRecordEmitter.java @@ -0,0 +1,164 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source.reader; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.IOException; +import java.nio.file.Path; +import java.util.List; +import java.util.Map; +import org.apache.flink.api.common.eventtime.Watermark; +import org.apache.flink.api.connector.source.SourceOutput; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +public class TestWatermarkExtractorRecordEmitter { + @TempDir protected Path temporaryFolder; + + @Test + public void testWatermarkIsDecrementedByOne() throws IOException { + long extractedWatermark = 1000L; + IcebergSourceSplit split = createSplit(1L); + + WatermarkExtractorRecordEmitter emitter = + new WatermarkExtractorRecordEmitter<>(s -> extractedWatermark); + + CapturingSourceOutput output = new CapturingSourceOutput<>(); + emitter.emitRecord(new RecordAndPosition<>("record", 0, 0L), output, split); + + assertThat(output.watermarks).hasSize(1); + assertThat(output.watermarks.get(0).getTimestamp()).isEqualTo(extractedWatermark - 1); + } + + @Test + public void testWatermarkEmittedOnlyOncePerSplit() throws IOException { + IcebergSourceSplit split = createSplit(1L); + + WatermarkExtractorRecordEmitter emitter = + new WatermarkExtractorRecordEmitter<>(s -> 1000L); + + CapturingSourceOutput output = new CapturingSourceOutput<>(); + RecordAndPosition element = new RecordAndPosition<>("record", 0, 0L); + emitter.emitRecord(element, output, split); + emitter.emitRecord(element, output, split); + emitter.emitRecord(element, output, split); + + assertThat(output.watermarks).hasSize(1); + assertThat(output.records).hasSize(3); + } + + 
@Test + public void testWatermarkNotEmittedWhenNewSplitHasLowerValue() throws IOException { + IcebergSourceSplit split1 = createSplit(1L); + IcebergSourceSplit split2 = createSplit(2L); + + Map watermarkMap = Maps.newHashMap(); + watermarkMap.put(split1.splitId(), 2000L); + watermarkMap.put(split2.splitId(), 1000L); + + WatermarkExtractorRecordEmitter emitter = + new WatermarkExtractorRecordEmitter<>(s -> watermarkMap.get(s.splitId())); + + CapturingSourceOutput output = new CapturingSourceOutput<>(); + RecordAndPosition element = new RecordAndPosition<>("record", 0, 0L); + emitter.emitRecord(element, output, split1); + emitter.emitRecord(element, output, split2); + + // Only split1's watermark is emitted; split2 has a lower value so it's skipped + assertThat(output.watermarks).hasSize(1); + assertThat(output.watermarks.get(0).getTimestamp()).isEqualTo(1999L); + } + + @Test + public void testWatermarkEmittedForEachHigherSplit() throws IOException { + IcebergSourceSplit split1 = createSplit(1L); + IcebergSourceSplit split2 = createSplit(2L); + + Map watermarkMap = Maps.newHashMap(); + watermarkMap.put(split1.splitId(), 1000L); + watermarkMap.put(split2.splitId(), 2000L); + + WatermarkExtractorRecordEmitter emitter = + new WatermarkExtractorRecordEmitter<>(s -> watermarkMap.get(s.splitId())); + + CapturingSourceOutput output = new CapturingSourceOutput<>(); + RecordAndPosition element = new RecordAndPosition<>("record", 0, 0L); + emitter.emitRecord(element, output, split1); + emitter.emitRecord(element, output, split2); + + assertThat(output.watermarks).hasSize(2); + assertThat(output.watermarks.get(0).getTimestamp()).isEqualTo(999L); + assertThat(output.watermarks.get(1).getTimestamp()).isEqualTo(1999L); + } + + @Test + public void testWatermarkAtLongMinValueDoesNotOverflow() throws IOException { + IcebergSourceSplit split = createSplit(1L); + + WatermarkExtractorRecordEmitter emitter = + new WatermarkExtractorRecordEmitter<>(s -> Long.MIN_VALUE); + + 
CapturingSourceOutput output = new CapturingSourceOutput<>(); + emitter.emitRecord(new RecordAndPosition<>("record", 0, 0L), output, split); + + assertThat(output.watermarks).hasSize(1); + assertThat(output.watermarks.get(0).getTimestamp()).isEqualTo(Long.MIN_VALUE); + } + + private IcebergSourceSplit createSplit(long seed) throws IOException { + List> recordBatchList = + ReaderUtil.createRecordBatchList(seed, TestFixtures.SCHEMA, 1, 1); + return IcebergSourceSplit.fromCombinedScanTask( + ReaderUtil.createCombinedScanTask( + recordBatchList, temporaryFolder, FileFormat.PARQUET, TestFixtures.SCHEMA)); + } + + private static class CapturingSourceOutput implements SourceOutput { + final List watermarks = Lists.newArrayList(); + final List records = Lists.newArrayList(); + + @Override + public void collect(T record) { + records.add(record); + } + + @Override + public void collect(T record, long timestamp) { + records.add(record); + } + + @Override + public void emitWatermark(Watermark watermark) { + watermarks.add(watermark); + } + + @Override + public void markIdle() {} + + @Override + public void markActive() {} + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/WatermarkExtractorRecordEmitter.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/WatermarkExtractorRecordEmitter.java index 02ef57d344b1..3af9957875e8 100644 --- a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/WatermarkExtractorRecordEmitter.java +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/WatermarkExtractorRecordEmitter.java @@ -34,7 +34,7 @@ class WatermarkExtractorRecordEmitter implements SerializableRecordEmitter private static final Logger LOG = LoggerFactory.getLogger(WatermarkExtractorRecordEmitter.class); private final SplitWatermarkExtractor timeExtractor; private String lastSplitId = null; - private long watermark; + private long watermark = Long.MIN_VALUE; 
WatermarkExtractorRecordEmitter(SplitWatermarkExtractor timeExtractor) { this.timeExtractor = timeExtractor; @@ -44,7 +44,10 @@ class WatermarkExtractorRecordEmitter implements SerializableRecordEmitter public void emitRecord( RecordAndPosition element, SourceOutput output, IcebergSourceSplit split) { if (!split.splitId().equals(lastSplitId)) { - long newWatermark = timeExtractor.extractWatermark(split); + long extracted = timeExtractor.extractWatermark(split); + // Subtract 1 because watermark W means all records with eventTime <= W have arrived; + // records in this split have eventTime == extracted, so watermark must be extracted - 1. + long newWatermark = extracted > Long.MIN_VALUE ? extracted - 1 : Long.MIN_VALUE; if (newWatermark < watermark) { LOG.info( "Received a new split with lower watermark. Previous watermark = {}, current watermark = {}, previous split = {}, current split = {}", diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceWithWatermarkExtractor.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceWithWatermarkExtractor.java index f84cf7fb1aae..ec9333674d03 100644 --- a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceWithWatermarkExtractor.java +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceWithWatermarkExtractor.java @@ -225,6 +225,68 @@ public void apply( 3))); } + /** + * Integration test verifying that records with eventTime equal to the minimum timestamp of their + * split are correctly included in windows. The {@link + * org.apache.iceberg.flink.source.reader.WatermarkExtractorRecordEmitter} emits the watermark as + * {@code minSplitTs - 1}, so records at exactly {@code minSplitTs} are on-time rather than late. + * + *

The test writes 3 records at epoch (t=0). The split's column-stats lower-bound is 0, so the + * extracted watermark is 0ms and the emitted watermark is -1ms. Records at t=0 are strictly after + * that watermark and therefore belong to the [0, 5min) window. A later split is then appended to + * advance the watermark past the window boundary and trigger its evaluation. + */ + @Test + public void testWindowingWithRecordsAtSplitMinTimestamp() throws Exception { + GenericAppenderHelper dataAppender = appender(); + + // File 1: 3 records at exactly t=0 (epoch). Extracted watermark = 0ms, emitted = -1ms. + List batch = + ImmutableList.of( + generateRecord(0, "file_1-recordTs_0_a"), + generateRecord(0, "file_1-recordTs_0_b"), + generateRecord(0, "file_1-recordTs_0_c")); + dataAppender.appendToTable(batch); + + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + env.setParallelism(1); + + DataStream stream = + env.fromSource( + source(), + WatermarkStrategy.noWatermarks() + .withTimestampAssigner(new RowDataTimestampAssigner()), + SOURCE_NAME, + TypeInformation.of(RowData.class)); + + stream + .windowAll(TumblingEventTimeWindows.of(Duration.ofMinutes(5))) + .apply( + new AllWindowFunction() { + @Override + public void apply( + TimeWindow window, Iterable values, Collector out) { + AtomicInteger count = new AtomicInteger(0); + values.forEach(a -> count.incrementAndGet()); + out.collect(row(window.getStart(), count.get())); + WINDOWS.put(window.getStart(), count.get()); + } + }); + + WINDOWS.clear(); + env.executeAsync("Iceberg Source Min Timestamp Windowing Test"); + + // Append a file with much later timestamps to advance the watermark past [0, 5min) + dataAppender.appendToTable( + dataAppender.writeFile(ImmutableList.of(generateRecord(1500, "last-record")))); + + // The [0, 5min) window should fire with all 3 records written at epoch + Awaitility.await() + .pollInterval(Duration.ofMillis(10)) + .atMost(30, TimeUnit.SECONDS) + .until(() 
-> Integer.valueOf(3).equals(WINDOWS.get(0L))); + } + /** * This is an integration test for watermark handling and throttling. Integration testing the * following: diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestWatermarkExtractorRecordEmitter.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestWatermarkExtractorRecordEmitter.java new file mode 100644 index 000000000000..1cb7be03c6a7 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestWatermarkExtractorRecordEmitter.java @@ -0,0 +1,164 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source.reader; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.IOException; +import java.nio.file.Path; +import java.util.List; +import java.util.Map; +import org.apache.flink.api.common.eventtime.Watermark; +import org.apache.flink.api.connector.source.SourceOutput; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +public class TestWatermarkExtractorRecordEmitter { + @TempDir protected Path temporaryFolder; + + @Test + public void testWatermarkIsDecrementedByOne() throws IOException { + long extractedWatermark = 1000L; + IcebergSourceSplit split = createSplit(1L); + + WatermarkExtractorRecordEmitter emitter = + new WatermarkExtractorRecordEmitter<>(s -> extractedWatermark); + + CapturingSourceOutput output = new CapturingSourceOutput<>(); + emitter.emitRecord(new RecordAndPosition<>("record", 0, 0L), output, split); + + assertThat(output.watermarks).hasSize(1); + assertThat(output.watermarks.get(0).getTimestamp()).isEqualTo(extractedWatermark - 1); + } + + @Test + public void testWatermarkEmittedOnlyOncePerSplit() throws IOException { + IcebergSourceSplit split = createSplit(1L); + + WatermarkExtractorRecordEmitter emitter = + new WatermarkExtractorRecordEmitter<>(s -> 1000L); + + CapturingSourceOutput output = new CapturingSourceOutput<>(); + RecordAndPosition element = new RecordAndPosition<>("record", 0, 0L); + emitter.emitRecord(element, output, split); + emitter.emitRecord(element, output, split); + emitter.emitRecord(element, output, split); + + assertThat(output.watermarks).hasSize(1); + assertThat(output.records).hasSize(3); + } + + 
@Test + public void testWatermarkNotEmittedWhenNewSplitHasLowerValue() throws IOException { + IcebergSourceSplit split1 = createSplit(1L); + IcebergSourceSplit split2 = createSplit(2L); + + Map watermarkMap = Maps.newHashMap(); + watermarkMap.put(split1.splitId(), 2000L); + watermarkMap.put(split2.splitId(), 1000L); + + WatermarkExtractorRecordEmitter emitter = + new WatermarkExtractorRecordEmitter<>(s -> watermarkMap.get(s.splitId())); + + CapturingSourceOutput output = new CapturingSourceOutput<>(); + RecordAndPosition element = new RecordAndPosition<>("record", 0, 0L); + emitter.emitRecord(element, output, split1); + emitter.emitRecord(element, output, split2); + + // Only split1's watermark is emitted; split2 has a lower value so it's skipped + assertThat(output.watermarks).hasSize(1); + assertThat(output.watermarks.get(0).getTimestamp()).isEqualTo(1999L); + } + + @Test + public void testWatermarkEmittedForEachHigherSplit() throws IOException { + IcebergSourceSplit split1 = createSplit(1L); + IcebergSourceSplit split2 = createSplit(2L); + + Map watermarkMap = Maps.newHashMap(); + watermarkMap.put(split1.splitId(), 1000L); + watermarkMap.put(split2.splitId(), 2000L); + + WatermarkExtractorRecordEmitter emitter = + new WatermarkExtractorRecordEmitter<>(s -> watermarkMap.get(s.splitId())); + + CapturingSourceOutput output = new CapturingSourceOutput<>(); + RecordAndPosition element = new RecordAndPosition<>("record", 0, 0L); + emitter.emitRecord(element, output, split1); + emitter.emitRecord(element, output, split2); + + assertThat(output.watermarks).hasSize(2); + assertThat(output.watermarks.get(0).getTimestamp()).isEqualTo(999L); + assertThat(output.watermarks.get(1).getTimestamp()).isEqualTo(1999L); + } + + @Test + public void testWatermarkAtLongMinValueDoesNotOverflow() throws IOException { + IcebergSourceSplit split = createSplit(1L); + + WatermarkExtractorRecordEmitter emitter = + new WatermarkExtractorRecordEmitter<>(s -> Long.MIN_VALUE); + + 
CapturingSourceOutput output = new CapturingSourceOutput<>(); + emitter.emitRecord(new RecordAndPosition<>("record", 0, 0L), output, split); + + assertThat(output.watermarks).hasSize(1); + assertThat(output.watermarks.get(0).getTimestamp()).isEqualTo(Long.MIN_VALUE); + } + + private IcebergSourceSplit createSplit(long seed) throws IOException { + List> recordBatchList = + ReaderUtil.createRecordBatchList(seed, TestFixtures.SCHEMA, 1, 1); + return IcebergSourceSplit.fromCombinedScanTask( + ReaderUtil.createCombinedScanTask( + recordBatchList, temporaryFolder, FileFormat.PARQUET, TestFixtures.SCHEMA)); + } + + private static class CapturingSourceOutput implements SourceOutput { + final List watermarks = Lists.newArrayList(); + final List records = Lists.newArrayList(); + + @Override + public void collect(T record) { + records.add(record); + } + + @Override + public void collect(T record, long timestamp) { + records.add(record); + } + + @Override + public void emitWatermark(Watermark watermark) { + watermarks.add(watermark); + } + + @Override + public void markIdle() {} + + @Override + public void markActive() {} + } +} diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/WatermarkExtractorRecordEmitter.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/WatermarkExtractorRecordEmitter.java index 02ef57d344b1..3af9957875e8 100644 --- a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/WatermarkExtractorRecordEmitter.java +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/WatermarkExtractorRecordEmitter.java @@ -34,7 +34,7 @@ class WatermarkExtractorRecordEmitter implements SerializableRecordEmitter private static final Logger LOG = LoggerFactory.getLogger(WatermarkExtractorRecordEmitter.class); private final SplitWatermarkExtractor timeExtractor; private String lastSplitId = null; - private long watermark; + private long watermark = Long.MIN_VALUE; 
WatermarkExtractorRecordEmitter(SplitWatermarkExtractor timeExtractor) { this.timeExtractor = timeExtractor; @@ -44,7 +44,10 @@ class WatermarkExtractorRecordEmitter implements SerializableRecordEmitter public void emitRecord( RecordAndPosition element, SourceOutput output, IcebergSourceSplit split) { if (!split.splitId().equals(lastSplitId)) { - long newWatermark = timeExtractor.extractWatermark(split); + long extracted = timeExtractor.extractWatermark(split); + // Subtract 1 because watermark W means all records with eventTime <= W have arrived; + // records in this split have eventTime == extracted, so watermark must be extracted - 1. + long newWatermark = extracted > Long.MIN_VALUE ? extracted - 1 : Long.MIN_VALUE; if (newWatermark < watermark) { LOG.info( "Received a new split with lower watermark. Previous watermark = {}, current watermark = {}, previous split = {}, current split = {}", diff --git a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceWithWatermarkExtractor.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceWithWatermarkExtractor.java index f84cf7fb1aae..ec9333674d03 100644 --- a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceWithWatermarkExtractor.java +++ b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/TestIcebergSourceWithWatermarkExtractor.java @@ -225,6 +225,68 @@ public void apply( 3))); } + /** + * Integration test verifying that records with eventTime equal to the minimum timestamp of their + * split are correctly included in windows. The {@link + * org.apache.iceberg.flink.source.reader.WatermarkExtractorRecordEmitter} emits the watermark as + * {@code minSplitTs - 1}, so records at exactly {@code minSplitTs} are on-time rather than late. + * + *

The test writes 3 records at epoch (t=0). The split's column-stats lower-bound is 0, so the + * extracted watermark is 0ms and the emitted watermark is -1ms. Records at t=0 are strictly after + * that watermark and therefore belong to the [0, 5min) window. A later split is then appended to + * advance the watermark past the window boundary and trigger its evaluation. + */ + @Test + public void testWindowingWithRecordsAtSplitMinTimestamp() throws Exception { + GenericAppenderHelper dataAppender = appender(); + + // File 1: 3 records at exactly t=0 (epoch). Extracted watermark = 0ms, emitted = -1ms. + List batch = + ImmutableList.of( + generateRecord(0, "file_1-recordTs_0_a"), + generateRecord(0, "file_1-recordTs_0_b"), + generateRecord(0, "file_1-recordTs_0_c")); + dataAppender.appendToTable(batch); + + StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + env.setParallelism(1); + + DataStream stream = + env.fromSource( + source(), + WatermarkStrategy.noWatermarks() + .withTimestampAssigner(new RowDataTimestampAssigner()), + SOURCE_NAME, + TypeInformation.of(RowData.class)); + + stream + .windowAll(TumblingEventTimeWindows.of(Duration.ofMinutes(5))) + .apply( + new AllWindowFunction() { + @Override + public void apply( + TimeWindow window, Iterable values, Collector out) { + AtomicInteger count = new AtomicInteger(0); + values.forEach(a -> count.incrementAndGet()); + out.collect(row(window.getStart(), count.get())); + WINDOWS.put(window.getStart(), count.get()); + } + }); + + WINDOWS.clear(); + env.executeAsync("Iceberg Source Min Timestamp Windowing Test"); + + // Append a file with much later timestamps to advance the watermark past [0, 5min) + dataAppender.appendToTable( + dataAppender.writeFile(ImmutableList.of(generateRecord(1500, "last-record")))); + + // The [0, 5min) window should fire with all 3 records written at epoch + Awaitility.await() + .pollInterval(Duration.ofMillis(10)) + .atMost(30, TimeUnit.SECONDS) + .until(() 
-> Integer.valueOf(3).equals(WINDOWS.get(0L))); + } + /** * This is an integration test for watermark handling and throttling. Integration testing the * following: diff --git a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestWatermarkExtractorRecordEmitter.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestWatermarkExtractorRecordEmitter.java new file mode 100644 index 000000000000..1cb7be03c6a7 --- /dev/null +++ b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/source/reader/TestWatermarkExtractorRecordEmitter.java @@ -0,0 +1,164 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.source.reader; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.IOException; +import java.nio.file.Path; +import java.util.List; +import java.util.Map; +import org.apache.flink.api.common.eventtime.Watermark; +import org.apache.flink.api.connector.source.SourceOutput; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.flink.TestFixtures; +import org.apache.iceberg.flink.source.split.IcebergSourceSplit; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +public class TestWatermarkExtractorRecordEmitter { + @TempDir protected Path temporaryFolder; + + @Test + public void testWatermarkIsDecrementedByOne() throws IOException { + long extractedWatermark = 1000L; + IcebergSourceSplit split = createSplit(1L); + + WatermarkExtractorRecordEmitter emitter = + new WatermarkExtractorRecordEmitter<>(s -> extractedWatermark); + + CapturingSourceOutput output = new CapturingSourceOutput<>(); + emitter.emitRecord(new RecordAndPosition<>("record", 0, 0L), output, split); + + assertThat(output.watermarks).hasSize(1); + assertThat(output.watermarks.get(0).getTimestamp()).isEqualTo(extractedWatermark - 1); + } + + @Test + public void testWatermarkEmittedOnlyOncePerSplit() throws IOException { + IcebergSourceSplit split = createSplit(1L); + + WatermarkExtractorRecordEmitter emitter = + new WatermarkExtractorRecordEmitter<>(s -> 1000L); + + CapturingSourceOutput output = new CapturingSourceOutput<>(); + RecordAndPosition element = new RecordAndPosition<>("record", 0, 0L); + emitter.emitRecord(element, output, split); + emitter.emitRecord(element, output, split); + emitter.emitRecord(element, output, split); + + assertThat(output.watermarks).hasSize(1); + assertThat(output.records).hasSize(3); + } + + 
@Test + public void testWatermarkNotEmittedWhenNewSplitHasLowerValue() throws IOException { + IcebergSourceSplit split1 = createSplit(1L); + IcebergSourceSplit split2 = createSplit(2L); + + Map watermarkMap = Maps.newHashMap(); + watermarkMap.put(split1.splitId(), 2000L); + watermarkMap.put(split2.splitId(), 1000L); + + WatermarkExtractorRecordEmitter emitter = + new WatermarkExtractorRecordEmitter<>(s -> watermarkMap.get(s.splitId())); + + CapturingSourceOutput output = new CapturingSourceOutput<>(); + RecordAndPosition element = new RecordAndPosition<>("record", 0, 0L); + emitter.emitRecord(element, output, split1); + emitter.emitRecord(element, output, split2); + + // Only split1's watermark is emitted; split2 has a lower value so it's skipped + assertThat(output.watermarks).hasSize(1); + assertThat(output.watermarks.get(0).getTimestamp()).isEqualTo(1999L); + } + + @Test + public void testWatermarkEmittedForEachHigherSplit() throws IOException { + IcebergSourceSplit split1 = createSplit(1L); + IcebergSourceSplit split2 = createSplit(2L); + + Map watermarkMap = Maps.newHashMap(); + watermarkMap.put(split1.splitId(), 1000L); + watermarkMap.put(split2.splitId(), 2000L); + + WatermarkExtractorRecordEmitter emitter = + new WatermarkExtractorRecordEmitter<>(s -> watermarkMap.get(s.splitId())); + + CapturingSourceOutput output = new CapturingSourceOutput<>(); + RecordAndPosition element = new RecordAndPosition<>("record", 0, 0L); + emitter.emitRecord(element, output, split1); + emitter.emitRecord(element, output, split2); + + assertThat(output.watermarks).hasSize(2); + assertThat(output.watermarks.get(0).getTimestamp()).isEqualTo(999L); + assertThat(output.watermarks.get(1).getTimestamp()).isEqualTo(1999L); + } + + @Test + public void testWatermarkAtLongMinValueDoesNotOverflow() throws IOException { + IcebergSourceSplit split = createSplit(1L); + + WatermarkExtractorRecordEmitter emitter = + new WatermarkExtractorRecordEmitter<>(s -> Long.MIN_VALUE); + + 
CapturingSourceOutput output = new CapturingSourceOutput<>(); + emitter.emitRecord(new RecordAndPosition<>("record", 0, 0L), output, split); + + assertThat(output.watermarks).hasSize(1); + assertThat(output.watermarks.get(0).getTimestamp()).isEqualTo(Long.MIN_VALUE); + } + + private IcebergSourceSplit createSplit(long seed) throws IOException { + List> recordBatchList = + ReaderUtil.createRecordBatchList(seed, TestFixtures.SCHEMA, 1, 1); + return IcebergSourceSplit.fromCombinedScanTask( + ReaderUtil.createCombinedScanTask( + recordBatchList, temporaryFolder, FileFormat.PARQUET, TestFixtures.SCHEMA)); + } + + private static class CapturingSourceOutput implements SourceOutput { + final List watermarks = Lists.newArrayList(); + final List records = Lists.newArrayList(); + + @Override + public void collect(T record) { + records.add(record); + } + + @Override + public void collect(T record, long timestamp) { + records.add(record); + } + + @Override + public void emitWatermark(Watermark watermark) { + watermarks.add(watermark); + } + + @Override + public void markIdle() {} + + @Override + public void markActive() {} + } +} From 03347ff6c971d2785dbd12956f27a2627c9558fe Mon Sep 17 00:00:00 2001 From: GuoYu <511955993@qq.com> Date: Sat, 25 Apr 2026 04:34:56 +0800 Subject: [PATCH 097/197] Data: Add TCK tests for Metadata Columns in BaseFormatModelTests (#15675) --- .../iceberg/data/BaseFormatModelTests.java | 534 +++++++++++++++++- .../flink/data/TestFlinkFormatModel.java | 26 + .../flink/data/TestFlinkFormatModel.java | 26 + .../flink/data/TestFlinkFormatModel.java | 26 + .../spark/data/TestSparkFormatModel.java | 7 + .../spark/data/TestSparkFormatModel.java | 7 + .../spark/data/TestSparkFormatModel.java | 7 + .../spark/data/TestSparkFormatModel.java | 7 + 8 files changed, 624 insertions(+), 16 deletions(-) diff --git a/data/src/test/java/org/apache/iceberg/data/BaseFormatModelTests.java b/data/src/test/java/org/apache/iceberg/data/BaseFormatModelTests.java index 
8a47132975be..d0b8e3161bdf 100644 --- a/data/src/test/java/org/apache/iceberg/data/BaseFormatModelTests.java +++ b/data/src/test/java/org/apache/iceberg/data/BaseFormatModelTests.java @@ -35,16 +35,21 @@ import java.util.Map; import java.util.UUID; import java.util.function.BiConsumer; +import java.util.function.BiFunction; +import java.util.function.Function; import java.util.stream.IntStream; import org.apache.iceberg.ContentFile; import org.apache.iceberg.DataFile; import org.apache.iceberg.DeleteFile; import org.apache.iceberg.FileFormat; +import org.apache.iceberg.MetadataColumns; import org.apache.iceberg.MetricsConfig; import org.apache.iceberg.MetricsModes; import org.apache.iceberg.MetricsModes.MetricsMode; +import org.apache.iceberg.PartitionData; import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Schema; +import org.apache.iceberg.StructLike; import org.apache.iceberg.TableProperties; import org.apache.iceberg.TestTables; import org.apache.iceberg.deletes.EqualityDeleteWriter; @@ -57,6 +62,7 @@ import org.apache.iceberg.exceptions.ValidationException; import org.apache.iceberg.expressions.Expression; import org.apache.iceberg.expressions.Expressions; +import org.apache.iceberg.expressions.Literal; import org.apache.iceberg.formats.FileWriterBuilder; import org.apache.iceberg.formats.FormatModelRegistry; import org.apache.iceberg.inmemory.InMemoryFileIO; @@ -88,6 +94,8 @@ public abstract class BaseFormatModelTests { protected abstract void assertEquals(Schema schema, List expected, List actual); + protected abstract Object convertConstantToEngine(Type type, Object value); + protected boolean supportsBatchReads() { return false; } @@ -108,7 +116,9 @@ protected boolean supportsBatchReads() { static final String FEATURE_FILTER = "filter"; static final String FEATURE_CASE_SENSITIVE = "caseSensitive"; static final String FEATURE_SPLIT = "split"; + static final String FEATURE_READER_DEFAULT = "readerDefault"; static final String 
FEATURE_REUSE_CONTAINERS = "reuseContainers"; + static final String FEATURE_META_ROW_LINEAGE = "metaRowLineage"; static final String FEATURE_COLUMN_LEVEL_METRICS = "columnLevelMetrics"; static final String FEATURE_COLUMN_METRICS_TRUNCATE_BINARY = "columnMetricsTruncateBinary"; @@ -123,7 +133,12 @@ protected boolean supportsBatchReads() { FEATURE_COLUMN_METRICS_TRUNCATE_BINARY }, FileFormat.ORC, - new String[] {FEATURE_REUSE_CONTAINERS, FEATURE_COLUMN_METRICS_TRUNCATE_BINARY}); + new String[] { + FEATURE_REUSE_CONTAINERS, + FEATURE_COLUMN_METRICS_TRUNCATE_BINARY, + FEATURE_META_ROW_LINEAGE, + FEATURE_READER_DEFAULT + }); private InMemoryFileIO fileIO; private EncryptedOutputFile encryptedFile; @@ -395,7 +410,10 @@ void testReaderBuilderProjection(FileFormat fileFormat) throws IOException { List genericRecords = dataGenerator.generateRecords(); writeGenericRecords(fileFormat, fullSchema, genericRecords); - List projectedGenericRecords = projectRecords(genericRecords, projectedSchema); + List projectedGenericRecords = + genericRecords.stream() + .map(record -> copy(record, projectedSchema, projectedSchema)) + .toList(); List expectedEngineRecords = convertToEngineRecords(projectedGenericRecords, projectedSchema); @@ -617,6 +635,50 @@ void testReaderBuilderReuseContainers(FileFormat fileFormat) throws IOException reuseRecords.forEach(r -> assertThat(r).isSameAs(reuseRecords.get(0))); } + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testReaderSchemaEvolutionNewColumnWithDefault(FileFormat fileFormat) throws IOException { + + assumeSupports(fileFormat, FEATURE_READER_DEFAULT); + DataGenerator dataGenerator = new DataGenerators.DefaultSchema(); + Schema writeSchema = dataGenerator.schema(); + + List genericRecords = dataGenerator.generateRecords(); + writeGenericRecords(fileFormat, writeSchema, genericRecords); + + String defaultStringValue = "default_value"; + int defaultIntValue = 42; + + int maxFieldId = + 
writeSchema.columns().stream().mapToInt(Types.NestedField::fieldId).max().orElse(0); + + List evolvedColumns = Lists.newArrayList(writeSchema.columns()); + evolvedColumns.add( + Types.NestedField.required("col_f") + .withId(maxFieldId + 1) + .ofType(Types.StringType.get()) + .withInitialDefault(Literal.of(defaultStringValue)) + .build()); + evolvedColumns.add( + Types.NestedField.optional("col_g") + .withId(maxFieldId + 2) + .ofType(Types.IntegerType.get()) + .withInitialDefault(Literal.of(defaultIntValue)) + .build()); + + Schema evolvedSchema = new Schema(evolvedColumns); + readAndAssertGenericRecords( + fileFormat, + evolvedSchema, + genericRecords, + record -> { + Record expected = copy(record, writeSchema, evolvedSchema); + expected.setField("col_f", defaultStringValue); + expected.setField("col_g", defaultIntValue); + return expected; + }); + } + @ParameterizedTest @FieldSource("FILE_FORMATS") void testReaderBuilderRecordsPerBatchNotSupported(FileFormat fileFormat) throws IOException { @@ -903,6 +965,371 @@ void testDataWriterNegativeZeroBounds(FileFormat fileFormat) throws IOException assertBounds(fileFormat, schema, records, dataFile); } + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testReadMetadataColumnFilePath(FileFormat fileFormat) throws IOException { + + DataGenerator dataGenerator = new DataGenerators.DefaultSchema(); + Schema schema = dataGenerator.schema(); + List genericRecords = dataGenerator.generateRecords(); + writeGenericRecords(fileFormat, schema, genericRecords); + + String filePath = "test-data-file.parquet"; + Schema projectionSchema = new Schema(MetadataColumns.FILE_PATH); + + Map idToConstant = + ImmutableMap.of(MetadataColumns.FILE_PATH.fieldId(), filePath); + + readAndAssertMetadataColumn( + fileFormat, + projectionSchema, + idToConstant, + genericRecords, + ignored -> + GenericRecord.create(projectionSchema) + .copy(MetadataColumns.FILE_PATH.name(), filePath)); + } + + @ParameterizedTest + @FieldSource("FILE_FORMATS") + 
void testReadMetadataColumnSpecId(FileFormat fileFormat) throws IOException { + + DataGenerator dataGenerator = new DataGenerators.DefaultSchema(); + Schema schema = dataGenerator.schema(); + List genericRecords = dataGenerator.generateRecords(); + writeGenericRecords(fileFormat, schema, genericRecords); + + int specId = 0; + Schema projectionSchema = new Schema(MetadataColumns.SPEC_ID); + + Map idToConstant = ImmutableMap.of(MetadataColumns.SPEC_ID.fieldId(), specId); + + readAndAssertMetadataColumn( + fileFormat, + projectionSchema, + idToConstant, + genericRecords, + ignored -> + GenericRecord.create(projectionSchema).copy(MetadataColumns.SPEC_ID.name(), specId)); + } + + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testReadMetadataColumnRowPosition(FileFormat fileFormat) throws IOException { + + DataGenerator dataGenerator = new DataGenerators.DefaultSchema(); + Schema schema = dataGenerator.schema(); + List genericRecords = dataGenerator.generateRecords(); + writeGenericRecords(fileFormat, schema, genericRecords); + + Schema projectionSchema = new Schema(MetadataColumns.ROW_POSITION); + + readAndAssertMetadataColumn( + fileFormat, + projectionSchema, + null, + genericRecords, + (position, ignored) -> + GenericRecord.create(projectionSchema) + .copy(MetadataColumns.ROW_POSITION.name(), (long) position)); + } + + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testReadMetadataColumnIsDeleted(FileFormat fileFormat) throws IOException { + + DataGenerator dataGenerator = new DataGenerators.DefaultSchema(); + Schema schema = dataGenerator.schema(); + List genericRecords = dataGenerator.generateRecords(); + writeGenericRecords(fileFormat, schema, genericRecords); + + Schema projectionSchema = new Schema(MetadataColumns.IS_DELETED); + + readAndAssertMetadataColumn( + fileFormat, + projectionSchema, + null, + genericRecords, + ignored -> + GenericRecord.create(projectionSchema).copy(MetadataColumns.IS_DELETED.name(), false)); + } + + 
@ParameterizedTest + @FieldSource("FILE_FORMATS") + void testReadMetadataColumnRowLinage(FileFormat fileFormat) throws IOException { + assumeSupports(fileFormat, FEATURE_META_ROW_LINEAGE); + + DataGenerator dataGenerator = new DataGenerators.DefaultSchema(); + Schema schema = dataGenerator.schema(); + List genericRecords = dataGenerator.generateRecords(); + writeGenericRecords(fileFormat, schema, genericRecords); + + long baseRowId = 100L; + long fileSeqNumber = 5L; + Schema projectionSchema = + new Schema(MetadataColumns.ROW_ID, MetadataColumns.LAST_UPDATED_SEQUENCE_NUMBER); + + Map idToConstant = + ImmutableMap.of( + MetadataColumns.ROW_ID.fieldId(), baseRowId, + MetadataColumns.LAST_UPDATED_SEQUENCE_NUMBER.fieldId(), fileSeqNumber); + + readAndAssertMetadataColumn( + fileFormat, + projectionSchema, + idToConstant, + genericRecords, + (position, ignored) -> + GenericRecord.create(projectionSchema) + .copy( + MetadataColumns.ROW_ID.name(), + baseRowId + position, + MetadataColumns.LAST_UPDATED_SEQUENCE_NUMBER.name(), + fileSeqNumber)); + } + + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testReadMetadataColumnRowLinageExistValue(FileFormat fileFormat) throws IOException { + assumeSupports(fileFormat, FEATURE_META_ROW_LINEAGE); + + DataGenerator dataGenerator = new DataGenerators.DefaultSchema(); + Schema dataSchema = dataGenerator.schema(); + + Schema writeSchema = MetadataColumns.schemaWithRowLineage(dataSchema); + + List baseRecords = dataGenerator.generateRecords(); + List writeRecords = Lists.newArrayListWithExpectedSize(baseRecords.size()); + for (int i = 0; i < baseRecords.size(); i++) { + Record base = baseRecords.get(i); + Record rec = copy(base, dataSchema, writeSchema); + + if (i % 2 == 0) { + rec.setField(MetadataColumns.ROW_ID.name(), 555L + i); + rec.setField(MetadataColumns.LAST_UPDATED_SEQUENCE_NUMBER.name(), 7L); + } else { + rec.setField(MetadataColumns.ROW_ID.name(), null); + 
rec.setField(MetadataColumns.LAST_UPDATED_SEQUENCE_NUMBER.name(), null); + } + + writeRecords.add(rec); + } + + DataWriter writer = + FormatModelRegistry.dataWriteBuilder(fileFormat, Record.class, encryptedFile) + .schema(writeSchema) + .spec(PartitionSpec.unpartitioned()) + .build(); + + try (writer) { + writeRecords.forEach(writer::write); + } + + long baseRowId = 100L; + long fileSeqNumber = 5L; + Schema projectionSchema = + new Schema(MetadataColumns.ROW_ID, MetadataColumns.LAST_UPDATED_SEQUENCE_NUMBER); + + Map idToConstant = + ImmutableMap.of( + MetadataColumns.ROW_ID.fieldId(), baseRowId, + MetadataColumns.LAST_UPDATED_SEQUENCE_NUMBER.fieldId(), fileSeqNumber); + + // Expected results: + // - Even rows (explicit values): _row_id = 555+i, _last_updated_sequence_number = 7 + // - Odd rows (null values): _row_id = baseRowId+pos, _last_updated_sequence_number = + // fileSeqNumber + readAndAssertMetadataColumn( + fileFormat, + projectionSchema, + idToConstant, + baseRecords, + (position, ignored) -> { + if (position % 2 == 0) { + return GenericRecord.create(projectionSchema) + .copy( + MetadataColumns.ROW_ID.name(), + 555L + position, + MetadataColumns.LAST_UPDATED_SEQUENCE_NUMBER.name(), + 7L); + } else { + return GenericRecord.create(projectionSchema) + .copy( + MetadataColumns.ROW_ID.name(), + baseRowId + position, + MetadataColumns.LAST_UPDATED_SEQUENCE_NUMBER.name(), + fileSeqNumber); + } + }); + } + + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testReadMetadataColumnPartitionIdentity(FileFormat fileFormat) throws IOException { + + DataGenerator dataGenerator = new DataGenerators.DefaultSchema(); + PartitionSpec spec = PartitionSpec.builderFor(dataGenerator.schema()).identity("col_a").build(); + + Types.StructType partitionType = spec.partitionType(); + PartitionData partitionData = new PartitionData(partitionType); + partitionData.set(0, "test_col_a"); + + DataWriter writer = + FormatModelRegistry.dataWriteBuilder(fileFormat, Record.class, 
encryptedFile) + .schema(dataGenerator.schema()) + .spec(PartitionSpec.unpartitioned()) + .build(); + + List records = dataGenerator.generateRecords(); + try (writer) { + records.forEach(writer::write); + } + + Types.NestedField partitionField = + Types.NestedField.optional( + MetadataColumns.PARTITION_COLUMN_ID, + MetadataColumns.PARTITION_COLUMN_NAME, + partitionType, + MetadataColumns.PARTITION_COLUMN_DOC); + Schema projectionSchema = new Schema(partitionField); + + Map idToConstant = + ImmutableMap.of(MetadataColumns.PARTITION_COLUMN_ID, partitionData); + + Record partitionRecord = structLikeToRecord(partitionData, partitionType); + readAndAssertMetadataColumn( + fileFormat, + projectionSchema, + idToConstant, + records, + ignored -> + GenericRecord.create(projectionSchema) + .copy(MetadataColumns.PARTITION_COLUMN_NAME, partitionRecord)); + } + + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testReadMetadataColumnPartitionEvolutionAddColumn(FileFormat fileFormat) throws IOException { + DataGenerator dataGenerator = new DataGenerators.DefaultSchema(); + Schema dataSchema = dataGenerator.schema(); + + // Old spec: partition by col_a only (spec id = 0) + PartitionSpec oldSpec = PartitionSpec.builderFor(dataSchema).identity("col_a").build(); + + // New spec: partition by col_a + col_b (spec id = 1, simulates partition evolution) + PartitionSpec newSpec = + PartitionSpec.builderFor(dataSchema) + .withSpecId(1) + .identity("col_a") + .identity("col_b") + .build(); + + // Partition data for the old file (only col_a is set, col_b is absent) + PartitionData oldPartitionData = new PartitionData(oldSpec.partitionType()); + oldPartitionData.set(0, "test_data"); + + // Write data using the old spec + DataWriter writer = + FormatModelRegistry.dataWriteBuilder(fileFormat, Record.class, encryptedFile) + .schema(dataSchema) + .spec(PartitionSpec.unpartitioned()) + .build(); + + List records = dataGenerator.generateRecords(); + + try (writer) { + 
records.forEach(writer::write); + } + + Types.StructType unifiedPartitionType = newSpec.partitionType(); + + // Build projection schema with PARTITION_COLUMN using the unified partition type + Types.NestedField partitionField = + Types.NestedField.optional( + MetadataColumns.PARTITION_COLUMN_ID, + MetadataColumns.PARTITION_COLUMN_NAME, + unifiedPartitionType, + MetadataColumns.PARTITION_COLUMN_DOC); + Schema projectionSchema = new Schema(partitionField); + + Map idToConstant = + ImmutableMap.of(MetadataColumns.PARTITION_COLUMN_ID, oldPartitionData); + + Record partitionRecord = structLikeToRecord(oldPartitionData, unifiedPartitionType); + readAndAssertMetadataColumn( + fileFormat, + projectionSchema, + idToConstant, + records, + ignored -> + GenericRecord.create(projectionSchema) + .copy(MetadataColumns.PARTITION_COLUMN_NAME, partitionRecord)); + } + + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testReadMetadataColumnPartitionEvolutionRemoveColumn(FileFormat fileFormat) + throws IOException { + DataGenerator dataGenerator = new DataGenerators.DefaultSchema(); + Schema dataSchema = dataGenerator.schema(); + + PartitionSpec oldSpec = + PartitionSpec.builderFor(dataSchema).identity("col_a").identity("col_b").build(); + + PartitionSpec newSpec = + PartitionSpec.builderFor(dataSchema).withSpecId(1).identity("col_a").build(); + + // Partition data for the old file (both col_a and col_b are set) + PartitionData oldPartitionData = new PartitionData(oldSpec.partitionType()); + oldPartitionData.set(0, "test_col_a"); + oldPartitionData.set(1, 1); + + DataWriter writer = + FormatModelRegistry.dataWriteBuilder(fileFormat, Record.class, encryptedFile) + .schema(dataSchema) + .spec(PartitionSpec.unpartitioned()) + .build(); + + List records = dataGenerator.generateRecords(); + + try (writer) { + records.forEach(writer::write); + } + + // Use the new spec's partition type for projection (only col_a remains after evolution) + // This simulates reading an old file from 
the perspective of the new spec + Types.StructType newPartitionType = newSpec.partitionType(); + Types.NestedField partitionField = + Types.NestedField.optional( + MetadataColumns.PARTITION_COLUMN_ID, + MetadataColumns.PARTITION_COLUMN_NAME, + newPartitionType, + MetadataColumns.PARTITION_COLUMN_DOC); + Schema projectionSchema = new Schema(partitionField); + + Map idToConstant = + ImmutableMap.of(MetadataColumns.PARTITION_COLUMN_ID, oldPartitionData); + + Record partitionRecord = structLikeToRecord(oldPartitionData, newPartitionType); + readAndAssertMetadataColumn( + fileFormat, + projectionSchema, + idToConstant, + records, + ignored -> + GenericRecord.create(projectionSchema) + .copy(MetadataColumns.PARTITION_COLUMN_NAME, partitionRecord)); + } + + private void readAndAssertGenericRecords( + FileFormat fileFormat, + Schema schema, + List sourceRecords, + Function transform) + throws IOException { + readAndAssertGenericRecords(fileFormat, schema, sourceRecords.stream().map(transform).toList()); + } + private void readAndAssertGenericRecords( FileFormat fileFormat, Schema schema, List expected) throws IOException { InputFile inputFile = encryptedFile.encryptingOutputFile().toInputFile(); @@ -947,20 +1374,6 @@ private DataFile writeGenericRecords( return dataFile; } - private List projectRecords(List records, Schema projectedSchema) { - return records.stream() - .map( - record -> { - Record projected = GenericRecord.create(projectedSchema.asStruct()); - projectedSchema - .columns() - .forEach( - field -> projected.setField(field.name(), record.getField(field.name()))); - return projected; - }) - .toList(); - } - private List convertToEngineRecords(List records, Schema schema) { return records.stream().map(r -> convertToEngine(r, schema)).toList(); } @@ -1275,4 +1688,93 @@ private void assertTruncateBoundsForFirstColumn( assertThat(dataFile.columnSizes()).isNotNull().isNotEmpty(); } + + private Map convertConstantsToEngine( + Schema projectionSchema, Map 
idToConstant) { + return idToConstant.entrySet().stream() + .collect( + ImmutableMap.toImmutableMap( + Map.Entry::getKey, + entry -> + convertConstantToEngine( + projectionSchema.findType(entry.getKey()), entry.getValue()))); + } + + private static Record structLikeToRecord(StructLike structLike, Types.StructType structType) { + Record record = GenericRecord.create(structType); + int sourceSize = structLike.size(); + for (int i = 0; i < structType.fields().size(); i++) { + if (i < sourceSize) { + record.set(i, structLike.get(i, Object.class)); + } else { + Types.NestedField field = structType.fields().get(i); + record.set(i, field.initialDefault()); + } + } + + return record; + } + + private void readAndAssertMetadataColumn( + FileFormat fileFormat, + Schema projectionSchema, + Map idToConstant, + List sourceRecords, + Function transform) + throws IOException { + readAndAssertMetadataColumn( + fileFormat, projectionSchema, idToConstant, sourceRecords.stream().map(transform).toList()); + } + + private void readAndAssertMetadataColumn( + FileFormat fileFormat, + Schema projectionSchema, + Map idToConstant, + List sourceRecords, + BiFunction transform) + throws IOException { + readAndAssertMetadataColumn( + fileFormat, + projectionSchema, + idToConstant, + IntStream.range(0, sourceRecords.size()) + .mapToObj(index -> transform.apply(index, sourceRecords.get(index))) + .toList()); + } + + private void readAndAssertMetadataColumn( + FileFormat fileFormat, + Schema projectionSchema, + Map idToConstant, + List expectedRecords) + throws IOException { + + InputFile inputFile = encryptedFile.encryptingOutputFile().toInputFile(); + List readRecords; + + var readerBuilder = + FormatModelRegistry.readBuilder(fileFormat, engineType(), inputFile) + .project(projectionSchema); + + if (idToConstant != null) { + readerBuilder.idToConstant(convertConstantsToEngine(projectionSchema, idToConstant)); + } + + try (CloseableIterable reader = readerBuilder.build()) { + readRecords = 
ImmutableList.copyOf(reader); + } + + assertThat(readRecords).hasSize(expectedRecords.size()); + assertEquals( + projectionSchema, convertToEngineRecords(expectedRecords, projectionSchema), readRecords); + } + + private static Record copy(Record source, Schema sourceSchema, Schema targetSchema) { + Record result = GenericRecord.create(targetSchema); + for (Types.NestedField col : sourceSchema.columns()) { + result.setField(col.name(), source.getField(col.name())); + } + + return result; + } } diff --git a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkFormatModel.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkFormatModel.java index 8c99fdf52110..1f0fe70ac53b 100644 --- a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkFormatModel.java +++ b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkFormatModel.java @@ -19,13 +19,17 @@ package org.apache.iceberg.flink.data; import java.util.List; +import org.apache.flink.table.data.GenericRowData; import org.apache.flink.table.data.RowData; +import org.apache.iceberg.PartitionData; import org.apache.iceberg.Schema; import org.apache.iceberg.data.BaseFormatModelTests; import org.apache.iceberg.data.Record; import org.apache.iceberg.flink.FlinkSchemaUtil; import org.apache.iceberg.flink.RowDataConverter; import org.apache.iceberg.flink.TestHelpers; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.Types; public class TestFlinkFormatModel extends BaseFormatModelTests { @@ -48,4 +52,26 @@ protected RowData convertToEngine(Record record, Schema schema) { protected void assertEquals(Schema schema, List expected, List actual) { TestHelpers.assertRows(actual, expected, FlinkSchemaUtil.convert(schema)); } + + @Override + protected Object convertConstantToEngine(Type type, Object value) { + if (value instanceof PartitionData partitionData) { + Types.StructType structType = type.asStructType(); + List fields = 
structType.fields(); + GenericRowData rowData = new GenericRowData(fields.size()); + int sourceSize = partitionData.size(); + for (int i = 0; i < fields.size(); i++) { + if (i < sourceSize) { + Object fieldValue = partitionData.get(i, Object.class); + rowData.setField(i, convertConstantToEngine(fields.get(i).type(), fieldValue)); + } else { + rowData.setField(i, null); + } + } + + return rowData; + } + + return RowDataUtil.convertConstant(type, value); + } } diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkFormatModel.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkFormatModel.java index 8c99fdf52110..1f0fe70ac53b 100644 --- a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkFormatModel.java +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkFormatModel.java @@ -19,13 +19,17 @@ package org.apache.iceberg.flink.data; import java.util.List; +import org.apache.flink.table.data.GenericRowData; import org.apache.flink.table.data.RowData; +import org.apache.iceberg.PartitionData; import org.apache.iceberg.Schema; import org.apache.iceberg.data.BaseFormatModelTests; import org.apache.iceberg.data.Record; import org.apache.iceberg.flink.FlinkSchemaUtil; import org.apache.iceberg.flink.RowDataConverter; import org.apache.iceberg.flink.TestHelpers; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.Types; public class TestFlinkFormatModel extends BaseFormatModelTests { @@ -48,4 +52,26 @@ protected RowData convertToEngine(Record record, Schema schema) { protected void assertEquals(Schema schema, List expected, List actual) { TestHelpers.assertRows(actual, expected, FlinkSchemaUtil.convert(schema)); } + + @Override + protected Object convertConstantToEngine(Type type, Object value) { + if (value instanceof PartitionData partitionData) { + Types.StructType structType = type.asStructType(); + List fields = structType.fields(); + GenericRowData rowData 
= new GenericRowData(fields.size()); + int sourceSize = partitionData.size(); + for (int i = 0; i < fields.size(); i++) { + if (i < sourceSize) { + Object fieldValue = partitionData.get(i, Object.class); + rowData.setField(i, convertConstantToEngine(fields.get(i).type(), fieldValue)); + } else { + rowData.setField(i, null); + } + } + + return rowData; + } + + return RowDataUtil.convertConstant(type, value); + } } diff --git a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkFormatModel.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkFormatModel.java index 8c99fdf52110..1f0fe70ac53b 100644 --- a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkFormatModel.java +++ b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkFormatModel.java @@ -19,13 +19,17 @@ package org.apache.iceberg.flink.data; import java.util.List; +import org.apache.flink.table.data.GenericRowData; import org.apache.flink.table.data.RowData; +import org.apache.iceberg.PartitionData; import org.apache.iceberg.Schema; import org.apache.iceberg.data.BaseFormatModelTests; import org.apache.iceberg.data.Record; import org.apache.iceberg.flink.FlinkSchemaUtil; import org.apache.iceberg.flink.RowDataConverter; import org.apache.iceberg.flink.TestHelpers; +import org.apache.iceberg.types.Type; +import org.apache.iceberg.types.Types; public class TestFlinkFormatModel extends BaseFormatModelTests { @@ -48,4 +52,26 @@ protected RowData convertToEngine(Record record, Schema schema) { protected void assertEquals(Schema schema, List expected, List actual) { TestHelpers.assertRows(actual, expected, FlinkSchemaUtil.convert(schema)); } + + @Override + protected Object convertConstantToEngine(Type type, Object value) { + if (value instanceof PartitionData partitionData) { + Types.StructType structType = type.asStructType(); + List fields = structType.fields(); + GenericRowData rowData = new GenericRowData(fields.size()); + int 
sourceSize = partitionData.size(); + for (int i = 0; i < fields.size(); i++) { + if (i < sourceSize) { + Object fieldValue = partitionData.get(i, Object.class); + rowData.setField(i, convertConstantToEngine(fields.get(i).type(), fieldValue)); + } else { + rowData.setField(i, null); + } + } + + return rowData; + } + + return RowDataUtil.convertConstant(type, value); + } } diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkFormatModel.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkFormatModel.java index c18e4c053f50..291bb2bca4f5 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkFormatModel.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkFormatModel.java @@ -25,6 +25,8 @@ import org.apache.iceberg.data.BaseFormatModelTests; import org.apache.iceberg.data.Record; import org.apache.iceberg.spark.SparkSchemaUtil; +import org.apache.iceberg.spark.SparkUtil; +import org.apache.iceberg.types.Type; import org.apache.spark.sql.catalyst.InternalRow; public class TestSparkFormatModel extends BaseFormatModelTests { @@ -51,4 +53,9 @@ protected void assertEquals(Schema schema, List expected, List { @@ -51,4 +53,9 @@ protected void assertEquals(Schema schema, List expected, List { @@ -51,4 +53,9 @@ protected void assertEquals(Schema schema, List expected, List { @@ -51,4 +53,9 @@ protected void assertEquals(Schema schema, List expected, List Date: Fri, 24 Apr 2026 16:57:32 -0500 Subject: [PATCH 098/197] Build: Check runtime deps baseline for all engine versions in CI (#16103) The check-runtime-deps job only validated default engine versions (Spark 4.1, Flink 2.1) because it did not enable all modules. Pass -DallModules=true so settings.gradle activates all known Spark, Flink, and Kafka versions from gradle.properties. 
--- .github/workflows/java-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/java-ci.yml b/.github/workflows/java-ci.yml index fbcabdb2f32e..3d489c574ff7 100644 --- a/.github/workflows/java-ci.yml +++ b/.github/workflows/java-ci.yml @@ -138,4 +138,4 @@ jobs: distribution: zulu java-version: 17 - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 - - run: ./gradlew checkAllRuntimeDeps -q + - run: ./gradlew checkAllRuntimeDeps -q -DallModules=true From 65faa6f350ee67361b6ef27450432f5df4f83a98 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Fri, 24 Apr 2026 18:16:07 -0400 Subject: [PATCH 099/197] Runtimes, Bundles: Add runtime-deps.txt files to track dependencies (#16081) --- aws-bundle/runtime-deps.txt | 70 ++++++ azure-bundle/runtime-deps.txt | 44 ++++ flink/v1.20/flink-runtime/runtime-deps.txt | 33 +++ flink/v2.0/flink-runtime/runtime-deps.txt | 33 +++ flink/v2.1/flink-runtime/runtime-deps.txt | 33 +++ gcp-bundle/runtime-deps.txt | 114 +++++++++ .../kafka-connect-runtime/runtime-deps.txt | 233 ++++++++++++++++++ spark/v3.4/spark-runtime/runtime-deps.txt | 40 +++ spark/v3.5/spark-runtime/runtime-deps.txt | 40 +++ spark/v4.0/spark-runtime/runtime-deps.txt | 40 +++ 10 files changed, 680 insertions(+) create mode 100644 aws-bundle/runtime-deps.txt create mode 100644 azure-bundle/runtime-deps.txt create mode 100644 flink/v1.20/flink-runtime/runtime-deps.txt create mode 100644 flink/v2.0/flink-runtime/runtime-deps.txt create mode 100644 flink/v2.1/flink-runtime/runtime-deps.txt create mode 100644 gcp-bundle/runtime-deps.txt create mode 100644 kafka-connect/kafka-connect-runtime/runtime-deps.txt create mode 100644 spark/v3.4/spark-runtime/runtime-deps.txt create mode 100644 spark/v3.5/spark-runtime/runtime-deps.txt create mode 100644 spark/v4.0/spark-runtime/runtime-deps.txt diff --git a/aws-bundle/runtime-deps.txt b/aws-bundle/runtime-deps.txt new file mode 100644 index 000000000000..4afb7bde1b32 --- 
/dev/null +++ b/aws-bundle/runtime-deps.txt @@ -0,0 +1,70 @@ +com.github.ben-manes.caffeine:caffeine:2.9.3 +com.google.errorprone:error_prone_annotations:2.10.0 +commons-codec:commons-codec:1.17.1 +commons-logging:commons-logging:1.2 +io.netty:netty-buffer:4.1.132.Final +io.netty:netty-codec-http2:4.1.132.Final +io.netty:netty-codec-http:4.1.132.Final +io.netty:netty-codec:4.1.132.Final +io.netty:netty-common:4.1.132.Final +io.netty:netty-handler:4.1.132.Final +io.netty:netty-resolver:4.1.132.Final +io.netty:netty-transport-classes-epoll:4.1.132.Final +io.netty:netty-transport-native-unix-common:4.1.132.Final +io.netty:netty-transport:4.1.132.Final +org.apache.httpcomponents:httpclient:4.5.13 +org.apache.httpcomponents:httpcore:4.4.16 +org.apache.logging.log4j:log4j-api:2.20.0 +org.apache.logging.log4j:log4j-core:2.20.0 +org.apache.logging.log4j:log4j-slf4j-impl:2.20.0 +org.checkerframework:checker-qual:3.19.0 +org.reactivestreams:reactive-streams:1.0.4 +org.slf4j:slf4j-api:2.0.17 +software.amazon.awssdk.crt:aws-crt:0.43.9 +software.amazon.awssdk:annotations:2.42.33 +software.amazon.awssdk:apache-client:2.42.33 +software.amazon.awssdk:arns:2.42.33 +software.amazon.awssdk:auth:2.42.33 +software.amazon.awssdk:aws-core:2.42.33 +software.amazon.awssdk:aws-json-protocol:2.42.33 +software.amazon.awssdk:aws-query-protocol:2.42.33 +software.amazon.awssdk:aws-xml-protocol:2.42.33 +software.amazon.awssdk:checksums-spi:2.42.33 +software.amazon.awssdk:checksums:2.42.33 +software.amazon.awssdk:cloudwatch-metric-publisher:2.42.33 +software.amazon.awssdk:cloudwatch:2.42.33 +software.amazon.awssdk:crt-core:2.42.33 +software.amazon.awssdk:dynamodb:2.42.33 +software.amazon.awssdk:endpoints-spi:2.42.33 +software.amazon.awssdk:glue:2.42.33 +software.amazon.awssdk:http-auth-aws-crt:2.42.33 +software.amazon.awssdk:http-auth-aws-eventstream:2.42.33 +software.amazon.awssdk:http-auth-aws:2.42.33 +software.amazon.awssdk:http-auth-spi:2.42.33 +software.amazon.awssdk:http-auth:2.42.33 
+software.amazon.awssdk:http-client-spi:2.42.33 +software.amazon.awssdk:iam:2.42.33 +software.amazon.awssdk:identity-spi:2.42.33 +software.amazon.awssdk:json-utils:2.42.33 +software.amazon.awssdk:kms:2.42.33 +software.amazon.awssdk:lakeformation:2.42.33 +software.amazon.awssdk:metrics-spi:2.42.33 +software.amazon.awssdk:netty-nio-client:2.42.33 +software.amazon.awssdk:profiles:2.42.33 +software.amazon.awssdk:protocol-core:2.42.33 +software.amazon.awssdk:regions:2.42.33 +software.amazon.awssdk:retries-spi:2.42.33 +software.amazon.awssdk:retries:2.42.33 +software.amazon.awssdk:s3:2.42.33 +software.amazon.awssdk:s3control:2.42.33 +software.amazon.awssdk:sdk-core:2.42.33 +software.amazon.awssdk:smithy-rpcv2-protocol:2.42.33 +software.amazon.awssdk:sso:2.42.33 +software.amazon.awssdk:sts:2.42.33 +software.amazon.awssdk:third-party-jackson-core:2.42.33 +software.amazon.awssdk:third-party-jackson-dataformat-cbor:2.42.33 +software.amazon.awssdk:utils-lite:2.42.33 +software.amazon.awssdk:utils:2.42.33 +software.amazon.eventstream:eventstream:1.0.1 +software.amazon.s3.accessgrants:aws-s3-accessgrants-java-plugin:2.4.1 +software.amazon.s3.analyticsaccelerator:analyticsaccelerator-s3:1.3.1 diff --git a/azure-bundle/runtime-deps.txt b/azure-bundle/runtime-deps.txt new file mode 100644 index 000000000000..273146654a31 --- /dev/null +++ b/azure-bundle/runtime-deps.txt @@ -0,0 +1,44 @@ +com.azure:azure-core-http-netty:1.16.3 +com.azure:azure-core:1.57.1 +com.azure:azure-identity:1.18.2 +com.azure:azure-json:1.5.1 +com.azure:azure-security-keyvault-keys:4.10.6 +com.azure:azure-storage-blob:12.33.3 +com.azure:azure-storage-common:12.32.2 +com.azure:azure-storage-file-datalake:12.26.3 +com.azure:azure-storage-internal-avro:12.18.2 +com.azure:azure-xml:1.2.1 +com.fasterxml.jackson.core:jackson-annotations:2.18.4 +com.fasterxml.jackson.core:jackson-core:2.18.4.1 +com.fasterxml.jackson.core:jackson-databind:2.18.4 +com.fasterxml.jackson.datatype:jackson-datatype-jsr310:2.18.4 
+com.microsoft.azure:msal4j-persistence-extension:1.3.0 +com.microsoft.azure:msal4j:1.23.1 +io.netty:netty-buffer:4.1.130.Final +io.netty:netty-codec-dns:4.1.128.Final +io.netty:netty-codec-http2:4.1.130.Final +io.netty:netty-codec-http:4.1.130.Final +io.netty:netty-codec-socks:4.1.130.Final +io.netty:netty-codec:4.1.130.Final +io.netty:netty-common:4.1.130.Final +io.netty:netty-handler-proxy:4.1.130.Final +io.netty:netty-handler:4.1.130.Final +io.netty:netty-resolver-dns-classes-macos:4.1.128.Final +io.netty:netty-resolver-dns-native-macos:4.1.128.Final +io.netty:netty-resolver-dns:4.1.128.Final +io.netty:netty-resolver:4.1.130.Final +io.netty:netty-tcnative-boringssl-static:2.0.74.Final +io.netty:netty-tcnative-classes:2.0.74.Final +io.netty:netty-transport-classes-epoll:4.1.130.Final +io.netty:netty-transport-classes-kqueue:4.1.130.Final +io.netty:netty-transport-native-epoll:4.1.130.Final +io.netty:netty-transport-native-kqueue:4.1.130.Final +io.netty:netty-transport-native-unix-common:4.1.130.Final +io.netty:netty-transport:4.1.130.Final +io.projectreactor.netty:reactor-netty-core:1.2.13 +io.projectreactor.netty:reactor-netty-http:1.2.13 +io.projectreactor:reactor-core:3.7.14 +net.java.dev.jna:jna-platform:5.17.0 +net.java.dev.jna:jna:5.17.0 +org.reactivestreams:reactive-streams:1.0.4 +org.slf4j:slf4j-api:2.0.17 diff --git a/flink/v1.20/flink-runtime/runtime-deps.txt b/flink/v1.20/flink-runtime/runtime-deps.txt new file mode 100644 index 000000000000..7c7aed1e4357 --- /dev/null +++ b/flink/v1.20/flink-runtime/runtime-deps.txt @@ -0,0 +1,33 @@ +com.fasterxml.jackson.core:jackson-annotations:2.21 +com.fasterxml.jackson.core:jackson-core:2.21.2 +com.fasterxml.jackson.core:jackson-databind:2.21.2 +com.github.ben-manes.caffeine:caffeine:2.9.3 +com.github.luben:zstd-jni:1.5.7-3 +com.google.errorprone:error_prone_annotations:2.10.0 +dev.failsafe:failsafe:3.3.2 +io.airlift:aircompressor:2.0.3 +io.dropwizard.metrics:metrics-core:3.2.6 +org.apache.avro:avro:1.12.1 
+org.apache.datasketches:datasketches-java:6.2.0 +org.apache.datasketches:datasketches-memory:3.0.2 +org.apache.flink:flink-metrics-dropwizard:1.20.1 +org.apache.httpcomponents.client5:httpclient5:5.6 +org.apache.httpcomponents.core5:httpcore5-h2:5.4 +org.apache.httpcomponents.core5:httpcore5:5.4 +org.apache.orc:orc-core:1.9.8 +org.apache.orc:orc-shims:1.9.8 +org.apache.parquet:parquet-avro:1.17.0 +org.apache.parquet:parquet-column:1.17.0 +org.apache.parquet:parquet-common:1.17.0 +org.apache.parquet:parquet-encoding:1.17.0 +org.apache.parquet:parquet-format-structures:1.17.0 +org.apache.parquet:parquet-hadoop:1.17.0 +org.apache.parquet:parquet-jackson:1.17.0 +org.apache.parquet:parquet-variant:1.17.0 +org.checkerframework:checker-qual:3.19.0 +org.eclipse.microprofile.openapi:microprofile-openapi-api:4.1.1 +org.locationtech.jts:jts-core:1.20.0 +org.projectnessie.nessie:nessie-client:0.107.4 +org.projectnessie.nessie:nessie-model:0.107.4 +org.roaringbitmap:RoaringBitmap:1.6.14 +org.threeten:threeten-extra:1.7.1 diff --git a/flink/v2.0/flink-runtime/runtime-deps.txt b/flink/v2.0/flink-runtime/runtime-deps.txt new file mode 100644 index 000000000000..c70e3fbba92c --- /dev/null +++ b/flink/v2.0/flink-runtime/runtime-deps.txt @@ -0,0 +1,33 @@ +com.fasterxml.jackson.core:jackson-annotations:2.21 +com.fasterxml.jackson.core:jackson-core:2.21.2 +com.fasterxml.jackson.core:jackson-databind:2.21.2 +com.github.ben-manes.caffeine:caffeine:2.9.3 +com.github.luben:zstd-jni:1.5.7-3 +com.google.errorprone:error_prone_annotations:2.10.0 +dev.failsafe:failsafe:3.3.2 +io.airlift:aircompressor:2.0.3 +io.dropwizard.metrics:metrics-core:3.2.6 +org.apache.avro:avro:1.12.1 +org.apache.datasketches:datasketches-java:6.2.0 +org.apache.datasketches:datasketches-memory:3.0.2 +org.apache.flink:flink-metrics-dropwizard:2.0.0 +org.apache.httpcomponents.client5:httpclient5:5.6 +org.apache.httpcomponents.core5:httpcore5-h2:5.4 +org.apache.httpcomponents.core5:httpcore5:5.4 
+org.apache.orc:orc-core:1.9.8 +org.apache.orc:orc-shims:1.9.8 +org.apache.parquet:parquet-avro:1.17.0 +org.apache.parquet:parquet-column:1.17.0 +org.apache.parquet:parquet-common:1.17.0 +org.apache.parquet:parquet-encoding:1.17.0 +org.apache.parquet:parquet-format-structures:1.17.0 +org.apache.parquet:parquet-hadoop:1.17.0 +org.apache.parquet:parquet-jackson:1.17.0 +org.apache.parquet:parquet-variant:1.17.0 +org.checkerframework:checker-qual:3.19.0 +org.eclipse.microprofile.openapi:microprofile-openapi-api:4.1.1 +org.locationtech.jts:jts-core:1.20.0 +org.projectnessie.nessie:nessie-client:0.107.4 +org.projectnessie.nessie:nessie-model:0.107.4 +org.roaringbitmap:RoaringBitmap:1.6.14 +org.threeten:threeten-extra:1.7.1 diff --git a/flink/v2.1/flink-runtime/runtime-deps.txt b/flink/v2.1/flink-runtime/runtime-deps.txt new file mode 100644 index 000000000000..3dfc56f15ea9 --- /dev/null +++ b/flink/v2.1/flink-runtime/runtime-deps.txt @@ -0,0 +1,33 @@ +com.fasterxml.jackson.core:jackson-annotations:2.21 +com.fasterxml.jackson.core:jackson-core:2.21.2 +com.fasterxml.jackson.core:jackson-databind:2.21.2 +com.github.ben-manes.caffeine:caffeine:2.9.3 +com.github.luben:zstd-jni:1.5.7-3 +com.google.errorprone:error_prone_annotations:2.10.0 +dev.failsafe:failsafe:3.3.2 +io.airlift:aircompressor:2.0.3 +io.dropwizard.metrics:metrics-core:3.2.6 +org.apache.avro:avro:1.12.1 +org.apache.datasketches:datasketches-java:6.2.0 +org.apache.datasketches:datasketches-memory:3.0.2 +org.apache.flink:flink-metrics-dropwizard:2.1.0 +org.apache.httpcomponents.client5:httpclient5:5.6 +org.apache.httpcomponents.core5:httpcore5-h2:5.4 +org.apache.httpcomponents.core5:httpcore5:5.4 +org.apache.orc:orc-core:1.9.8 +org.apache.orc:orc-shims:1.9.8 +org.apache.parquet:parquet-avro:1.17.0 +org.apache.parquet:parquet-column:1.17.0 +org.apache.parquet:parquet-common:1.17.0 +org.apache.parquet:parquet-encoding:1.17.0 +org.apache.parquet:parquet-format-structures:1.17.0 
+org.apache.parquet:parquet-hadoop:1.17.0 +org.apache.parquet:parquet-jackson:1.17.0 +org.apache.parquet:parquet-variant:1.17.0 +org.checkerframework:checker-qual:3.19.0 +org.eclipse.microprofile.openapi:microprofile-openapi-api:4.1.1 +org.locationtech.jts:jts-core:1.20.0 +org.projectnessie.nessie:nessie-client:0.107.4 +org.projectnessie.nessie:nessie-model:0.107.4 +org.roaringbitmap:RoaringBitmap:1.6.14 +org.threeten:threeten-extra:1.7.1 diff --git a/gcp-bundle/runtime-deps.txt b/gcp-bundle/runtime-deps.txt new file mode 100644 index 000000000000..b62c772db61a --- /dev/null +++ b/gcp-bundle/runtime-deps.txt @@ -0,0 +1,114 @@ +com.fasterxml.jackson.core:jackson-annotations:2.18.2 +com.fasterxml.jackson.core:jackson-core:2.18.2 +com.fasterxml.jackson.core:jackson-databind:2.18.2 +com.fasterxml.jackson.dataformat:jackson-dataformat-xml:2.18.2 +com.fasterxml.jackson.datatype:jackson-datatype-jsr310:2.18.2 +com.fasterxml.woodstox:woodstox-core:7.0.0 +com.google.android:annotations:4.1.1.4 +com.google.api-client:google-api-client:2.7.2 +com.google.api.grpc:gapic-google-cloud-storage-v2:2.64.1 +com.google.api.grpc:grpc-google-cloud-bigquerystorage-v1:3.24.0 +com.google.api.grpc:grpc-google-cloud-bigquerystorage-v1beta1:0.196.0 +com.google.api.grpc:grpc-google-cloud-bigquerystorage-v1beta2:0.196.0 +com.google.api.grpc:grpc-google-cloud-storage-v2:2.64.1 +com.google.api.grpc:proto-google-cloud-bigquerystorage-v1:3.24.0 +com.google.api.grpc:proto-google-cloud-bigquerystorage-v1alpha:3.24.0 +com.google.api.grpc:proto-google-cloud-bigquerystorage-v1beta1:0.196.0 +com.google.api.grpc:proto-google-cloud-bigquerystorage-v1beta2:0.196.0 +com.google.api.grpc:proto-google-cloud-bigquerystorage-v1beta:3.24.0 +com.google.api.grpc:proto-google-cloud-kms-v1:0.182.0 +com.google.api.grpc:proto-google-cloud-monitoring-v3:3.89.0 +com.google.api.grpc:proto-google-cloud-storage-v2:2.64.1 +com.google.api.grpc:proto-google-common-protos:2.67.0 +com.google.api.grpc:proto-google-iam-v1:1.62.0 
+com.google.api:api-common:2.59.0 +com.google.api:gax-grpc:2.76.0 +com.google.api:gax-httpjson:2.76.0 +com.google.api:gax:2.76.0 +com.google.apis:google-api-services-bigquery:v2-rev20251012-2.0.0 +com.google.apis:google-api-services-storage:v1-rev20260204-2.0.0 +com.google.auth:google-auth-library-credentials:1.43.0 +com.google.auth:google-auth-library-oauth2-http:1.43.0 +com.google.auto.value:auto-value-annotations:1.11.1 +com.google.cloud.gcs.analytics:client:1.2.3 +com.google.cloud.gcs.analytics:gcs-analytics-core:1.2.3 +com.google.cloud.opentelemetry:detector-resources-support:0.33.0 +com.google.cloud.opentelemetry:exporter-metrics:0.33.0 +com.google.cloud.opentelemetry:shared-resourcemapping:0.33.0 +com.google.cloud:google-cloud-bigquery:2.62.0 +com.google.cloud:google-cloud-bigquerystorage:3.24.0 +com.google.cloud:google-cloud-core-grpc:2.66.0 +com.google.cloud:google-cloud-core-http:2.66.0 +com.google.cloud:google-cloud-core:2.66.0 +com.google.cloud:google-cloud-kms:2.91.0 +com.google.cloud:google-cloud-monitoring:3.89.0 +com.google.cloud:google-cloud-storage:2.64.1 +com.google.code.findbugs:jsr305:3.0.2 +com.google.code.gson:gson:2.12.1 +com.google.errorprone:error_prone_annotations:2.42.0 +com.google.flatbuffers:flatbuffers-java:24.3.25 +com.google.guava:failureaccess:1.0.3 +com.google.guava:guava:33.5.0-jre +com.google.guava:listenablefuture:9999.0-empty-to-avoid-conflict-with-guava +com.google.http-client:google-http-client-apache-v2:2.1.0 +com.google.http-client:google-http-client-appengine:2.1.0 +com.google.http-client:google-http-client-gson:2.1.0 +com.google.http-client:google-http-client-jackson2:2.1.0 +com.google.http-client:google-http-client:2.1.0 +com.google.j2objc:j2objc-annotations:3.1 +com.google.oauth-client:google-oauth-client:1.39.0 +com.google.protobuf:protobuf-java-util:4.33.2 +com.google.protobuf:protobuf-java:4.33.2 +com.google.re2j:re2j:1.8 +commons-codec:commons-codec:1.18.0 +io.grpc:grpc-alts:1.76.3 +io.grpc:grpc-api:1.76.3 
+io.grpc:grpc-auth:1.76.3 +io.grpc:grpc-context:1.76.3 +io.grpc:grpc-core:1.76.3 +io.grpc:grpc-googleapis:1.76.3 +io.grpc:grpc-grpclb:1.76.3 +io.grpc:grpc-inprocess:1.76.3 +io.grpc:grpc-netty-shaded:1.76.3 +io.grpc:grpc-opentelemetry:1.76.3 +io.grpc:grpc-protobuf-lite:1.76.3 +io.grpc:grpc-protobuf:1.76.3 +io.grpc:grpc-rls:1.76.3 +io.grpc:grpc-services:1.76.3 +io.grpc:grpc-stub:1.76.3 +io.grpc:grpc-util:1.76.3 +io.grpc:grpc-xds:1.76.3 +io.netty:netty-buffer:4.1.110.Final +io.netty:netty-common:4.1.110.Final +io.opencensus:opencensus-api:0.31.1 +io.opencensus:opencensus-contrib-http-util:0.31.1 +io.opentelemetry.contrib:opentelemetry-gcp-resources:1.37.0-alpha +io.opentelemetry.semconv:opentelemetry-semconv:1.29.0-alpha +io.opentelemetry:opentelemetry-api:1.51.0 +io.opentelemetry:opentelemetry-context:1.51.0 +io.opentelemetry:opentelemetry-sdk-common:1.51.0 +io.opentelemetry:opentelemetry-sdk-extension-autoconfigure-spi:1.51.0 +io.opentelemetry:opentelemetry-sdk-logs:1.51.0 +io.opentelemetry:opentelemetry-sdk-metrics:1.51.0 +io.opentelemetry:opentelemetry-sdk-trace:1.51.0 +io.opentelemetry:opentelemetry-sdk:1.51.0 +io.perfmark:perfmark-api:0.27.0 +javax.annotation:javax.annotation-api:1.3.2 +org.apache.arrow:arrow-format:17.0.0 +org.apache.arrow:arrow-memory-core:17.0.0 +org.apache.arrow:arrow-memory-netty-buffer-patch:17.0.0 +org.apache.arrow:arrow-memory-netty:17.0.0 +org.apache.arrow:arrow-vector:17.0.0 +org.apache.commons:commons-lang3:3.20.0 +org.apache.httpcomponents:httpclient:4.5.14 +org.apache.httpcomponents:httpcore:4.4.16 +org.checkerframework:checker-compat-qual:2.5.6 +org.checkerframework:checker-qual:3.49.0 +org.codehaus.mojo:animal-sniffer-annotations:1.24 +org.codehaus.woodstox:stax2-api:4.2.2 +org.conscrypt:conscrypt-openjdk-uber:2.5.2 +org.json:json:20250517 +org.jspecify:jspecify:1.0.0 +org.slf4j:slf4j-api:2.0.17 +org.threeten:threeten-extra:1.8.0 +org.threeten:threetenbp:1.7.0 diff --git a/kafka-connect/kafka-connect-runtime/runtime-deps.txt 
b/kafka-connect/kafka-connect-runtime/runtime-deps.txt new file mode 100644 index 000000000000..98b7ced14217 --- /dev/null +++ b/kafka-connect/kafka-connect-runtime/runtime-deps.txt @@ -0,0 +1,233 @@ +com.azure:azure-core-http-netty:1.16.3 +com.azure:azure-core:1.57.1 +com.azure:azure-identity:1.18.2 +com.azure:azure-json:1.5.1 +com.azure:azure-storage-blob:12.33.3 +com.azure:azure-storage-common:12.32.2 +com.azure:azure-storage-file-datalake:12.26.3 +com.azure:azure-storage-internal-avro:12.18.2 +com.azure:azure-xml:1.2.1 +com.fasterxml.jackson.core:jackson-annotations:2.21 +com.fasterxml.jackson.core:jackson-core:2.21.2 +com.fasterxml.jackson.core:jackson-databind:2.21.2 +com.fasterxml.jackson.dataformat:jackson-dataformat-xml:2.21.2 +com.fasterxml.jackson.datatype:jackson-datatype-jsr310:2.21.2 +com.fasterxml.woodstox:woodstox-core:6.7.0 +com.github.ben-manes.caffeine:caffeine:2.9.3 +com.github.luben:zstd-jni:1.5.7-3 +com.github.pjfanning:jersey-json:1.22.0 +com.google.android:annotations:4.1.1.4 +com.google.api-client:google-api-client:2.7.2 +com.google.api.grpc:gapic-google-cloud-storage-v2:2.64.1 +com.google.api.grpc:grpc-google-cloud-bigquerystorage-v1:3.24.0 +com.google.api.grpc:grpc-google-cloud-bigquerystorage-v1beta1:0.196.0 +com.google.api.grpc:grpc-google-cloud-bigquerystorage-v1beta2:0.196.0 +com.google.api.grpc:grpc-google-cloud-storage-v2:2.64.1 +com.google.api.grpc:proto-google-cloud-bigquerystorage-v1:3.24.0 +com.google.api.grpc:proto-google-cloud-bigquerystorage-v1alpha:3.24.0 +com.google.api.grpc:proto-google-cloud-bigquerystorage-v1beta1:0.196.0 +com.google.api.grpc:proto-google-cloud-bigquerystorage-v1beta2:0.196.0 +com.google.api.grpc:proto-google-cloud-bigquerystorage-v1beta:3.24.0 +com.google.api.grpc:proto-google-cloud-monitoring-v3:3.89.0 +com.google.api.grpc:proto-google-cloud-storage-v2:2.64.1 +com.google.api.grpc:proto-google-common-protos:2.67.0 +com.google.api.grpc:proto-google-iam-v1:1.62.0 +com.google.api:api-common:2.59.0 
+com.google.api:gax-grpc:2.76.0 +com.google.api:gax-httpjson:2.76.0 +com.google.api:gax:2.76.0 +com.google.apis:google-api-services-bigquery:v2-rev20251012-2.0.0 +com.google.apis:google-api-services-storage:v1-rev20260204-2.0.0 +com.google.auth:google-auth-library-credentials:1.43.0 +com.google.auth:google-auth-library-oauth2-http:1.43.0 +com.google.auto.value:auto-value-annotations:1.11.0 +com.google.cloud.opentelemetry:detector-resources-support:0.33.0 +com.google.cloud.opentelemetry:exporter-metrics:0.33.0 +com.google.cloud.opentelemetry:shared-resourcemapping:0.33.0 +com.google.cloud:google-cloud-bigquery:2.62.0 +com.google.cloud:google-cloud-bigquerystorage:3.24.0 +com.google.cloud:google-cloud-core-grpc:2.66.0 +com.google.cloud:google-cloud-core-http:2.66.0 +com.google.cloud:google-cloud-core:2.66.0 +com.google.cloud:google-cloud-monitoring:3.89.0 +com.google.cloud:google-cloud-storage:2.64.1 +com.google.code.findbugs:jsr305:3.0.2 +com.google.code.gson:gson:2.12.1 +com.google.errorprone:error_prone_annotations:2.42.0 +com.google.flatbuffers:flatbuffers-java:24.3.25 +com.google.guava:failureaccess:1.0.3 +com.google.guava:guava:33.5.0-jre +com.google.guava:listenablefuture:9999.0-empty-to-avoid-conflict-with-guava +com.google.http-client:google-http-client-apache-v2:2.1.0 +com.google.http-client:google-http-client-appengine:2.1.0 +com.google.http-client:google-http-client-gson:2.1.0 +com.google.http-client:google-http-client-jackson2:2.1.0 +com.google.http-client:google-http-client:2.1.0 +com.google.j2objc:j2objc-annotations:3.1 +com.google.oauth-client:google-oauth-client:1.39.0 +com.google.protobuf:protobuf-java-util:4.33.2 +com.google.protobuf:protobuf-java:4.33.2 +com.google.re2j:re2j:1.8 +com.jcraft:jsch:0.1.55 +com.microsoft.azure:msal4j-persistence-extension:1.3.0 +com.microsoft.azure:msal4j:1.23.1 +com.sun.xml.bind:jaxb-impl:2.2.3-1 +commons-cli:commons-cli:1.9.0 +commons-codec:commons-codec:1.19.0 +commons-io:commons-io:2.20.0 
+commons-logging:commons-logging:1.2 +commons-net:commons-net:3.9.0 +commons-pool:commons-pool:1.6 +dev.failsafe:failsafe:3.3.2 +dnsjava:dnsjava:3.6.1 +io.airlift:aircompressor:2.0.3 +io.dropwizard.metrics:metrics-core:3.2.4 +io.grpc:grpc-alts:1.76.3 +io.grpc:grpc-api:1.76.3 +io.grpc:grpc-auth:1.76.3 +io.grpc:grpc-context:1.76.3 +io.grpc:grpc-core:1.76.3 +io.grpc:grpc-googleapis:1.76.3 +io.grpc:grpc-grpclb:1.76.3 +io.grpc:grpc-inprocess:1.76.3 +io.grpc:grpc-netty-shaded:1.80.0 +io.grpc:grpc-opentelemetry:1.76.3 +io.grpc:grpc-protobuf-lite:1.76.3 +io.grpc:grpc-protobuf:1.76.3 +io.grpc:grpc-rls:1.76.3 +io.grpc:grpc-services:1.76.3 +io.grpc:grpc-stub:1.76.3 +io.grpc:grpc-util:1.76.3 +io.grpc:grpc-xds:1.76.3 +io.netty:netty-buffer:4.1.132.Final +io.netty:netty-codec-dns:4.1.128.Final +io.netty:netty-codec-http2:4.1.132.Final +io.netty:netty-codec-http:4.1.132.Final +io.netty:netty-codec-socks:4.1.130.Final +io.netty:netty-codec:4.1.132.Final +io.netty:netty-common:4.1.132.Final +io.netty:netty-handler-proxy:4.1.130.Final +io.netty:netty-handler:4.1.132.Final +io.netty:netty-resolver-dns-classes-macos:4.1.128.Final +io.netty:netty-resolver-dns-native-macos:4.1.128.Final +io.netty:netty-resolver-dns:4.1.128.Final +io.netty:netty-resolver:4.1.132.Final +io.netty:netty-tcnative-boringssl-static:2.0.74.Final +io.netty:netty-tcnative-classes:2.0.74.Final +io.netty:netty-transport-classes-epoll:4.1.132.Final +io.netty:netty-transport-classes-kqueue:4.1.130.Final +io.netty:netty-transport-native-epoll:4.1.130.Final +io.netty:netty-transport-native-kqueue:4.1.130.Final +io.netty:netty-transport-native-unix-common:4.1.132.Final +io.netty:netty-transport:4.1.132.Final +io.opencensus:opencensus-api:0.31.1 +io.opencensus:opencensus-contrib-http-util:0.31.1 +io.opentelemetry.contrib:opentelemetry-gcp-resources:1.37.0-alpha +io.opentelemetry.semconv:opentelemetry-semconv:1.29.0-alpha +io.opentelemetry:opentelemetry-api:1.51.0 +io.opentelemetry:opentelemetry-context:1.51.0 
+io.opentelemetry:opentelemetry-sdk-common:1.51.0 +io.opentelemetry:opentelemetry-sdk-extension-autoconfigure-spi:1.51.0 +io.opentelemetry:opentelemetry-sdk-logs:1.51.0 +io.opentelemetry:opentelemetry-sdk-metrics:1.51.0 +io.opentelemetry:opentelemetry-sdk-trace:1.51.0 +io.opentelemetry:opentelemetry-sdk:1.51.0 +io.perfmark:perfmark-api:0.27.0 +io.projectreactor.netty:reactor-netty-core:1.2.13 +io.projectreactor.netty:reactor-netty-http:1.2.13 +io.projectreactor:reactor-core:3.7.14 +jakarta.activation:jakarta.activation-api:1.2.1 +javax.annotation:javax.annotation-api:1.3.2 +javax.servlet.jsp:jsp-api:2.1 +javax.servlet:javax.servlet-api:3.1.0 +javax.xml.bind:jaxb-api:2.2.2 +javax.xml.stream:stax-api:1.0-2 +net.java.dev.jna:jna-platform:5.17.0 +net.java.dev.jna:jna:5.17.0 +org.apache.arrow:arrow-format:17.0.0 +org.apache.arrow:arrow-memory-core:17.0.0 +org.apache.arrow:arrow-memory-netty-buffer-patch:17.0.0 +org.apache.arrow:arrow-memory-netty:17.0.0 +org.apache.arrow:arrow-vector:17.0.0 +org.apache.avro:avro:1.12.1 +org.apache.commons:commons-collections4:4.4 +org.apache.commons:commons-compress:1.28.0 +org.apache.commons:commons-lang3:3.18.0 +org.apache.commons:commons-math3:3.6.1 +org.apache.commons:commons-text:1.14.0 +org.apache.hadoop.thirdparty:hadoop-shaded-guava:1.5.0 +org.apache.hadoop.thirdparty:hadoop-shaded-protobuf_3_25:1.5.0 +org.apache.hadoop:hadoop-annotations:3.4.3 +org.apache.hadoop:hadoop-common:3.4.3 +org.apache.httpcomponents.client5:httpclient5:5.6 +org.apache.httpcomponents.core5:httpcore5-h2:5.4 +org.apache.httpcomponents.core5:httpcore5:5.4 +org.apache.httpcomponents:httpclient:4.5.14 +org.apache.httpcomponents:httpcore:4.4.16 +org.apache.orc:orc-core:1.9.8 +org.apache.orc:orc-shims:1.9.8 +org.apache.parquet:parquet-avro:1.17.0 +org.apache.parquet:parquet-column:1.17.0 +org.apache.parquet:parquet-common:1.17.0 +org.apache.parquet:parquet-encoding:1.17.0 +org.apache.parquet:parquet-format-structures:1.17.0 
+org.apache.parquet:parquet-hadoop:1.17.0 +org.apache.parquet:parquet-jackson:1.17.0 +org.apache.parquet:parquet-variant:1.17.0 +org.bouncycastle:bcprov-jdk18on:1.82 +org.checkerframework:checker-compat-qual:2.5.6 +org.checkerframework:checker-qual:3.49.0 +org.codehaus.jettison:jettison:1.5.5 +org.codehaus.mojo:animal-sniffer-annotations:1.24 +org.codehaus.woodstox:stax2-api:4.2.2 +org.conscrypt:conscrypt-openjdk-uber:2.5.2 +org.json:json:20250517 +org.locationtech.jts:jts-core:1.20.0 +org.mongodb:bson:4.11.5 +org.reactivestreams:reactive-streams:1.0.4 +org.roaringbitmap:RoaringBitmap:1.6.14 +org.slf4j:slf4j-api:2.0.17 +org.threeten:threeten-extra:1.8.0 +org.threeten:threetenbp:1.7.0 +org.xerial.snappy:snappy-java:1.1.10.8 +software.amazon.awssdk.crt:aws-crt:0.43.9 +software.amazon.awssdk:annotations:2.42.33 +software.amazon.awssdk:apache-client:2.42.33 +software.amazon.awssdk:arns:2.42.33 +software.amazon.awssdk:auth:2.42.33 +software.amazon.awssdk:aws-core:2.42.33 +software.amazon.awssdk:aws-json-protocol:2.42.33 +software.amazon.awssdk:aws-query-protocol:2.42.33 +software.amazon.awssdk:aws-xml-protocol:2.42.33 +software.amazon.awssdk:checksums-spi:2.42.33 +software.amazon.awssdk:checksums:2.42.33 +software.amazon.awssdk:crt-core:2.42.33 +software.amazon.awssdk:dynamodb:2.42.33 +software.amazon.awssdk:endpoints-spi:2.42.33 +software.amazon.awssdk:glue:2.42.33 +software.amazon.awssdk:http-auth-aws-crt:2.42.33 +software.amazon.awssdk:http-auth-aws-eventstream:2.42.33 +software.amazon.awssdk:http-auth-aws:2.42.33 +software.amazon.awssdk:http-auth-spi:2.42.33 +software.amazon.awssdk:http-auth:2.42.33 +software.amazon.awssdk:http-client-spi:2.42.33 +software.amazon.awssdk:iam:2.42.33 +software.amazon.awssdk:identity-spi:2.42.33 +software.amazon.awssdk:json-utils:2.42.33 +software.amazon.awssdk:kms:2.42.33 +software.amazon.awssdk:lakeformation:2.42.33 +software.amazon.awssdk:metrics-spi:2.42.33 +software.amazon.awssdk:netty-nio-client:2.42.33 
+software.amazon.awssdk:profiles:2.42.33 +software.amazon.awssdk:protocol-core:2.42.33 +software.amazon.awssdk:regions:2.42.33 +software.amazon.awssdk:retries-spi:2.42.33 +software.amazon.awssdk:retries:2.42.33 +software.amazon.awssdk:s3:2.42.33 +software.amazon.awssdk:sdk-core:2.42.33 +software.amazon.awssdk:sso:2.42.33 +software.amazon.awssdk:sts:2.42.33 +software.amazon.awssdk:third-party-jackson-core:2.42.33 +software.amazon.awssdk:utils-lite:2.42.33 +software.amazon.awssdk:utils:2.42.33 +software.amazon.eventstream:eventstream:1.0.1 diff --git a/spark/v3.4/spark-runtime/runtime-deps.txt b/spark/v3.4/spark-runtime/runtime-deps.txt new file mode 100644 index 000000000000..fa0b58c856c0 --- /dev/null +++ b/spark/v3.4/spark-runtime/runtime-deps.txt @@ -0,0 +1,40 @@ +com.fasterxml.jackson.core:jackson-annotations:2.21 +com.fasterxml.jackson.core:jackson-core:2.14.2 +com.fasterxml.jackson.core:jackson-databind:2.14.2 +com.fasterxml.jackson.datatype:jackson-datatype-jsr310:2.21.2 +com.github.ben-manes.caffeine:caffeine:2.9.3 +com.google.errorprone:error_prone_annotations:2.10.0 +com.google.flatbuffers:flatbuffers-java:23.5.26 +dev.failsafe:failsafe:3.3.2 +io.airlift:aircompressor:2.0.3 +io.netty:netty-buffer:4.2.12.Final +io.netty:netty-common:4.2.12.Final +org.apache.arrow:arrow-format:15.0.2 +org.apache.arrow:arrow-memory-core:15.0.2 +org.apache.arrow:arrow-memory-netty:15.0.2 +org.apache.arrow:arrow-vector:15.0.2 +org.apache.avro:avro:1.12.1 +org.apache.datasketches:datasketches-java:6.2.0 +org.apache.datasketches:datasketches-memory:3.0.2 +org.apache.httpcomponents.client5:httpclient5:5.6 +org.apache.httpcomponents.core5:httpcore5-h2:5.4 +org.apache.httpcomponents.core5:httpcore5:5.4 +org.apache.orc:orc-core:1.9.8 +org.apache.orc:orc-shims:1.9.8 +org.apache.parquet:parquet-avro:1.17.0 +org.apache.parquet:parquet-column:1.17.0 +org.apache.parquet:parquet-common:1.17.0 +org.apache.parquet:parquet-encoding:1.17.0 +org.apache.parquet:parquet-format-structures:1.17.0 
+org.apache.parquet:parquet-hadoop:1.17.0 +org.apache.parquet:parquet-jackson:1.17.0 +org.apache.parquet:parquet-variant:1.17.0 +org.checkerframework:checker-qual:3.19.0 +org.eclipse.collections:eclipse-collections-api:11.1.0 +org.eclipse.collections:eclipse-collections:11.1.0 +org.eclipse.microprofile.openapi:microprofile-openapi-api:4.1.1 +org.locationtech.jts:jts-core:1.20.0 +org.projectnessie.nessie:nessie-client:0.107.4 +org.projectnessie.nessie:nessie-model:0.107.4 +org.roaringbitmap:RoaringBitmap:1.6.14 +org.threeten:threeten-extra:1.7.1 diff --git a/spark/v3.5/spark-runtime/runtime-deps.txt b/spark/v3.5/spark-runtime/runtime-deps.txt new file mode 100644 index 000000000000..9a087517cbb0 --- /dev/null +++ b/spark/v3.5/spark-runtime/runtime-deps.txt @@ -0,0 +1,40 @@ +com.fasterxml.jackson.core:jackson-annotations:2.21 +com.fasterxml.jackson.core:jackson-core:2.15.2 +com.fasterxml.jackson.core:jackson-databind:2.15.2 +com.fasterxml.jackson.datatype:jackson-datatype-jsr310:2.21.2 +com.github.ben-manes.caffeine:caffeine:2.9.3 +com.google.errorprone:error_prone_annotations:2.10.0 +com.google.flatbuffers:flatbuffers-java:23.5.26 +dev.failsafe:failsafe:3.3.2 +io.airlift:aircompressor:2.0.3 +io.netty:netty-buffer:4.2.12.Final +io.netty:netty-common:4.2.12.Final +org.apache.arrow:arrow-format:15.0.2 +org.apache.arrow:arrow-memory-core:15.0.2 +org.apache.arrow:arrow-memory-netty:15.0.2 +org.apache.arrow:arrow-vector:15.0.2 +org.apache.avro:avro:1.12.1 +org.apache.datasketches:datasketches-java:6.2.0 +org.apache.datasketches:datasketches-memory:3.0.2 +org.apache.httpcomponents.client5:httpclient5:5.6 +org.apache.httpcomponents.core5:httpcore5-h2:5.4 +org.apache.httpcomponents.core5:httpcore5:5.4 +org.apache.orc:orc-core:1.9.8 +org.apache.orc:orc-shims:1.9.8 +org.apache.parquet:parquet-avro:1.17.0 +org.apache.parquet:parquet-column:1.17.0 +org.apache.parquet:parquet-common:1.17.0 +org.apache.parquet:parquet-encoding:1.17.0 
+org.apache.parquet:parquet-format-structures:1.17.0 +org.apache.parquet:parquet-hadoop:1.17.0 +org.apache.parquet:parquet-jackson:1.17.0 +org.apache.parquet:parquet-variant:1.17.0 +org.checkerframework:checker-qual:3.19.0 +org.eclipse.collections:eclipse-collections-api:11.1.0 +org.eclipse.collections:eclipse-collections:11.1.0 +org.eclipse.microprofile.openapi:microprofile-openapi-api:4.1.1 +org.locationtech.jts:jts-core:1.20.0 +org.projectnessie.nessie:nessie-client:0.107.4 +org.projectnessie.nessie:nessie-model:0.107.4 +org.roaringbitmap:RoaringBitmap:1.6.14 +org.threeten:threeten-extra:1.7.1 diff --git a/spark/v4.0/spark-runtime/runtime-deps.txt b/spark/v4.0/spark-runtime/runtime-deps.txt new file mode 100644 index 000000000000..9a087517cbb0 --- /dev/null +++ b/spark/v4.0/spark-runtime/runtime-deps.txt @@ -0,0 +1,40 @@ +com.fasterxml.jackson.core:jackson-annotations:2.21 +com.fasterxml.jackson.core:jackson-core:2.15.2 +com.fasterxml.jackson.core:jackson-databind:2.15.2 +com.fasterxml.jackson.datatype:jackson-datatype-jsr310:2.21.2 +com.github.ben-manes.caffeine:caffeine:2.9.3 +com.google.errorprone:error_prone_annotations:2.10.0 +com.google.flatbuffers:flatbuffers-java:23.5.26 +dev.failsafe:failsafe:3.3.2 +io.airlift:aircompressor:2.0.3 +io.netty:netty-buffer:4.2.12.Final +io.netty:netty-common:4.2.12.Final +org.apache.arrow:arrow-format:15.0.2 +org.apache.arrow:arrow-memory-core:15.0.2 +org.apache.arrow:arrow-memory-netty:15.0.2 +org.apache.arrow:arrow-vector:15.0.2 +org.apache.avro:avro:1.12.1 +org.apache.datasketches:datasketches-java:6.2.0 +org.apache.datasketches:datasketches-memory:3.0.2 +org.apache.httpcomponents.client5:httpclient5:5.6 +org.apache.httpcomponents.core5:httpcore5-h2:5.4 +org.apache.httpcomponents.core5:httpcore5:5.4 +org.apache.orc:orc-core:1.9.8 +org.apache.orc:orc-shims:1.9.8 +org.apache.parquet:parquet-avro:1.17.0 +org.apache.parquet:parquet-column:1.17.0 +org.apache.parquet:parquet-common:1.17.0 
+org.apache.parquet:parquet-encoding:1.17.0 +org.apache.parquet:parquet-format-structures:1.17.0 +org.apache.parquet:parquet-hadoop:1.17.0 +org.apache.parquet:parquet-jackson:1.17.0 +org.apache.parquet:parquet-variant:1.17.0 +org.checkerframework:checker-qual:3.19.0 +org.eclipse.collections:eclipse-collections-api:11.1.0 +org.eclipse.collections:eclipse-collections:11.1.0 +org.eclipse.microprofile.openapi:microprofile-openapi-api:4.1.1 +org.locationtech.jts:jts-core:1.20.0 +org.projectnessie.nessie:nessie-client:0.107.4 +org.projectnessie.nessie:nessie-model:0.107.4 +org.roaringbitmap:RoaringBitmap:1.6.14 +org.threeten:threeten-extra:1.7.1 From f29a182eccc9287764a511767a79f741eb4135af Mon Sep 17 00:00:00 2001 From: Ryan Blue Date: Fri, 24 Apr 2026 17:06:49 -0700 Subject: [PATCH 100/197] GCP Bundle: Remove JSR 305 (#16106) --- gcp-bundle/LICENSE | 7 ------- gcp-bundle/build.gradle | 6 ++++++ gcp-bundle/runtime-deps.txt | 1 - 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/gcp-bundle/LICENSE b/gcp-bundle/LICENSE index d6d61ae4c578..10c87d69c720 100644 --- a/gcp-bundle/LICENSE +++ b/gcp-bundle/LICENSE @@ -380,13 +380,6 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Findbugs jsr305. - -Project URL: http://findbugs.sourceforge.net/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - This product bundles Google Error Prone Annotations. 
Project URL: https://github.com/google/error-prone diff --git a/gcp-bundle/build.gradle b/gcp-bundle/build.gradle index 1f6642c9b2ce..d48763b5e366 100644 --- a/gcp-bundle/build.gradle +++ b/gcp-bundle/build.gradle @@ -23,6 +23,12 @@ project(":iceberg-gcp-bundle") { tasks.jar.dependsOn tasks.shadowJar + configurations { + implementation { + exclude group: 'com.google.code.findbugs', module: 'jsr305' + } + } + dependencies { implementation platform(libs.google.libraries.bom) implementation "com.google.cloud:google-cloud-storage" diff --git a/gcp-bundle/runtime-deps.txt b/gcp-bundle/runtime-deps.txt index b62c772db61a..2c3331f5261b 100644 --- a/gcp-bundle/runtime-deps.txt +++ b/gcp-bundle/runtime-deps.txt @@ -43,7 +43,6 @@ com.google.cloud:google-cloud-core:2.66.0 com.google.cloud:google-cloud-kms:2.91.0 com.google.cloud:google-cloud-monitoring:3.89.0 com.google.cloud:google-cloud-storage:2.64.1 -com.google.code.findbugs:jsr305:3.0.2 com.google.code.gson:gson:2.12.1 com.google.errorprone:error_prone_annotations:2.42.0 com.google.flatbuffers:flatbuffers-java:24.3.25 From 3d4c6e00d1864746d99791f160a64b70dbaaa6af Mon Sep 17 00:00:00 2001 From: M Alvee Date: Sat, 25 Apr 2026 05:00:01 +0100 Subject: [PATCH 101/197] test: add ns1/ns2 to RCK view test namespace purge list (#16050) --- .../java/org/apache/iceberg/rest/RCKUtils.java | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/open-api/src/testFixtures/java/org/apache/iceberg/rest/RCKUtils.java b/open-api/src/testFixtures/java/org/apache/iceberg/rest/RCKUtils.java index 4bd060d788a7..bfdcfc8a4bd9 100644 --- a/open-api/src/testFixtures/java/org/apache/iceberg/rest/RCKUtils.java +++ b/open-api/src/testFixtures/java/org/apache/iceberg/rest/RCKUtils.java @@ -37,7 +37,13 @@ class RCKUtils { static final String RCK_LOCAL = "rck.local"; static final String RCK_PURGE_TEST_NAMESPACES = "rck.purge-test-namespaces"; - static final List TEST_NAMESPACES = List.of(Namespace.of("ns"), Namespace.of("newdb")); + 
static final List TEST_NAMESPACES = + List.of( + Namespace.of("ns"), + Namespace.of("newdb"), + Namespace.of("ns1"), + Namespace.of("ns2"), + Namespace.of("other_ns")); private RCKUtils() {} From f7ca134d5f75635709640713fb5c8d4962a1e50f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 25 Apr 2026 23:48:06 -0700 Subject: [PATCH 102/197] Build: Bump zizmorcore/zizmor-action from 0.5.2 to 0.5.3 (#16122) Bumps [zizmorcore/zizmor-action](https://github.com/zizmorcore/zizmor-action) from 0.5.2 to 0.5.3. - [Release notes](https://github.com/zizmorcore/zizmor-action/releases) - [Commits](https://github.com/zizmorcore/zizmor-action/compare/71321a20a9ded102f6e9ce5718a2fcec2c4f70d8...b1d7e1fb5de872772f31590499237e7cce841e8e) --- updated-dependencies: - dependency-name: zizmorcore/zizmor-action dependency-version: 0.5.3 dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/zizmor.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/zizmor.yml b/.github/workflows/zizmor.yml index 02d49c5473ab..0df5e1362ac5 100644 --- a/.github/workflows/zizmor.yml +++ b/.github/workflows/zizmor.yml @@ -39,7 +39,7 @@ jobs: persist-credentials: false - name: Run zizmor 🌈 - uses: zizmorcore/zizmor-action@71321a20a9ded102f6e9ce5718a2fcec2c4f70d8 # v0.5.2 + uses: zizmorcore/zizmor-action@b1d7e1fb5de872772f31590499237e7cce841e8e # v0.5.3 with: advanced-security: false min-severity: medium From dd93aacb6bc5a7b329e16c3e9a102fb56ac8638e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 25 Apr 2026 23:48:21 -0700 Subject: [PATCH 103/197] Build: Bump astral-sh/setup-uv from 8.0.0 to 8.1.0 (#16121) Bumps [astral-sh/setup-uv](https://github.com/astral-sh/setup-uv) from 8.0.0 to 8.1.0. 
- [Release notes](https://github.com/astral-sh/setup-uv/releases) - [Commits](https://github.com/astral-sh/setup-uv/compare/cec208311dfd045dd5311c1add060b2062131d57...08807647e7069bb48b6ef5acd8ec9567f424441b) --- updated-dependencies: - dependency-name: astral-sh/setup-uv dependency-version: 8.1.0 dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/open-api.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/open-api.yml b/.github/workflows/open-api.yml index 2d58d0dcf023..fdc5bcda679e 100644 --- a/.github/workflows/open-api.yml +++ b/.github/workflows/open-api.yml @@ -48,7 +48,7 @@ jobs: with: persist-credentials: false - name: Install uv - uses: astral-sh/setup-uv@cec208311dfd045dd5311c1add060b2062131d57 # v8.0.0 + uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b # v8.1.0 with: enable-cache: false - name: Install dependencies From 180e399e195f0a264d6ce7298685ad3ecd13338a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 25 Apr 2026 23:48:38 -0700 Subject: [PATCH 104/197] Build: Bump org.xerial:sqlite-jdbc from 3.51.3.0 to 3.53.0.0 (#16120) Bumps [org.xerial:sqlite-jdbc](https://github.com/xerial/sqlite-jdbc) from 3.51.3.0 to 3.53.0.0. - [Release notes](https://github.com/xerial/sqlite-jdbc/releases) - [Changelog](https://github.com/xerial/sqlite-jdbc/blob/master/CHANGELOG) - [Commits](https://github.com/xerial/sqlite-jdbc/compare/3.51.3.0...3.53.0.0) --- updated-dependencies: - dependency-name: org.xerial:sqlite-jdbc dependency-version: 3.53.0.0 dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- gradle/libs.versions.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index 87ceb9012bd6..55ce8b99e830 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -89,7 +89,7 @@ spark34 = "3.4.4" spark35 = "3.5.8" spark40 = "4.0.2" spark41 = "4.1.1" -sqlite-jdbc = "3.51.3.0" +sqlite-jdbc = "3.53.0.0" testcontainers = "2.0.4" tez08 = { strictly = "0.8.4"} # see rich version usage explanation above From 22918cf30ce51c54032f55fa5989bda53a5870b8 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 25 Apr 2026 23:49:02 -0700 Subject: [PATCH 105/197] Build: Bump github/codeql-action from 4.35.1 to 4.35.2 (#16118) Bumps [github/codeql-action](https://github.com/github/codeql-action) from 4.35.1 to 4.35.2. - [Release notes](https://github.com/github/codeql-action/releases) - [Changelog](https://github.com/github/codeql-action/blob/main/CHANGELOG.md) - [Commits](https://github.com/github/codeql-action/compare/c10b8064de6f491fea524254123dbe5e09572f13...95e58e9a2cdfd71adc6e0353d5c52f41a045d225) --- updated-dependencies: - dependency-name: github/codeql-action dependency-version: 4.35.2 dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/codeql.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index fe0459aeb76f..7e9c8208c888 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -46,11 +46,11 @@ jobs: persist-credentials: false - name: Initialize CodeQL - uses: github/codeql-action/init@c10b8064de6f491fea524254123dbe5e09572f13 # v4.35.1 + uses: github/codeql-action/init@95e58e9a2cdfd71adc6e0353d5c52f41a045d225 # v4.35.2 with: languages: actions - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@c10b8064de6f491fea524254123dbe5e09572f13 # v4.35.1 + uses: github/codeql-action/analyze@95e58e9a2cdfd71adc6e0353d5c52f41a045d225 # v4.35.2 with: category: "/language:actions" From c213f5e96e2bd9ae41e5d85e5493ec5a3d74c290 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 25 Apr 2026 23:49:16 -0700 Subject: [PATCH 106/197] Build: Bump bouncycastle from 1.82 to 1.84 (#16117) Bumps `bouncycastle` from 1.82 to 1.84. 
Updates `org.bouncycastle:bcpkix-jdk18on` from 1.82 to 1.84 - [Changelog](https://github.com/bcgit/bc-java/blob/main/docs/releasenotes.html) - [Commits](https://github.com/bcgit/bc-java/commits) Updates `org.bouncycastle:bcprov-jdk18on` from 1.82 to 1.84 - [Changelog](https://github.com/bcgit/bc-java/blob/main/docs/releasenotes.html) - [Commits](https://github.com/bcgit/bc-java/commits) Updates `org.bouncycastle:bcutil-jdk18on` from 1.82 to 1.84 - [Changelog](https://github.com/bcgit/bc-java/blob/main/docs/releasenotes.html) - [Commits](https://github.com/bcgit/bc-java/commits) --- updated-dependencies: - dependency-name: org.bouncycastle:bcpkix-jdk18on dependency-version: '1.84' dependency-type: direct:production update-type: version-update:semver-minor - dependency-name: org.bouncycastle:bcprov-jdk18on dependency-version: '1.84' dependency-type: direct:production update-type: version-update:semver-minor - dependency-name: org.bouncycastle:bcutil-jdk18on dependency-version: '1.84' dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- gradle/libs.versions.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index 55ce8b99e830..42b2e033c97c 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -36,7 +36,7 @@ awaitility = "4.3.0" awssdk-bom = "2.42.33" azuresdk-bom = "1.3.6" awssdk-s3accessgrants = "2.4.1" -bouncycastle = "1.82" +bouncycastle = "1.84" bson-ver = "4.11.5" caffeine = "2.9.3" calcite = "1.41.0" From 5acbb7a5a75ed5cccc289dae7336c0731be90597 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 25 Apr 2026 23:49:32 -0700 Subject: [PATCH 107/197] Build: Bump guava from 33.5.0-jre to 33.6.0-jre (#16116) Bumps `guava` from 33.5.0-jre to 33.6.0-jre. 
Updates `com.google.guava:guava` from 33.5.0-jre to 33.6.0-jre - [Release notes](https://github.com/google/guava/releases) - [Commits](https://github.com/google/guava/commits) Updates `com.google.guava:guava-testlib` from 33.5.0-jre to 33.6.0-jre - [Release notes](https://github.com/google/guava/releases) - [Commits](https://github.com/google/guava/commits) --- updated-dependencies: - dependency-name: com.google.guava:guava dependency-version: 33.6.0-jre dependency-type: direct:production update-type: version-update:semver-minor - dependency-name: com.google.guava:guava-testlib dependency-version: 33.6.0-jre dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- gradle/libs.versions.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index 42b2e033c97c..fad41cf94a23 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -53,7 +53,7 @@ flink20 = { strictly = "2.0.0"} flink21 = { strictly = "2.1.0"} google-libraries-bom = "26.79.0" gcs-analytics-core = "1.2.3" -guava = "33.5.0-jre" +guava = "33.6.0-jre" hadoop3 = "3.4.3" httpcomponents-httpclient5 = "5.6" hive2 = { strictly = "2.3.10"} # see rich version usage explanation above From bd7096ee0699f6dc28f23ea0af84effdf14891de Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 25 Apr 2026 23:49:56 -0700 Subject: [PATCH 108/197] Build: Bump mkdocs-rss-plugin from 1.18.1 to 1.19.0 (#16113) Bumps [mkdocs-rss-plugin](https://github.com/guts/mkdocs-rss-plugin) from 1.18.1 to 1.19.0. 
- [Release notes](https://github.com/guts/mkdocs-rss-plugin/releases) - [Changelog](https://github.com/Guts/mkdocs-rss-plugin/blob/main/CHANGELOG.md) - [Commits](https://github.com/guts/mkdocs-rss-plugin/compare/1.18.1...1.19.0) --- updated-dependencies: - dependency-name: mkdocs-rss-plugin dependency-version: 1.19.0 dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- site/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/site/requirements.txt b/site/requirements.txt index 359d2904619e..e21e4c8f1b5c 100644 --- a/site/requirements.txt +++ b/site/requirements.txt @@ -21,5 +21,5 @@ mkdocs-material==9.7.5 mkdocs-material-extensions==1.3.1 mkdocs-monorepo-plugin @ git+https://github.com/bitsondatadev/mkdocs-monorepo-plugin@url-fix mkdocs-redirects==1.2.3 -mkdocs-rss-plugin==1.18.1 +mkdocs-rss-plugin==1.19.0 pymarkdownlnt==0.9.36 From bac514ab4e72f0a75bf045c63a66009a0115022e Mon Sep 17 00:00:00 2001 From: Ryan Blue Date: Sun, 26 Apr 2026 09:58:51 -0700 Subject: [PATCH 109/197] Flink 2.1: Remove flink-metrics-dropwizard from runtime (#16093) * Flink 2.1: Remove flink-metrics-dropwizard from runtime. * Flink 2.1: Update runtime-deps.txt. 
--- flink/v2.1/build.gradle | 3 --- flink/v2.1/flink-runtime/runtime-deps.txt | 2 -- 2 files changed, 5 deletions(-) diff --git a/flink/v2.1/build.gradle b/flink/v2.1/build.gradle index a08cb1d5ebdd..53f87f27aa67 100644 --- a/flink/v2.1/build.gradle +++ b/flink/v2.1/build.gradle @@ -169,9 +169,6 @@ project(":iceberg-flink:iceberg-flink-runtime-${flinkMajorVersion}") { exclude group: 'com.google.code.findbugs', module: 'jsr305' } - // for dropwizard histogram metrics implementation - implementation libs.flink21.metrics.dropwizard - // for integration testing with the flink-runtime-jar // all of those dependencies are required because the integration test extends FlinkTestBase integrationCompileOnly project(':iceberg-api') diff --git a/flink/v2.1/flink-runtime/runtime-deps.txt b/flink/v2.1/flink-runtime/runtime-deps.txt index 3dfc56f15ea9..00c53ed388d0 100644 --- a/flink/v2.1/flink-runtime/runtime-deps.txt +++ b/flink/v2.1/flink-runtime/runtime-deps.txt @@ -6,11 +6,9 @@ com.github.luben:zstd-jni:1.5.7-3 com.google.errorprone:error_prone_annotations:2.10.0 dev.failsafe:failsafe:3.3.2 io.airlift:aircompressor:2.0.3 -io.dropwizard.metrics:metrics-core:3.2.6 org.apache.avro:avro:1.12.1 org.apache.datasketches:datasketches-java:6.2.0 org.apache.datasketches:datasketches-memory:3.0.2 -org.apache.flink:flink-metrics-dropwizard:2.1.0 org.apache.httpcomponents.client5:httpclient5:5.6 org.apache.httpcomponents.core5:httpcore5-h2:5.4 org.apache.httpcomponents.core5:httpcore5:5.4 From afb7519eb5d99bef089f535f876d8bae538369e7 Mon Sep 17 00:00:00 2001 From: Ryan Blue Date: Sun, 26 Apr 2026 23:30:12 -0700 Subject: [PATCH 110/197] AWS Bundle: Exclude logging dependencies (#16105) * AWS Bundle: Exclude log4j. * AWS Bundle: Remove logging Jars from runtime-deps.txt. 
--- aws-bundle/build.gradle | 14 ++++++++------ aws-bundle/runtime-deps.txt | 4 ---- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/aws-bundle/build.gradle b/aws-bundle/build.gradle index c891ac5b439c..541d5ae7a541 100644 --- a/aws-bundle/build.gradle +++ b/aws-bundle/build.gradle @@ -23,6 +23,14 @@ project(":iceberg-aws-bundle") { tasks.jar.dependsOn tasks.shadowJar + configurations { + implementation { + exclude group: 'org.slf4j' + exclude group: 'org.apache.logging.slf4j' + exclude group: 'org.apache.logging.log4j' + } + } + dependencies { implementation platform(libs.awssdk.bom) implementation libs.awssdk.s3accessgrants @@ -52,12 +60,6 @@ project(":iceberg-aws-bundle") { include 'NOTICE' } - dependencies { - exclude(dependency('org.slf4j:.*')) - exclude(dependency('org.apache.logging.log4j:.*')) - exclude(dependency('org.apache.logging.slf4j:.*')) - } - // relocate AWS-specific versions relocate 'org.apache.http', 'org.apache.iceberg.aws.shaded.org.apache.http' relocate 'io.netty', 'org.apache.iceberg.aws.shaded.io.netty' diff --git a/aws-bundle/runtime-deps.txt b/aws-bundle/runtime-deps.txt index 4afb7bde1b32..fc2514d5373d 100644 --- a/aws-bundle/runtime-deps.txt +++ b/aws-bundle/runtime-deps.txt @@ -14,12 +14,8 @@ io.netty:netty-transport-native-unix-common:4.1.132.Final io.netty:netty-transport:4.1.132.Final org.apache.httpcomponents:httpclient:4.5.13 org.apache.httpcomponents:httpcore:4.4.16 -org.apache.logging.log4j:log4j-api:2.20.0 -org.apache.logging.log4j:log4j-core:2.20.0 -org.apache.logging.log4j:log4j-slf4j-impl:2.20.0 org.checkerframework:checker-qual:3.19.0 org.reactivestreams:reactive-streams:1.0.4 -org.slf4j:slf4j-api:2.0.17 software.amazon.awssdk.crt:aws-crt:0.43.9 software.amazon.awssdk:annotations:2.42.33 software.amazon.awssdk:apache-client:2.42.33 From 2a615803ef9b05d00372bd20474a06e0013b8da9 Mon Sep 17 00:00:00 2001 From: Eduard Tudenhoefner Date: Mon, 27 Apr 2026 10:13:52 +0200 Subject: [PATCH 111/197] Spark 4.1: 
Parameterize TestDeleteFrom with format-version (#16098) --- .../iceberg/spark/sql/TestDeleteFrom.java | 65 ++++++++++++++++--- 1 file changed, 55 insertions(+), 10 deletions(-) diff --git a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestDeleteFrom.java b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestDeleteFrom.java index 02c5ecd66b80..0d010087cd8b 100644 --- a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestDeleteFrom.java +++ b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestDeleteFrom.java @@ -20,12 +20,20 @@ import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatThrownBy; +import static org.assertj.core.api.Assumptions.assumeThat; import java.util.List; +import java.util.Map; +import org.apache.iceberg.Parameter; import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.RowLevelOperationMode; import org.apache.iceberg.SnapshotSummary; import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.TestHelpers; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.spark.CatalogTestBase; import org.apache.iceberg.spark.source.SimpleRecord; @@ -38,14 +46,41 @@ @ExtendWith(ParameterizedTestExtension.class) public class TestDeleteFrom extends CatalogTestBase { + @Parameter(index = 3) + private int formatVersion; + + @Parameters(name = "catalogName = {0}, implementation = {1}, config = {2}, formatVersion = {3}") + protected static Object[][] parameters() { + List parameters = Lists.newArrayList(); + for (Object[] catalogParams : CatalogTestBase.parameters()) { + for (int version : TestHelpers.V2_AND_ABOVE) { + parameters.add( + new Object[] 
{catalogParams[0], catalogParams[1], catalogParams[2], version}); + } + } + + return parameters.toArray(new Object[0][]); + } + @AfterEach public void removeTables() { sql("DROP TABLE IF EXISTS %s", tableName); } + private String tableProperties() { + return tableProperties(ImmutableMap.of()); + } + + private String tableProperties(Map additionalProperties) { + ImmutableMap.Builder builder = ImmutableMap.builder(); + builder.putAll(additionalProperties); + builder.put(TableProperties.FORMAT_VERSION, String.valueOf(formatVersion)); + return String.format("TBLPROPERTIES (%s)", tablePropsAsString(builder.buildKeepingLast())); + } + @TestTemplate public void testDeleteFromUnpartitionedTable() throws NoSuchTableException { - sql("CREATE TABLE %s (id bigint, data string) USING iceberg", tableName); + sql("CREATE TABLE %s (id bigint, data string) USING iceberg %s", tableName, tableProperties()); List records = Lists.newArrayList( @@ -75,7 +110,7 @@ public void testDeleteFromUnpartitionedTable() throws NoSuchTableException { @TestTemplate public void testDeleteFromTableAtSnapshot() throws NoSuchTableException { - sql("CREATE TABLE %s (id bigint, data string) USING iceberg", tableName); + sql("CREATE TABLE %s (id bigint, data string) USING iceberg %s", tableName, tableProperties()); List records = Lists.newArrayList( @@ -95,8 +130,9 @@ public void testDeleteFromPartitionedTable() throws NoSuchTableException { sql( "CREATE TABLE %s (id bigint, data string) " + "USING iceberg " - + "PARTITIONED BY (truncate(id, 2))", - tableName); + + "PARTITIONED BY (truncate(id, 2)) " + + "%s", + tableName, tableProperties()); List records = Lists.newArrayList( @@ -125,7 +161,9 @@ public void testDeleteFromPartitionedTable() throws NoSuchTableException { @TestTemplate public void testDeleteFromWhereFalse() { - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg %s", + tableName, 
tableProperties()); sql("INSERT INTO TABLE %s VALUES (1, 'a'), (2, 'b'), (3, 'c')", tableName); assertEquals( @@ -145,7 +183,9 @@ public void testDeleteFromWhereFalse() { @TestTemplate public void testTruncate() { - sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName); + sql( + "CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg %s", + tableName, tableProperties()); sql("INSERT INTO TABLE %s VALUES (1, 'a'), (2, 'b'), (3, 'c')", tableName); assertEquals( @@ -167,8 +207,8 @@ public void testTruncate() { @TestTemplate public void testDeleteFromTablePartitionedByVarbinary() { sql( - "CREATE TABLE %s (id bigint NOT NULL, data binary) USING iceberg PARTITIONED BY (data)", - tableName); + "CREATE TABLE %s (id bigint NOT NULL, data binary) USING iceberg PARTITIONED BY (data) %s", + tableName, tableProperties()); sql("INSERT INTO TABLE %s VALUES(1, X'e3bcd1'), (2, X'bcd1')", tableName); assertEquals( @@ -189,9 +229,14 @@ public void testDeleteFromTablePartitionedByVarbinary() { @TestTemplate public void truncateWithDVs() throws NoSuchTableException { + assumeThat(formatVersion).isGreaterThanOrEqualTo(3); + sql( - "CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg TBLPROPERTIES ('format-version'='3','write.delete.mode'='merge-on-read')", - tableName); + "CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg %s", + tableName, + tableProperties( + ImmutableMap.of( + TableProperties.DELETE_MODE, RowLevelOperationMode.MERGE_ON_READ.modeName()))); List records = ImmutableList.of( new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c")); From 1b733ed22c8f47965184a48017c5941014def4cf Mon Sep 17 00:00:00 2001 From: Manu Zhang Date: Mon, 27 Apr 2026 19:53:38 +0800 Subject: [PATCH 112/197] Core: Fix RejectedExecutionException in InMemoryLockManager when multiple catalogs share default lock manager (#15862) --- .../org/apache/iceberg/util/LockManagers.java | 21 ++++++++----------- 1 file changed, 9 
insertions(+), 12 deletions(-) diff --git a/core/src/main/java/org/apache/iceberg/util/LockManagers.java b/core/src/main/java/org/apache/iceberg/util/LockManagers.java index 96622cb57f83..561d0a8975dd 100644 --- a/core/src/main/java/org/apache/iceberg/util/LockManagers.java +++ b/core/src/main/java/org/apache/iceberg/util/LockManagers.java @@ -18,11 +18,9 @@ */ package org.apache.iceberg.util; -import java.util.List; import java.util.Map; import java.util.Optional; import java.util.concurrent.Executors; -import java.util.concurrent.Future; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.ScheduledFuture; import java.util.concurrent.ScheduledThreadPoolExecutor; @@ -108,6 +106,11 @@ public int heartbeatThreads() { return heartbeatThreads; } + /** + * Returns the shared scheduler for lock heartbeats. + * + *
<p>
Callers must not shut down this scheduler. It is shared across lock manager instances. + */ public ScheduledExecutorService scheduler() { if (scheduler == null) { synchronized (BaseLockManager.class) { @@ -159,16 +162,10 @@ public void initialize(Map properties) { @Override public void close() throws Exception { - if (scheduler != null) { - List tasks = scheduler.shutdownNow(); - tasks.forEach( - task -> { - if (task instanceof Future) { - ((Future) task).cancel(true); - } - }); - scheduler = null; - } + // The scheduler is a shared static resource used across all BaseLockManager instances. + // Individual instances must not shut it down, as other instances may still be using it. + // The scheduler uses daemon threads and will be terminated at JVM exit by the shutdown + // hook registered via MoreExecutors.getExitingScheduledExecutorService. } } From b809dcd770d3c10cc6d81b70dc198422749cfa0e Mon Sep 17 00:00:00 2001 From: Dmitriy Avseitsev Date: Mon, 27 Apr 2026 14:57:29 +0200 Subject: [PATCH 113/197] Core, Catalogs: Add support for unique table locations via catalog property (#12892) --- .../apache/iceberg/aws/glue/GlueTestBase.java | 11 ++ .../aws/glue/TestGlueCatalogTable.java | 19 ++- .../iceberg/aws/dynamodb/DynamoDbCatalog.java | 22 ++- .../apache/iceberg/aws/glue/GlueCatalog.java | 28 +++- .../aws/dynamodb/TestDynamoDbCatalog.java | 60 +++++++- .../iceberg/aws/glue/TestGlueCatalog.java | 22 +++ .../gcp/bigquery/TestBigQueryCatalog.java | 13 ++ .../org/apache/iceberg/CatalogProperties.java | 9 ++ .../iceberg/inmemory/InMemoryCatalog.java | 12 +- .../org/apache/iceberg/jdbc/JdbcCatalog.java | 9 +- .../org/apache/iceberg/util/LocationUtil.java | 24 ++++ .../apache/iceberg/catalog/CatalogTests.java | 82 +++++++++++ .../apache/iceberg/rest/TestRESTCatalog.java | 13 +- .../apache/iceberg/dell/ecs/EcsCatalog.java | 12 +- .../iceberg/dell/ecs/TestEcsCatalog.java | 34 ++++- docs/docs/configuration.md | 1 + .../org/apache/iceberg/hive/HiveCatalog.java | 13 +- 
.../RESTCompatibilityKitCatalogTests.java | 8 ++ .../iceberg/spark/SparkCatalogConfig.java | 18 ++- .../spark/sql/TestUniqueTableLocation.java | 132 ++++++++++++++++++ 20 files changed, 514 insertions(+), 28 deletions(-) create mode 100644 spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestUniqueTableLocation.java diff --git a/aws/src/integration/java/org/apache/iceberg/aws/glue/GlueTestBase.java b/aws/src/integration/java/org/apache/iceberg/aws/glue/GlueTestBase.java index 65e37eba4cd3..b02537bf40b2 100644 --- a/aws/src/integration/java/org/apache/iceberg/aws/glue/GlueTestBase.java +++ b/aws/src/integration/java/org/apache/iceberg/aws/glue/GlueTestBase.java @@ -65,6 +65,7 @@ public class GlueTestBase { // iceberg static GlueCatalog glueCatalog; static GlueCatalog glueCatalogWithSkipNameValidation; + static GlueCatalog glueCatalogWithUniqueLocation; static Schema schema = new Schema(Types.NestedField.required(1, "c1", Types.StringType.get(), "c1")); @@ -105,6 +106,16 @@ public static void beforeClass() { GLUE, null, ImmutableMap.of()); + + glueCatalogWithUniqueLocation = new GlueCatalog(); + glueCatalogWithUniqueLocation.initialize( + CATALOG_NAME, + TEST_BUCKET_PATH, + awsProperties, + s3FileIOProperties, + GLUE, + null, + true /* uniqTableLocation */); } @AfterAll diff --git a/aws/src/integration/java/org/apache/iceberg/aws/glue/TestGlueCatalogTable.java b/aws/src/integration/java/org/apache/iceberg/aws/glue/TestGlueCatalogTable.java index 2c9459c5e36c..cb015b79fb9b 100644 --- a/aws/src/integration/java/org/apache/iceberg/aws/glue/TestGlueCatalogTable.java +++ b/aws/src/integration/java/org/apache/iceberg/aws/glue/TestGlueCatalogTable.java @@ -310,6 +310,22 @@ public void testRenameTable() { assertThat(renamedTable.currentSnapshot()).isEqualTo(table.currentSnapshot()); } + @Test + public void testCreateTableInUniqueLocation() { + String namespace = createNamespace(); + String tableName = createTable(namespace); + String newTableName = tableName + 
"_renamed"; + + glueCatalogWithUniqueLocation.renameTable( + TableIdentifier.of(namespace, tableName), TableIdentifier.of(namespace, newTableName)); + Table renamedTable = + glueCatalogWithUniqueLocation.loadTable(TableIdentifier.of(namespace, newTableName)); + createTable(namespace, tableName); + Table table = glueCatalogWithUniqueLocation.loadTable(TableIdentifier.of(namespace, tableName)); + + assertThat(renamedTable.location()).isNotEqualTo(table.location()); + } + @Test public void testRenameTableFailsToCreateNewTable() { String namespace = createNamespace(); @@ -743,7 +759,8 @@ public void testTableLevelS3Tags() { new AwsProperties(properties), new S3FileIOProperties(properties), GLUE, - null); + null, + false /* uniqTableLocation */); String namespace = createNamespace(); String tableName = getRandomName(); createTable(namespace, tableName); diff --git a/aws/src/main/java/org/apache/iceberg/aws/dynamodb/DynamoDbCatalog.java b/aws/src/main/java/org/apache/iceberg/aws/dynamodb/DynamoDbCatalog.java index 0c991af75076..7c75f99d6d69 100644 --- a/aws/src/main/java/org/apache/iceberg/aws/dynamodb/DynamoDbCatalog.java +++ b/aws/src/main/java/org/apache/iceberg/aws/dynamodb/DynamoDbCatalog.java @@ -53,6 +53,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.util.LocationUtil; +import org.apache.iceberg.util.PropertyUtil; import org.apache.iceberg.util.Tasks; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -112,6 +113,7 @@ public class DynamoDbCatalog extends BaseMetastoreCatalog private FileIO fileIO; private CloseableGroup closeableGroup; private Map catalogProperties; + private boolean uniqueTableLocation; public DynamoDbCatalog() {} @@ -123,12 +125,21 @@ public void initialize(String name, Map properties) { properties.get(CatalogProperties.WAREHOUSE_LOCATION), new AwsProperties(properties), AwsClientFactories.from(properties).dynamo(), - 
initializeFileIO(properties)); + initializeFileIO(properties), + PropertyUtil.propertyAsBoolean( + properties, + CatalogProperties.UNIQUE_TABLE_LOCATION, + CatalogProperties.UNIQUE_TABLE_LOCATION_DEFAULT)); } @VisibleForTesting void initialize( - String name, String path, AwsProperties properties, DynamoDbClient client, FileIO io) { + String name, + String path, + AwsProperties properties, + DynamoDbClient client, + FileIO io, + boolean uniqTableLocation) { Preconditions.checkArgument( !Strings.isNullOrEmpty(path), "Cannot initialize DynamoDbCatalog because warehousePath must not be null or empty"); @@ -138,6 +149,7 @@ void initialize( this.warehousePath = LocationUtil.stripTrailingSlash(path); this.dynamo = client; this.fileIO = io; + this.uniqueTableLocation = uniqTableLocation; this.closeableGroup = new CloseableGroup(); closeableGroup.addCloseable(dynamo); @@ -177,12 +189,12 @@ protected String defaultWarehouseLocation(TableIdentifier tableIdentifier) { } String defaultLocationCol = toPropertyCol(PROPERTY_DEFAULT_LOCATION); + String tableLocation = LocationUtil.tableLocation(tableIdentifier, uniqueTableLocation); if (response.item().containsKey(defaultLocationCol)) { - return String.format( - "%s/%s", response.item().get(defaultLocationCol).s(), tableIdentifier.name()); + return String.format("%s/%s", response.item().get(defaultLocationCol).s(), tableLocation); } else { return String.format( - "%s/%s.db/%s", warehousePath, tableIdentifier.namespace(), tableIdentifier.name()); + "%s/%s.db/%s", warehousePath, tableIdentifier.namespace(), tableLocation); } } diff --git a/aws/src/main/java/org/apache/iceberg/aws/glue/GlueCatalog.java b/aws/src/main/java/org/apache/iceberg/aws/glue/GlueCatalog.java index 47807a2b9f37..94e53cc1ab69 100644 --- a/aws/src/main/java/org/apache/iceberg/aws/glue/GlueCatalog.java +++ b/aws/src/main/java/org/apache/iceberg/aws/glue/GlueCatalog.java @@ -89,6 +89,7 @@ public class GlueCatalog extends BaseMetastoreCatalog private Object 
hadoopConf; private String catalogName; private String warehousePath; + private boolean uniqueTableLocation; private AwsProperties awsProperties; private S3FileIOProperties s3FileIOProperties; private LockManager lockManager; @@ -144,7 +145,11 @@ public void initialize(String name, Map properties) { new AwsProperties(properties), new S3FileIOProperties(properties), awsClientFactory.glue(), - initializeLockManager(properties)); + initializeLockManager(properties), + PropertyUtil.propertyAsBoolean( + properties, + CatalogProperties.UNIQUE_TABLE_LOCATION, + CatalogProperties.UNIQUE_TABLE_LOCATION_DEFAULT)); } private LockManager initializeLockManager(Map properties) { @@ -172,7 +177,17 @@ void initialize( LockManager lock, Map catalogProps) { this.catalogProperties = catalogProps; - initialize(name, path, properties, s3Properties, client, lock); + initialize( + name, + path, + properties, + s3Properties, + client, + lock, + PropertyUtil.propertyAsBoolean( + catalogProps, + CatalogProperties.UNIQUE_TABLE_LOCATION, + CatalogProperties.UNIQUE_TABLE_LOCATION_DEFAULT)); } @VisibleForTesting @@ -182,13 +197,15 @@ void initialize( AwsProperties properties, S3FileIOProperties s3Properties, GlueClient client, - LockManager lock) { + LockManager lock, + boolean uniqTableLocation) { this.catalogName = name; this.awsProperties = properties; this.s3FileIOProperties = s3Properties; this.warehousePath = Strings.isNullOrEmpty(path) ? 
null : LocationUtil.stripTrailingSlash(path); this.glue = client; this.lockManager = lock; + this.uniqueTableLocation = uniqTableLocation; this.closeableGroup = new CloseableGroup(); this.fileIOTracker = new FileIOTracker(); @@ -278,9 +295,10 @@ protected String defaultWarehouseLocation(TableIdentifier tableIdentifier) { tableIdentifier, awsProperties.glueCatalogSkipNameValidation())) .build()); String dbLocationUri = response.database().locationUri(); + String tableLocation = LocationUtil.tableLocation(tableIdentifier, uniqueTableLocation); if (dbLocationUri != null) { dbLocationUri = LocationUtil.stripTrailingSlash(dbLocationUri); - return String.format("%s/%s", dbLocationUri, tableIdentifier.name()); + return String.format("%s/%s", dbLocationUri, tableLocation); } ValidationException.check( @@ -292,7 +310,7 @@ protected String defaultWarehouseLocation(TableIdentifier tableIdentifier) { warehousePath, IcebergToGlueConverter.getDatabaseName( tableIdentifier, awsProperties.glueCatalogSkipNameValidation()), - tableIdentifier.name()); + tableLocation); } @Override diff --git a/aws/src/test/java/org/apache/iceberg/aws/dynamodb/TestDynamoDbCatalog.java b/aws/src/test/java/org/apache/iceberg/aws/dynamodb/TestDynamoDbCatalog.java index b602cea303d8..e172831a2428 100644 --- a/aws/src/test/java/org/apache/iceberg/aws/dynamodb/TestDynamoDbCatalog.java +++ b/aws/src/test/java/org/apache/iceberg/aws/dynamodb/TestDynamoDbCatalog.java @@ -49,14 +49,25 @@ public class TestDynamoDbCatalog { public void before() { dynamo = Mockito.mock(DynamoDbClient.class); dynamoCatalog = new DynamoDbCatalog(); - dynamoCatalog.initialize(CATALOG_NAME, WAREHOUSE_PATH, new AwsProperties(), dynamo, null); + dynamoCatalog.initialize( + CATALOG_NAME, + WAREHOUSE_PATH, + new AwsProperties(), + dynamo, + null, + false /* uniqTableLocation */); } @Test public void testConstructorWarehousePathWithEndSlash() { DynamoDbCatalog catalogWithSlash = new DynamoDbCatalog(); catalogWithSlash.initialize( - 
CATALOG_NAME, WAREHOUSE_PATH + "/", new AwsProperties(), dynamo, null); + CATALOG_NAME, + WAREHOUSE_PATH + "/", + new AwsProperties(), + dynamo, + null, + false /* uniqTableLocation */); Mockito.doReturn(GetItemResponse.builder().item(Maps.newHashMap()).build()) .when(dynamo) .getItem(any(GetItemRequest.class)); @@ -103,4 +114,49 @@ public void testDefaultWarehouseLocationNoNamespace() { .isInstanceOf(NoSuchNamespaceException.class) .hasMessageContaining("Cannot find default warehouse location:"); } + + @Test + public void testDefaultWarehouseLocationUniqueWithoutDbUri() throws Exception { + try (DynamoDbCatalog catalog = new DynamoDbCatalog()) { + catalog.initialize( + CATALOG_NAME, + WAREHOUSE_PATH, + new AwsProperties(), + dynamo, + null, + true /* uniqTableLocation */); + Mockito.doReturn(GetItemResponse.builder().item(Maps.newHashMap()).build()) + .when(dynamo) + .getItem(any(GetItemRequest.class)); + + String defaultWarehouseLocation = catalog.defaultWarehouseLocation(TABLE_IDENTIFIER); + assertThat(defaultWarehouseLocation).matches(WAREHOUSE_PATH + "/db.db/table-[a-z0-9]{32}"); + } + } + + @Test + public void testDefaultWarehouseLocationUniqueWithDbUri() throws Exception { + try (DynamoDbCatalog catalog = new DynamoDbCatalog()) { + catalog.initialize( + CATALOG_NAME, + WAREHOUSE_PATH, + new AwsProperties(), + dynamo, + null, + true /* uniqTableLocation */); + String dbUri = "s3://bucket2/db"; + Mockito.doReturn( + GetItemResponse.builder() + .item( + ImmutableMap.of( + toPropertyCol(DynamoDbCatalog.defaultLocationProperty()), + AttributeValue.builder().s(dbUri).build())) + .build()) + .when(dynamo) + .getItem(any(GetItemRequest.class)); + + String defaultWarehouseLocation = catalog.defaultWarehouseLocation(TABLE_IDENTIFIER); + assertThat(defaultWarehouseLocation).matches("s3://bucket2/db/table-[a-z0-9]{32}"); + } + } } diff --git a/aws/src/test/java/org/apache/iceberg/aws/glue/TestGlueCatalog.java 
b/aws/src/test/java/org/apache/iceberg/aws/glue/TestGlueCatalog.java index 2042948eb3c9..82f7e84d563b 100644 --- a/aws/src/test/java/org/apache/iceberg/aws/glue/TestGlueCatalog.java +++ b/aws/src/test/java/org/apache/iceberg/aws/glue/TestGlueCatalog.java @@ -194,6 +194,28 @@ public void testDefaultWarehouseLocationCustomCatalogId() { Mockito.argThat((GetDatabaseRequest req) -> req.catalogId().equals(catalogId))); } + @Test + public void testDefaultWarehouseLocationUnique() { + GlueCatalog catalog = new GlueCatalog(); + catalog.initialize( + CATALOG_NAME, + WAREHOUSE_PATH, + new AwsProperties(), + new S3FileIOProperties(), + glue, + LockManagers.defaultLockManager(), + true /* uniqTableLocation */); + + Mockito.doReturn( + GetDatabaseResponse.builder() + .database(Database.builder().name("db").locationUri("s3://bucket2/db").build()) + .build()) + .when(glue) + .getDatabase(Mockito.any(GetDatabaseRequest.class)); + String location = catalog.defaultWarehouseLocation(TableIdentifier.of("db", "table")); + assertThat(location).matches("s3://bucket2/db/table-[a-z0-9]{32}"); + } + @Test public void testListTables() { Mockito.doReturn( diff --git a/bigquery/src/test/java/org/apache/iceberg/gcp/bigquery/TestBigQueryCatalog.java b/bigquery/src/test/java/org/apache/iceberg/gcp/bigquery/TestBigQueryCatalog.java index cdeaa1ef1e63..23441d0db184 100644 --- a/bigquery/src/test/java/org/apache/iceberg/gcp/bigquery/TestBigQueryCatalog.java +++ b/bigquery/src/test/java/org/apache/iceberg/gcp/bigquery/TestBigQueryCatalog.java @@ -24,6 +24,7 @@ import static org.assertj.core.api.Assertions.assertThat; import java.io.File; +import java.io.IOException; import java.util.List; import java.util.Map; import org.apache.iceberg.CatalogProperties; @@ -169,6 +170,18 @@ public void testRenameTableMissingSourceTable() { super.testRenameTableMissingSourceTable(); } + @Disabled("BigQuery Metastore does not support rename tables") + @Test + public void createTableInUniqueLocation() { + 
super.createTableInUniqueLocation(); + } + + @Disabled("BigQuery Metastore does not support rename tables") + @Test + public void dropAfterRenameDoesntCorruptTable() throws IOException { + super.dropAfterRenameDoesntCorruptTable(); + } + @Test public void testIsValidIdentifierWithValidSingleLevelNamespace() { assertThat(catalog.isValidIdentifier(TableIdentifier.of("dataset1", "table1"))).isTrue(); diff --git a/core/src/main/java/org/apache/iceberg/CatalogProperties.java b/core/src/main/java/org/apache/iceberg/CatalogProperties.java index 59744e50924f..6b85ccbc87bc 100644 --- a/core/src/main/java/org/apache/iceberg/CatalogProperties.java +++ b/core/src/main/java/org/apache/iceberg/CatalogProperties.java @@ -158,6 +158,15 @@ private CatalogProperties() {} public static final String APP_NAME = "app-name"; public static final String USER = "user"; + /** + * Requests that the catalog provide unique locations for new tables. + * + *
<p>
Relevant only for catalogs which support unique table locations. + */ + public static final String UNIQUE_TABLE_LOCATION = "unique-table-location"; + + public static final boolean UNIQUE_TABLE_LOCATION_DEFAULT = false; + public static final String AUTH_SESSION_TIMEOUT_MS = "auth.session-timeout-ms"; public static final long AUTH_SESSION_TIMEOUT_MS_DEFAULT = TimeUnit.HOURS.toMillis(1); diff --git a/core/src/main/java/org/apache/iceberg/inmemory/InMemoryCatalog.java b/core/src/main/java/org/apache/iceberg/inmemory/InMemoryCatalog.java index 975b5a39dfe3..55c982f3d625 100644 --- a/core/src/main/java/org/apache/iceberg/inmemory/InMemoryCatalog.java +++ b/core/src/main/java/org/apache/iceberg/inmemory/InMemoryCatalog.java @@ -48,6 +48,8 @@ import org.apache.iceberg.relocated.com.google.common.base.Objects; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.util.LocationUtil; +import org.apache.iceberg.util.PropertyUtil; import org.apache.iceberg.view.BaseMetastoreViewCatalog; import org.apache.iceberg.view.BaseViewOperations; import org.apache.iceberg.view.ViewMetadata; @@ -71,6 +73,7 @@ public class InMemoryCatalog extends BaseMetastoreViewCatalog private String catalogName; private String warehouseLocation; private CloseableGroup closeableGroup; + private boolean uniqueTableLocation; private Map catalogProperties; public InMemoryCatalog() { @@ -88,6 +91,11 @@ public String name() { public void initialize(String name, Map properties) { this.catalogName = name != null ? 
name : InMemoryCatalog.class.getSimpleName(); this.catalogProperties = ImmutableMap.copyOf(properties); + this.uniqueTableLocation = + PropertyUtil.propertyAsBoolean( + properties, + CatalogProperties.UNIQUE_TABLE_LOCATION, + CatalogProperties.UNIQUE_TABLE_LOCATION_DEFAULT); String warehouse = properties.getOrDefault(CatalogProperties.WAREHOUSE_LOCATION, ""); this.warehouseLocation = warehouse.replaceAll("/*$", ""); @@ -104,8 +112,8 @@ protected TableOperations newTableOps(TableIdentifier tableIdentifier) { @Override protected String defaultWarehouseLocation(TableIdentifier tableIdentifier) { - return SLASH.join( - defaultNamespaceLocation(tableIdentifier.namespace()), tableIdentifier.name()); + String tableLocation = LocationUtil.tableLocation(tableIdentifier, uniqueTableLocation); + return SLASH.join(defaultNamespaceLocation(tableIdentifier.namespace()), tableLocation); } private String defaultNamespaceLocation(Namespace namespace) { diff --git a/core/src/main/java/org/apache/iceberg/jdbc/JdbcCatalog.java b/core/src/main/java/org/apache/iceberg/jdbc/JdbcCatalog.java index 55c00319a0cc..007821da39fe 100644 --- a/core/src/main/java/org/apache/iceberg/jdbc/JdbcCatalog.java +++ b/core/src/main/java/org/apache/iceberg/jdbc/JdbcCatalog.java @@ -88,6 +88,7 @@ public class JdbcCatalog extends BaseMetastoreViewCatalog private Object conf; private JdbcClientPool connections; private Map catalogProperties; + private boolean uniqueTableLocation; private final Function, FileIO> ioBuilder; private final Function, JdbcClientPool> clientPoolBuilder; private boolean initializeCatalogTables; @@ -120,6 +121,11 @@ public void initialize(String name, Map properties) { this.warehouseLocation = LocationUtil.stripTrailingSlash(inputWarehouseLocation); this.catalogProperties = ImmutableMap.copyOf(properties); + this.uniqueTableLocation = + PropertyUtil.propertyAsBoolean( + properties, + CatalogProperties.UNIQUE_TABLE_LOCATION, + CatalogProperties.UNIQUE_TABLE_LOCATION_DEFAULT); if (name 
!= null) { this.catalogName = name; @@ -287,7 +293,8 @@ protected ViewOperations newViewOps(TableIdentifier viewIdentifier) { @Override protected String defaultWarehouseLocation(TableIdentifier table) { - return SLASH.join(defaultNamespaceLocation(table.namespace()), table.name()); + String tableLocation = LocationUtil.tableLocation(table, uniqueTableLocation); + return SLASH.join(defaultNamespaceLocation(table.namespace()), tableLocation); } @Override diff --git a/core/src/main/java/org/apache/iceberg/util/LocationUtil.java b/core/src/main/java/org/apache/iceberg/util/LocationUtil.java index 400307149238..4c0d401c74b9 100644 --- a/core/src/main/java/org/apache/iceberg/util/LocationUtil.java +++ b/core/src/main/java/org/apache/iceberg/util/LocationUtil.java @@ -18,6 +18,8 @@ */ package org.apache.iceberg.util; +import java.util.UUID; +import org.apache.iceberg.catalog.TableIdentifier; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.relocated.com.google.common.base.Strings; @@ -33,4 +35,26 @@ public static String stripTrailingSlash(String path) { } return result; } + + /** + * Returns a path component derived from the {@code tableIdentifier}, used as part of the table + * location URI. + * + *
<p>
If {@code useUniqueLocation} is {@code true}, the returned component will include a random + * UUID suffix. Otherwise, the plain table name is returned. + * + * @param tableIdentifier Iceberg table identifier + * @param useUniqueLocation whether to ensure uniqueness + * @return a string representing the table name component for a location URI + */ + public static String tableLocation(TableIdentifier tableIdentifier, boolean useUniqueLocation) { + Preconditions.checkArgument(null != tableIdentifier, "Invalid identifier: null"); + + if (useUniqueLocation) { + String uniqueSuffix = UUID.randomUUID().toString().replace("-", ""); + return String.format("%s-%s", tableIdentifier.name(), uniqueSuffix); + } else { + return tableIdentifier.name(); + } + } } diff --git a/core/src/test/java/org/apache/iceberg/catalog/CatalogTests.java b/core/src/test/java/org/apache/iceberg/catalog/CatalogTests.java index 833b2fb0b46f..9053f21ea112 100644 --- a/core/src/test/java/org/apache/iceberg/catalog/CatalogTests.java +++ b/core/src/test/java/org/apache/iceberg/catalog/CatalogTests.java @@ -72,6 +72,8 @@ import org.apache.iceberg.expressions.Expressions; import org.apache.iceberg.expressions.Literal; import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.io.FileIO; +import org.apache.iceberg.io.InputFile; import org.apache.iceberg.metrics.CommitReport; import org.apache.iceberg.metrics.MetricsReport; import org.apache.iceberg.metrics.MetricsReporter; @@ -1025,6 +1027,86 @@ public void testRenameTable() { assertEmpty("Should not contain table after drop", catalog, NS); } + @Test + public void createTableInUniqueLocation() { + Map additionalProperties = + ImmutableMap.of(CatalogProperties.UNIQUE_TABLE_LOCATION, "true"); + C catalog = initCatalog("uniq_path_catalog", additionalProperties); + + if (requiresNamespaceCreate()) { + catalog.createNamespace(NS); + } + + catalog.createTable(TABLE, SCHEMA, PartitionSpec.unpartitioned()); + catalog.renameTable(TABLE, 
RENAMED_TABLE); + catalog.createTable(TABLE, SCHEMA, PartitionSpec.unpartitioned()); + + Table table = catalog.loadTable(TABLE); + Table renamedTable = catalog.loadTable(RENAMED_TABLE); + + assertThat(table.location()) + .as("Tables %s and %s have different location", TABLE, RENAMED_TABLE) + .isNotEqualTo(renamedTable.location()); + } + + @Test + public void dropAfterRenameDoesntCorruptTable() throws IOException { + C catalog = catalog(); + + if (requiresNamespaceCreate()) { + catalog.createNamespace(TABLE.namespace()); + } + + PartitionSpec spec = PartitionSpec.unpartitioned(); + + Table initialTable = catalog.createTable(TABLE, SCHEMA, spec); + String initialFilePath = initialTable.locationProvider().newDataLocation("data-a.parquet"); + DataFile dataFile = + DataFiles.builder(spec) + .withPath(initialFilePath) + .withFileSizeInBytes(10) + .withRecordCount(2) + .build(); + initialTable.io().newOutputFile(initialFilePath).create().close(); + initialTable.newAppend().appendFile(dataFile).commit(); + + catalog.renameTable(TABLE, RENAMED_TABLE); + + Table newTable = catalog.createTable(TABLE, SCHEMA, spec); + String newFilePath = newTable.locationProvider().newDataLocation("data-b.parquet"); + DataFile anotherFile = + DataFiles.builder(spec) + .withPath(newFilePath) + .withFileSizeInBytes(10) + .withRecordCount(2) + .build(); + newTable.io().newOutputFile(newFilePath).create().close(); + newTable.newAppend().appendFile(anotherFile).commit(); + + catalog.dropTable(RENAMED_TABLE, true); + + assertThat(catalog.tableExists(RENAMED_TABLE)) + .as("After PURGE, %s must not exist", RENAMED_TABLE) + .isFalse(); + assertThat(catalog.tableExists(TABLE)) + .as( + "After dropping the renamed table with PURGE, the recreated table with the original name (%s) must exist", + TABLE) + .isTrue(); + + Table table = catalog.loadTable(TABLE); + FileIO io = table.io(); + try (CloseableIterable tasks = table.newScan().planFiles()) { + tasks.forEach( + task -> { + InputFile file = 
io.newInputFile(task.file().location()); + assertThat(file.exists()) + .as("Table %s should remain unaffected by dropping %s", TABLE, RENAMED_TABLE) + .isTrue(); + }); + } + } + @Test public void testRenameTableMissingSourceTable() { C catalog = catalog(); diff --git a/core/src/test/java/org/apache/iceberg/rest/TestRESTCatalog.java b/core/src/test/java/org/apache/iceberg/rest/TestRESTCatalog.java index c1630451a33e..e4fa156059d8 100644 --- a/core/src/test/java/org/apache/iceberg/rest/TestRESTCatalog.java +++ b/core/src/test/java/org/apache/iceberg/rest/TestRESTCatalog.java @@ -274,12 +274,7 @@ protected T execute( @BeforeEach public void createCatalog() throws Exception { - File warehouse = temp.toFile(); - this.backendCatalog = new InMemoryCatalog(); - this.backendCatalog.initialize( - "in-memory", - ImmutableMap.of(CatalogProperties.WAREHOUSE_LOCATION, warehouse.getAbsolutePath())); HTTPHeaders catalogHeaders = HTTPHeaders.of( @@ -317,6 +312,14 @@ public void createCatalog() throws Exception { @Override protected RESTCatalog initCatalog(String catalogName, Map additionalProperties) { Configuration conf = new Configuration(); + File warehouse = temp.toFile(); + + backendCatalog.initialize( + "in-memory", + ImmutableMap.builder() + .put(CatalogProperties.WAREHOUSE_LOCATION, warehouse.getAbsolutePath()) + .putAll(additionalProperties) + .build()); RESTCatalog catalog = new RESTCatalog( diff --git a/dell/src/main/java/org/apache/iceberg/dell/ecs/EcsCatalog.java b/dell/src/main/java/org/apache/iceberg/dell/ecs/EcsCatalog.java index 07ad68365837..bb8150d16dca 100644 --- a/dell/src/main/java/org/apache/iceberg/dell/ecs/EcsCatalog.java +++ b/dell/src/main/java/org/apache/iceberg/dell/ecs/EcsCatalog.java @@ -59,6 +59,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.relocated.com.google.common.io.ByteStreams; import org.apache.iceberg.util.LocationUtil; +import org.apache.iceberg.util.PropertyUtil; import org.slf4j.Logger; 
import org.slf4j.LoggerFactory; @@ -86,6 +87,7 @@ public class EcsCatalog extends BaseMetastoreCatalog private FileIO fileIO; private CloseableGroup closeableGroup; private Map catalogProperties; + private boolean uniqueTableLocation; /** * No-arg constructor to load the catalog dynamically. @@ -102,6 +104,12 @@ public void initialize(String name, Map properties) { !Strings.isNullOrEmpty(inputWarehouseLocation), "Cannot initialize EcsCatalog because warehousePath must not be null or empty"); + this.uniqueTableLocation = + PropertyUtil.propertyAsBoolean( + properties, + CatalogProperties.UNIQUE_TABLE_LOCATION, + CatalogProperties.UNIQUE_TABLE_LOCATION_DEFAULT); + this.catalogName = name; this.warehouseLocation = new EcsURI(LocationUtil.stripTrailingSlash(inputWarehouseLocation)); this.client = DellClientFactories.from(properties).ecsS3(); @@ -136,8 +144,8 @@ protected TableOperations newTableOps(TableIdentifier tableIdentifier) { @Override protected String defaultWarehouseLocation(TableIdentifier tableIdentifier) { - return String.format( - "%s/%s", namespacePrefix(tableIdentifier.namespace()), tableIdentifier.name()); + String tableLocation = LocationUtil.tableLocation(tableIdentifier, uniqueTableLocation); + return String.format("%s/%s", namespacePrefix(tableIdentifier.namespace()), tableLocation); } /** Iterate all table objects with the namespace prefix. 
*/ diff --git a/dell/src/test/java/org/apache/iceberg/dell/ecs/TestEcsCatalog.java b/dell/src/test/java/org/apache/iceberg/dell/ecs/TestEcsCatalog.java index 4714d37d72b9..82549f1eccd9 100644 --- a/dell/src/test/java/org/apache/iceberg/dell/ecs/TestEcsCatalog.java +++ b/dell/src/test/java/org/apache/iceberg/dell/ecs/TestEcsCatalog.java @@ -55,11 +55,17 @@ public class TestEcsCatalog { @BeforeEach public void before() { - ecsCatalog = new EcsCatalog(); + ecsCatalog = createCatalog("test", ImmutableMap.of()); + } + + private EcsCatalog createCatalog(String name, Map additionalProperties) { + EcsCatalog catalog = new EcsCatalog(); Map properties = Maps.newHashMap(); properties.put(CatalogProperties.WAREHOUSE_LOCATION, new EcsURI(rule.bucket(), "").location()); properties.putAll(rule.clientProperties()); - ecsCatalog.initialize("test", properties); + properties.putAll(additionalProperties); + catalog.initialize(name, properties); + return catalog; } @AfterEach @@ -172,6 +178,30 @@ public void testRenameTable() { .isTrue(); } + @Test + public void testCreateTableInUniqueLocation() throws Exception { + try (EcsCatalog catalog = + createCatalog( + "unique_location_catalog", + ImmutableMap.of(CatalogProperties.UNIQUE_TABLE_LOCATION, "true"))) { + + Namespace ns = Namespace.of("a"); + TableIdentifier tableIdent = TableIdentifier.of(ns, "t1"); + TableIdentifier renamedIdent = TableIdentifier.of(ns, "t2"); + + catalog.createNamespace(ns); + catalog.createTable(tableIdent, SCHEMA); + catalog.renameTable(tableIdent, renamedIdent); + + Table table = catalog.createTable(tableIdent, SCHEMA); + Table renamedTable = catalog.loadTable(renamedIdent); + + assertThat(table.location()) + .as("Should have a different table location") + .isNotEqualTo(renamedTable.location()); + } + } + @Test public void testRegisterTable() { TableIdentifier identifier = TableIdentifier.of("a", "t1"); diff --git a/docs/docs/configuration.md b/docs/docs/configuration.md index f12bcea6afd5..9dd2e64390e4 
100644 --- a/docs/docs/configuration.md +++ b/docs/docs/configuration.md @@ -155,6 +155,7 @@ Iceberg catalogs support using catalog properties to configure catalog behaviors | cache-enabled | true | Whether to cache catalog entries | | cache.expiration-interval-ms | 30000 | How long catalog entries are locally cached, in milliseconds; 0 disables caching, negative values disable expiration | | metrics-reporter-impl | org.apache.iceberg.metrics.LoggingMetricsReporter | Custom `MetricsReporter` implementation to use in a catalog. See the [Metrics reporting](metrics-reporting.md) section for additional details | +| unique-table-location | false | Whether to use a unique location for new tables | | encryption.kms-impl | null | a custom `KeyManagementClient` implementation to use in a catalog for interactions with KMS (key management service). See the [Encryption](encryption.md) document for additional details | `HadoopCatalog` and `HiveCatalog` can access the properties in their constructors. 
diff --git a/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveCatalog.java b/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveCatalog.java index 93267716db66..4d881c515d48 100644 --- a/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveCatalog.java +++ b/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveCatalog.java @@ -65,6 +65,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.util.LocationUtil; +import org.apache.iceberg.util.PropertyUtil; import org.apache.iceberg.view.BaseMetastoreViewCatalog; import org.apache.iceberg.view.View; import org.apache.iceberg.view.ViewBuilder; @@ -94,6 +95,7 @@ public class HiveCatalog extends BaseMetastoreViewCatalog private KeyManagementClient keyManagementClient; private ClientPool clients; private boolean listAllTables = false; + private boolean uniqueTableLocation; private Map catalogProperties; public HiveCatalog() {} @@ -131,6 +133,12 @@ public void initialize(String inputName, Map properties) { this.keyManagementClient = EncryptionUtil.createKmsClient(properties); } + this.uniqueTableLocation = + PropertyUtil.propertyAsBoolean( + properties, + CatalogProperties.UNIQUE_TABLE_LOCATION, + CatalogProperties.UNIQUE_TABLE_LOCATION_DEFAULT); + this.clients = new CachedClientPool(conf, properties); } @@ -708,13 +716,14 @@ protected String defaultWarehouseLocation(TableIdentifier tableIdentifier) { // - Create the metadata in HMS, and this way committing the changes // Create a new location based on the namespace / database if it is set on database level + String tableLocation = LocationUtil.tableLocation(tableIdentifier, uniqueTableLocation); try { Database databaseData = clients.run(client -> client.getDatabase(tableIdentifier.namespace().levels()[0])); if (databaseData.getLocationUri() != null) { // If the database location is set use it as a base. 
String databaseLocation = LocationUtil.stripTrailingSlash(databaseData.getLocationUri()); - return String.format("%s/%s", databaseLocation, tableIdentifier.name()); + return String.format("%s/%s", databaseLocation, tableLocation); } } catch (NoSuchObjectException e) { @@ -731,7 +740,7 @@ protected String defaultWarehouseLocation(TableIdentifier tableIdentifier) { // Otherwise, stick to the {WAREHOUSE_DIR}/{DB_NAME}.db/{TABLE_NAME} path String databaseLocation = databaseLocation(tableIdentifier.namespace().levels()[0]); - return String.format("%s/%s", databaseLocation, tableIdentifier.name()); + return String.format("%s/%s", databaseLocation, tableLocation); } private String databaseLocation(String databaseName) { diff --git a/open-api/src/test/java/org/apache/iceberg/rest/RESTCompatibilityKitCatalogTests.java b/open-api/src/test/java/org/apache/iceberg/rest/RESTCompatibilityKitCatalogTests.java index 9a1f86706db6..cbf752e484bd 100644 --- a/open-api/src/test/java/org/apache/iceberg/rest/RESTCompatibilityKitCatalogTests.java +++ b/open-api/src/test/java/org/apache/iceberg/rest/RESTCompatibilityKitCatalogTests.java @@ -26,6 +26,8 @@ import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; import org.junit.jupiter.api.extension.ExtendWith; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -106,4 +108,10 @@ protected boolean supportsNamesWithSlashes() { // for additional details return false; } + + @Disabled("RESTServerExtension isn’t configurable per test") + @Test + public void createTableInUniqueLocation() { + super.createTableInUniqueLocation(); + } } diff --git a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/SparkCatalogConfig.java b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/SparkCatalogConfig.java index a9fbee2fc262..b20c87619ed8 100644 --- 
a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/SparkCatalogConfig.java +++ b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/SparkCatalogConfig.java @@ -65,7 +65,23 @@ public enum SparkCatalogConfig { SPARK_WITH_HIVE_VIEWS( "spark_hive_with_views", SparkCatalog.class.getName(), - ImmutableMap.of("type", "hive", "default-namespace", "default", "cache-enabled", "false")); + ImmutableMap.of("type", "hive", "default-namespace", "default", "cache-enabled", "false")), + SPARK_SESSION_WITH_UNIQUE_LOCATION( + "spark_catalog", + SparkSessionCatalog.class.getName(), + ImmutableMap.of( + "type", "hive", + "default-namespace", "default", + "parquet-enabled", "true", + "unique-table-location", "true", + "cache-enabled", "false")), + HIVE_WITH_UNIQUE_LOCATION( + "hive_with_unique_location", + SparkCatalog.class.getName(), + ImmutableMap.of( + "type", "hive", + "default-namespace", "default", + "unique-table-location", "true")); private final String catalogName; private final String implementation; diff --git a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestUniqueTableLocation.java b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestUniqueTableLocation.java new file mode 100644 index 000000000000..c61bb3b0008e --- /dev/null +++ b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestUniqueTableLocation.java @@ -0,0 +1,132 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.spark.sql; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.UUID; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.Table; +import org.apache.iceberg.actions.DeleteOrphanFiles; +import org.apache.iceberg.catalog.Namespace; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.exceptions.NotFoundException; +import org.apache.iceberg.spark.CatalogTestBase; +import org.apache.iceberg.spark.SparkCatalogConfig; +import org.apache.iceberg.spark.actions.SparkActions; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; + +@ExtendWith(ParameterizedTestExtension.class) +public class TestUniqueTableLocation extends CatalogTestBase { + + private String renamedTableName; + private TableIdentifier renamedIdent; + + @Parameters(name = "catalogName = {0}, implementation = {1}, config = {2}") + protected static Object[][] parameters() { + return new Object[][] { + { + SparkCatalogConfig.HIVE_WITH_UNIQUE_LOCATION.catalogName(), + SparkCatalogConfig.HIVE_WITH_UNIQUE_LOCATION.implementation(), + SparkCatalogConfig.HIVE_WITH_UNIQUE_LOCATION.properties() + }, + { + SparkCatalogConfig.SPARK_SESSION_WITH_UNIQUE_LOCATION.catalogName(), + SparkCatalogConfig.SPARK_SESSION_WITH_UNIQUE_LOCATION.implementation(), + SparkCatalogConfig.SPARK_SESSION_WITH_UNIQUE_LOCATION.properties() + }, + }; + } 
+ + @BeforeEach + public void initTableName() { + renamedTableName = tableName("table_2"); + renamedIdent = TableIdentifier.of(Namespace.of("default"), "table_2"); + } + + @AfterEach + public void dropTestTable() { + try { + sql("DROP TABLE IF EXISTS %s", tableName); + sql("DROP TABLE IF EXISTS %s", renamedTableName); + } catch (NotFoundException ignore) { + // Swallow FNF exception in case of corrupted table so test failure reason is clearer + } + } + + @TestTemplate + public void noCollisionAfterRename() { + assertThat(validationCatalog.tableExists(tableIdent)) + .as("%s should not exist", tableIdent) + .isFalse(); + assertThat(validationCatalog.tableExists(renamedIdent)) + .as("%s should not exist", renamedIdent) + .isFalse(); + + sql("CREATE TABLE %s (id BIGINT NOT NULL, data STRING) USING iceberg", tableName); + + sql("ALTER TABLE %s RENAME TO %s", tableName, renamedTableName); + + sql("CREATE TABLE %s (id BIGINT NOT NULL, data STRING) USING iceberg", tableName); + + Table table = validationCatalog.loadTable(tableIdent); + Table renamedTable = validationCatalog.loadTable(renamedIdent); + + assertThat(table.location()) + .as( + "After rename+recreate, %s and %s must have different locations", + tableName, renamedTableName) + .isNotEqualTo(renamedTable.location()); + } + + @TestTemplate + public void orphanCleanupDoesntCorruptTable() { + SparkActions actions = SparkActions.get(); + + assertThat(validationCatalog.tableExists(tableIdent)) + .as("%s should not exist", tableIdent) + .isFalse(); + assertThat(validationCatalog.tableExists(renamedIdent)) + .as("%s should not exist", renamedIdent) + .isFalse(); + + sql("CREATE TABLE %s (id BIGINT NOT NULL, data STRING) USING iceberg", tableName); + sql("INSERT INTO %s VALUES(0, '%s')", tableName, UUID.randomUUID().toString()); + + sql("ALTER TABLE %s RENAME TO %s", tableName, renamedTableName); + + sql("CREATE TABLE %s (id BIGINT NOT NULL, data STRING) USING iceberg", tableName); + sql("INSERT INTO %s VALUES(1, '%s')", 
tableName, UUID.randomUUID().toString()); + + Table table = validationCatalog.loadTable(tableIdent); + assertThat(table).as("Should load %s", table).isNotNull(); + + long cutoff = System.currentTimeMillis() + 1; + DeleteOrphanFiles.Result result = actions.deleteOrphanFiles(table).olderThan(cutoff).execute(); + assertThat(result.orphanFileLocations()).as("Should not touch any files").isEmpty(); + + assertThat(scalarSql("SELECT count(*) FROM %s", renamedTableName)) + .as("Table %s should remain unaffected by %s table cleanup", renamedTableName, tableName) + .isEqualTo(1L); + } +} From 8d0508308cd068778897c06d3a802740ed4305a6 Mon Sep 17 00:00:00 2001 From: Harrison Crosse Date: Mon, 27 Apr 2026 08:59:31 -0400 Subject: [PATCH 114/197] Parquet: Add write.parquet.page-version table property (#15700) --- .../org/apache/iceberg/TableProperties.java | 4 + docs/docs/configuration.md | 1 + .../org/apache/iceberg/parquet/Parquet.java | 53 +++- .../iceberg/parquet/ParquetFormatModel.java | 6 - .../parquet/TestDictionaryRowGroupFilter.java | 2 +- .../parquet/TestParquetPageVersion.java | 251 ++++++++++++++++++ 6 files changed, 296 insertions(+), 21 deletions(-) create mode 100644 parquet/src/test/java/org/apache/iceberg/parquet/TestParquetPageVersion.java diff --git a/core/src/main/java/org/apache/iceberg/TableProperties.java b/core/src/main/java/org/apache/iceberg/TableProperties.java index 71991f633d97..7100daef437e 100644 --- a/core/src/main/java/org/apache/iceberg/TableProperties.java +++ b/core/src/main/java/org/apache/iceberg/TableProperties.java @@ -135,6 +135,10 @@ private TableProperties() {} "write.delete.parquet.page-size-bytes"; public static final int PARQUET_PAGE_SIZE_BYTES_DEFAULT = 1024 * 1024; // 1 MB + public static final String PARQUET_PAGE_VERSION = "write.parquet.page-version"; + public static final String DELETE_PARQUET_PAGE_VERSION = "write.delete.parquet.page-version"; + public static final String PARQUET_PAGE_VERSION_DEFAULT = "v1"; + public static final 
String PARQUET_PAGE_ROW_LIMIT = "write.parquet.page-row-limit"; public static final String DELETE_PARQUET_PAGE_ROW_LIMIT = "write.delete.parquet.page-row-limit"; public static final int PARQUET_PAGE_ROW_LIMIT_DEFAULT = 20_000; diff --git a/docs/docs/configuration.md b/docs/docs/configuration.md index 9dd2e64390e4..c1bdc80d11bc 100644 --- a/docs/docs/configuration.md +++ b/docs/docs/configuration.md @@ -45,6 +45,7 @@ Iceberg tables support table properties to configure table behavior, like the de | write.delete.format.default | data file format | Default delete file format for the table; parquet, avro, or orc | | write.parquet.row-group-size-bytes | 134217728 (128 MB) | Parquet row group size | | write.parquet.page-size-bytes | 1048576 (1 MB) | Parquet page size | +| write.parquet.page-version | v1 | Parquet data page version: v1 (DataPage V1) or v2 (DataPage V2) | | write.parquet.page-row-limit | 20000 | Parquet page row limit | | write.parquet.dict-size-bytes | 2097152 (2 MB) | Parquet dictionary page size | | write.parquet.compression-codec | zstd | Parquet compression codec: zstd, brotli, lz4, gzip, snappy, uncompressed | diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java b/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java index 2387d52edf2f..f02974d6e79c 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java @@ -23,6 +23,7 @@ import static org.apache.iceberg.TableProperties.DELETE_PARQUET_DICT_SIZE_BYTES; import static org.apache.iceberg.TableProperties.DELETE_PARQUET_PAGE_ROW_LIMIT; import static org.apache.iceberg.TableProperties.DELETE_PARQUET_PAGE_SIZE_BYTES; +import static org.apache.iceberg.TableProperties.DELETE_PARQUET_PAGE_VERSION; import static org.apache.iceberg.TableProperties.DELETE_PARQUET_ROW_GROUP_CHECK_MAX_RECORD_COUNT; import static org.apache.iceberg.TableProperties.DELETE_PARQUET_ROW_GROUP_CHECK_MIN_RECORD_COUNT; import 
static org.apache.iceberg.TableProperties.DELETE_PARQUET_ROW_GROUP_SIZE_BYTES; @@ -42,6 +43,8 @@ import static org.apache.iceberg.TableProperties.PARQUET_PAGE_ROW_LIMIT_DEFAULT; import static org.apache.iceberg.TableProperties.PARQUET_PAGE_SIZE_BYTES; import static org.apache.iceberg.TableProperties.PARQUET_PAGE_SIZE_BYTES_DEFAULT; +import static org.apache.iceberg.TableProperties.PARQUET_PAGE_VERSION; +import static org.apache.iceberg.TableProperties.PARQUET_PAGE_VERSION_DEFAULT; import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_CHECK_MAX_RECORD_COUNT; import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_CHECK_MAX_RECORD_COUNT_DEFAULT; import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_CHECK_MIN_RECORD_COUNT; @@ -95,7 +98,6 @@ import org.apache.iceberg.mapping.NameMapping; import org.apache.iceberg.parquet.ParquetValueWriters.PositionDeleteStructWriter; import org.apache.iceberg.parquet.ParquetValueWriters.StructWriter; -import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Maps; @@ -168,7 +170,6 @@ public static class WriteBuilder implements InternalData.WriteBuilder { private BiFunction> createWriterFunc = null; private MetricsConfig metricsConfig = MetricsConfig.getDefault(); private ParquetFileWriter.Mode writeMode = ParquetFileWriter.Mode.CREATE; - private WriterVersion writerVersion = WriterVersion.PARQUET_1_0; private Function, Context> createContextFunc = Context::dataContext; private ByteBuffer fileEncryptionKey = null; private ByteBuffer fileAADPrefix = null; @@ -266,7 +267,12 @@ public WriteBuilder overwrite(boolean enabled) { } public WriteBuilder writerVersion(WriterVersion version) { - this.writerVersion = version; + Preconditions.checkNotNull(version, "Writer 
version cannot be null"); + Preconditions.checkArgument( + version == WriterVersion.PARQUET_1_0 || version == WriterVersion.PARQUET_2_0, + "Unsupported writer version: %s", + version); + config.put(PARQUET_PAGE_VERSION, version.name()); return this; } @@ -292,15 +298,6 @@ private WriteSupport getWriteSupport(MessageType type) { } } - /* - * Sets the writer version. Default value is PARQUET_1_0 (v1). - */ - @VisibleForTesting - WriteBuilder withWriterVersion(WriterVersion version) { - this.writerVersion = version; - return this; - } - // supposed to always be a private method used strictly by data and delete write builders WriteBuilder createContextFunc(Function, Context> newCreateContextFunc) { this.createContextFunc = newCreateContextFunc; @@ -433,7 +430,7 @@ public FileAppender build() throws IOException { ParquetProperties.Builder propsBuilder = ParquetProperties.builder() - .withWriterVersion(writerVersion) + .withWriterVersion(context.writerVersion()) .withPageSize(pageSize) .withPageRowCountLimit(pageRowLimit) .withDictionaryEncoding(dictionaryEnabled) @@ -469,7 +466,7 @@ public FileAppender build() throws IOException { } else { ParquetWriteBuilder parquetWriteBuilder = new ParquetWriteBuilder(ParquetIO.file(file)) - .withWriterVersion(writerVersion) + .withWriterVersion(context.writerVersion()) .setType(type) .setConfig(config) .setKeyValueMetadata(metadata) @@ -502,6 +499,7 @@ static class Context { private final int pageSize; private final int pageRowLimit; private final int dictionaryPageSize; + private final WriterVersion writerVersion; private final CompressionCodecName codec; private final String compressionLevel; private final int rowGroupCheckMinRecordCount; @@ -518,6 +516,7 @@ private Context( int pageSize, int pageRowLimit, int dictionaryPageSize, + WriterVersion writerVersion, CompressionCodecName codec, String compressionLevel, int rowGroupCheckMinRecordCount, @@ -532,6 +531,7 @@ private Context( this.pageSize = pageSize; this.pageRowLimit = 
pageRowLimit; this.dictionaryPageSize = dictionaryPageSize; + this.writerVersion = writerVersion; this.codec = codec; this.compressionLevel = compressionLevel; this.rowGroupCheckMinRecordCount = rowGroupCheckMinRecordCount; @@ -565,6 +565,10 @@ static Context dataContext(Map config) { config, PARQUET_DICT_SIZE_BYTES, PARQUET_DICT_SIZE_BYTES_DEFAULT); Preconditions.checkArgument(dictionaryPageSize > 0, "Dictionary page size must be > 0"); + WriterVersion writerVersion = + toWriterVersion( + config.getOrDefault(PARQUET_PAGE_VERSION, PARQUET_PAGE_VERSION_DEFAULT)); + String codecAsString = config.getOrDefault(PARQUET_COMPRESSION, PARQUET_COMPRESSION_DEFAULT); CompressionCodecName codec = toCodec(codecAsString); @@ -616,6 +620,7 @@ static Context dataContext(Map config) { pageSize, pageRowLimit, dictionaryPageSize, + writerVersion, codec, compressionLevel, rowGroupCheckMinRecordCount, @@ -652,6 +657,12 @@ static Context deleteContext(Map config) { config, DELETE_PARQUET_DICT_SIZE_BYTES, dataContext.dictionaryPageSize()); Preconditions.checkArgument(dictionaryPageSize > 0, "Dictionary page size must be > 0"); + String deletePageVersion = config.get(DELETE_PARQUET_PAGE_VERSION); + WriterVersion writerVersion = + deletePageVersion != null + ? toWriterVersion(deletePageVersion) + : dataContext.writerVersion(); + String codecAsString = config.get(DELETE_PARQUET_COMPRESSION); CompressionCodecName codec = codecAsString != null ? 
toCodec(codecAsString) : dataContext.codec(); @@ -686,6 +697,7 @@ static Context deleteContext(Map config) { pageSize, pageRowLimit, dictionaryPageSize, + writerVersion, codec, compressionLevel, rowGroupCheckMinRecordCount, @@ -706,6 +718,15 @@ private static CompressionCodecName toCodec(String codecAsString) { } } + private static WriterVersion toWriterVersion(String pageVersion) { + try { + return WriterVersion.fromString(pageVersion); + } catch (IllegalArgumentException e) { + throw new IllegalArgumentException( + "Unsupported Parquet page version: " + pageVersion + " (must be v1 or v2)"); + } + } + int rowGroupSize() { return rowGroupSize; } @@ -722,6 +743,10 @@ int dictionaryPageSize() { return dictionaryPageSize; } + WriterVersion writerVersion() { + return writerVersion; + } + CompressionCodecName codec() { return codec; } diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetFormatModel.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetFormatModel.java index fbd7a6e97fe2..35a802460710 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetFormatModel.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetFormatModel.java @@ -40,12 +40,10 @@ import org.apache.iceberg.mapping.NameMapping; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.parquet.column.ParquetProperties; import org.apache.parquet.schema.MessageType; public class ParquetFormatModel extends BaseFormatModel, R, MessageType> { - public static final String WRITER_VERSION_KEY = "parquet.writer.version"; private final boolean isBatchReader; public static ParquetFormatModel, Void, Object> forPositionDeletes() { @@ -121,10 +119,6 @@ public ModelWriteBuilder engineSchema(S newSchema) { @Override public ModelWriteBuilder set(String property, String value) { - if (WRITER_VERSION_KEY.equals(property)) { - 
internal.writerVersion(ParquetProperties.WriterVersion.valueOf(value)); - } - internal.set(property, value); return this; } diff --git a/parquet/src/test/java/org/apache/iceberg/parquet/TestDictionaryRowGroupFilter.java b/parquet/src/test/java/org/apache/iceberg/parquet/TestDictionaryRowGroupFilter.java index 7a15f8609823..22f8068c0fa3 100644 --- a/parquet/src/test/java/org/apache/iceberg/parquet/TestDictionaryRowGroupFilter.java +++ b/parquet/src/test/java/org/apache/iceberg/parquet/TestDictionaryRowGroupFilter.java @@ -179,7 +179,7 @@ public void createInputFile() throws IOException { OutputFile outFile = Files.localOutput(parquetFile); try (FileAppender appender = - Parquet.write(outFile).schema(FILE_SCHEMA).withWriterVersion(writerVersion).build()) { + Parquet.write(outFile).schema(FILE_SCHEMA).writerVersion(writerVersion).build()) { GenericRecordBuilder builder = new GenericRecordBuilder(convert(FILE_SCHEMA, "table")); // create 20 copies of each record to ensure dictionary-encoding for (int copy = 0; copy < 20; copy += 1) { diff --git a/parquet/src/test/java/org/apache/iceberg/parquet/TestParquetPageVersion.java b/parquet/src/test/java/org/apache/iceberg/parquet/TestParquetPageVersion.java new file mode 100644 index 000000000000..3be1dce4d9ea --- /dev/null +++ b/parquet/src/test/java/org/apache/iceberg/parquet/TestParquetPageVersion.java @@ -0,0 +1,251 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.parquet; + +import static org.apache.iceberg.parquet.ParquetWritingTestUtils.createTempFile; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.io.IOException; +import java.nio.file.Path; +import java.util.List; +import org.apache.iceberg.Files; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.data.GenericRecord; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.data.parquet.GenericParquetWriter; +import org.apache.iceberg.deletes.EqualityDeleteWriter; +import org.apache.iceberg.io.FileAppender; +import org.apache.iceberg.io.OutputFile; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.types.Types; +import org.apache.parquet.column.ParquetProperties.WriterVersion; +import org.apache.parquet.column.page.DataPage; +import org.apache.parquet.column.page.DataPageV1; +import org.apache.parquet.column.page.DataPageV2; +import org.apache.parquet.column.page.PageReadStore; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +class TestParquetPageVersion { + private static final Schema SCHEMA = + new Schema( + Types.NestedField.required(1, "id", Types.LongType.get()), + 
Types.NestedField.optional(2, "data", Types.StringType.get())); + + private List records; + + @TempDir private Path temp; + + @BeforeEach + void createRecords() { + GenericRecord record = GenericRecord.create(SCHEMA); + + this.records = + ImmutableList.of( + record.copy(ImmutableMap.of("id", 1L, "data", "a")), + record.copy(ImmutableMap.of("id", 2L, "data", "b")), + record.copy(ImmutableMap.of("id", 3L, "data", "c")), + record.copy(ImmutableMap.of("id", 4L, "data", "d")), + record.copy(ImmutableMap.of("id", 5L, "data", "e"))); + } + + @Test + void testWriterDefaultsToPageVersion1() throws IOException { + OutputFile outputFile = newOutputFile(); + + try (FileAppender writer = + Parquet.write(outputFile) + .schema(SCHEMA) + .createWriterFunc(GenericParquetWriter::create) + .build()) { + writer.addAll(records); + } + + assertThat(firstDataPage(outputFile)).isInstanceOf(DataPageV1.class); + } + + @Test + void testWriterUsesConfiguredPageVersion() throws IOException { + OutputFile outputFile = newOutputFile(); + + try (FileAppender writer = + Parquet.write(outputFile) + .schema(SCHEMA) + .set(TableProperties.PARQUET_PAGE_VERSION, "v2") + .createWriterFunc(GenericParquetWriter::create) + .build()) { + writer.addAll(records); + } + + assertThat(firstDataPage(outputFile)).isInstanceOf(DataPageV2.class); + } + + @Test + void testDeleteWriterUsesConfiguredPageVersion() throws IOException { + OutputFile outputFile = newOutputFile(); + + EqualityDeleteWriter deleteWriter = + Parquet.writeDeletes(outputFile) + .createWriterFunc(GenericParquetWriter::create) + .set(TableProperties.PARQUET_PAGE_VERSION, "v2") + .overwrite() + .rowSchema(SCHEMA) + .withSpec(PartitionSpec.unpartitioned()) + .equalityFieldIds(1) + .buildEqualityWriter(); + + try (EqualityDeleteWriter writer = deleteWriter) { + writer.write(records); + } + + assertThat(firstDataPage(outputFile)).isInstanceOf(DataPageV2.class); + } + + @Test + void testDeleteWriterUsesDeleteSpecificPageVersion() throws IOException { + 
OutputFile outputFile = newOutputFile(); + + EqualityDeleteWriter deleteWriter = + Parquet.writeDeletes(outputFile) + .createWriterFunc(GenericParquetWriter::create) + .set(TableProperties.PARQUET_PAGE_VERSION, "v1") + .set(TableProperties.DELETE_PARQUET_PAGE_VERSION, "v2") + .overwrite() + .rowSchema(SCHEMA) + .withSpec(PartitionSpec.unpartitioned()) + .equalityFieldIds(1) + .buildEqualityWriter(); + + try (EqualityDeleteWriter writer = deleteWriter) { + writer.write(records); + } + + assertThat(firstDataPage(outputFile)).isInstanceOf(DataPageV2.class); + } + + @Test + void testExplicitWriterVersion2OverridesPageVersionProperty() throws IOException { + OutputFile outputFile = newOutputFile(); + + try (FileAppender writer = + Parquet.write(outputFile) + .schema(SCHEMA) + .set(TableProperties.PARQUET_PAGE_VERSION, "v1") + .writerVersion(WriterVersion.PARQUET_2_0) + .createWriterFunc(GenericParquetWriter::create) + .build()) { + writer.addAll(records); + } + + assertThat(firstDataPage(outputFile)).isInstanceOf(DataPageV2.class); + } + + @Test + void testExplicitWriterVersion1OverridesPageVersionProperty() throws IOException { + OutputFile outputFile = newOutputFile(); + + try (FileAppender writer = + Parquet.write(outputFile) + .schema(SCHEMA) + .set(TableProperties.PARQUET_PAGE_VERSION, "v2") + .writerVersion(WriterVersion.PARQUET_1_0) + .createWriterFunc(GenericParquetWriter::create) + .build()) { + writer.addAll(records); + } + + assertThat(firstDataPage(outputFile)).isInstanceOf(DataPageV1.class); + } + + @Test + void testPageVersionPropertyAfterWriterVersionSetsVersion() throws IOException { + OutputFile outputFile = newOutputFile(); + + try (FileAppender writer = + Parquet.write(outputFile) + .schema(SCHEMA) + .writerVersion(WriterVersion.PARQUET_1_0) + .set(TableProperties.PARQUET_PAGE_VERSION, "v2") + .createWriterFunc(GenericParquetWriter::create) + .build()) { + writer.addAll(records); + } + + 
assertThat(firstDataPage(outputFile)).isInstanceOf(DataPageV2.class); + } + + @Test + void testInvalidPageVersionFails() throws IOException { + OutputFile outputFile = newOutputFile(); + + assertThatThrownBy( + () -> + Parquet.write(outputFile) + .schema(SCHEMA) + .set(TableProperties.PARQUET_PAGE_VERSION, "3") + .createWriterFunc(GenericParquetWriter::create) + .build()) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Unsupported Parquet page version: 3 (must be v1 or v2)"); + } + + @Test + void testInvalidDeletePageVersionFails() throws IOException { + OutputFile outputFile = newOutputFile(); + + assertThatThrownBy( + () -> + Parquet.writeDeletes(outputFile) + .createWriterFunc(GenericParquetWriter::create) + .set(TableProperties.DELETE_PARQUET_PAGE_VERSION, "3") + .overwrite() + .rowSchema(SCHEMA) + .withSpec(PartitionSpec.unpartitioned()) + .equalityFieldIds(1) + .buildEqualityWriter()) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Unsupported Parquet page version: 3 (must be v1 or v2)"); + } + + private OutputFile newOutputFile() throws IOException { + return Files.localOutput(createTempFile(temp)); + } + + private DataPage firstDataPage(OutputFile outputFile) throws IOException { + try (ParquetFileReader reader = + ParquetFileReader.open(ParquetIO.file(outputFile.toInputFile()))) { + PageReadStore rowGroup = reader.readNextRowGroup(); + assertThat(rowGroup).isNotNull(); + + DataPage dataPage = + rowGroup + .getPageReader( + reader.getFileMetaData().getSchema().getColumnDescription(new String[] {"id"})) + .readPage(); + assertThat(dataPage).isNotNull(); + return dataPage; + } + } +} From 836bca9c7e840a0698fc828a1cb4658750b947fc Mon Sep 17 00:00:00 2001 From: GuoYu <511955993@qq.com> Date: Mon, 27 Apr 2026 23:10:46 +0800 Subject: [PATCH 115/197] Flink: RewriteDataFile support dynamic filter (#15865) --- .../maintenance/api/RewriteDataFiles.java | 30 +++++++++-- .../operator/DataFileRewritePlanner.java | 9 ++-- 
.../maintenance/api/TestRewriteDataFiles.java | 52 +++++++++++++++++++ .../operator/OperatorTestBase.java | 38 ++++++++++++++ .../maintenance/operator/RewriteUtil.java | 2 +- .../operator/TestDataFileRewritePlanner.java | 49 +++++++++++++++-- .../operator/TestDataFileRewriteRunner.java | 2 +- 7 files changed, 170 insertions(+), 12 deletions(-) diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/RewriteDataFiles.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/RewriteDataFiles.java index 9aeee75b1464..f03f33a3fd81 100644 --- a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/RewriteDataFiles.java +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/RewriteDataFiles.java @@ -24,6 +24,7 @@ import org.apache.flink.streaming.api.datastream.DataStream; import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator; import org.apache.flink.streaming.api.watermark.Watermark; +import org.apache.flink.util.function.SerializableSupplier; import org.apache.iceberg.SnapshotRef; import org.apache.iceberg.actions.BinPackRewriteFilePlanner; import org.apache.iceberg.actions.SizeBasedFileRewritePlanner; @@ -59,7 +60,7 @@ public static class Builder extends MaintenanceTaskBuilder rewriteOptions = Maps.newHashMapWithExpectedSize(6); private long maxRewriteBytes = Long.MAX_VALUE; - private Expression filter = Expressions.alwaysTrue(); + private SerializableSupplier filterSupplier = Expressions::alwaysTrue; private String branch = SnapshotRef.MAIN_BRANCH; @Override @@ -214,9 +215,32 @@ public Builder maxFilesToRewrite(int maxFilesToRewrite) { * * @param newFilter the filter expression to apply * @return this for method chaining + * @deprecated will be removed in 1.12.0. 
Use {@link #filter(SerializableSupplier)} instead */ + @Deprecated public Builder filter(Expression newFilter) { - this.filter = newFilter; + this.filterSupplier = () -> newFilter; + return this; + } + + /** + * A user-provided supplier of a filter expression that determines which files are considered by + * the rewrite strategy. + * + *

The supplier is evaluated by the planner on every compaction trigger, allowing a fresh + * filter to be produced for each compaction run. + * + *

This is particularly useful for time-relative filters. For example, a supplier such as + * {@code () -> Expressions.greaterThanOrEqual("ts", + * LocalDateTime.now(ZoneOffset.UTC).minus(Duration.ofDays(3)).toString())} ensures that each + * compaction rewrites files from the last 3 days relative to the time the compaction is + * planned, rather than relative to when the job was started. + * + * @param newFilterSupplier the supplier providing the filter expression to apply + * @return this for method chaining + */ + public Builder filter(SerializableSupplier newFilterSupplier) { + this.filterSupplier = newFilterSupplier; return this; } @@ -276,7 +300,7 @@ DataStream append(DataStream trigger) { partialProgressEnabled ? partialProgressMaxCommits : 1, maxRewriteBytes, rewriteOptions, - filter, + filterSupplier, branch)) .name(operatorName(PLANNER_TASK_NAME)) .uid(PLANNER_TASK_NAME + uidSuffix()) diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DataFileRewritePlanner.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DataFileRewritePlanner.java index a9360374df28..b78c602c647f 100644 --- a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DataFileRewritePlanner.java +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DataFileRewritePlanner.java @@ -26,6 +26,7 @@ import org.apache.flink.metrics.Counter; import org.apache.flink.streaming.api.functions.ProcessFunction; import org.apache.flink.util.Collector; +import org.apache.flink.util.function.SerializableSupplier; import org.apache.iceberg.DataFile; import org.apache.iceberg.FileScanTask; import org.apache.iceberg.SerializableTable; @@ -62,8 +63,8 @@ public class DataFileRewritePlanner private final long maxRewriteBytes; private final Map rewriterOptions; private transient Counter errorCounter; - private final Expression filter; private final String branch; + private final 
SerializableSupplier filterSupplier; public DataFileRewritePlanner( String tableName, @@ -73,7 +74,7 @@ public DataFileRewritePlanner( int newPartialProgressMaxCommits, long maxRewriteBytes, Map rewriterOptions, - Expression filter, + SerializableSupplier filterSupplier, String branch) { Preconditions.checkNotNull(tableName, "Table name should no be null"); @@ -89,8 +90,8 @@ public DataFileRewritePlanner( this.partialProgressMaxCommits = newPartialProgressMaxCommits; this.maxRewriteBytes = maxRewriteBytes; this.rewriterOptions = rewriterOptions; - this.filter = filter; this.branch = branch; + this.filterSupplier = filterSupplier; } @Override @@ -125,7 +126,7 @@ public void processElement(Trigger value, Context ctx, Collector o } BinPackRewriteFilePlanner planner = - new BinPackRewriteFilePlanner(table, filter, snapshot.snapshotId(), false); + new BinPackRewriteFilePlanner(table, filterSupplier.get(), snapshot.snapshotId(), false); planner.init(rewriterOptions); FileRewritePlan diff --git a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestRewriteDataFiles.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestRewriteDataFiles.java index bb53b5265655..97b8b6786545 100644 --- a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestRewriteDataFiles.java +++ b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestRewriteDataFiles.java @@ -29,6 +29,7 @@ import static org.apache.iceberg.flink.maintenance.operator.TableMaintenanceMetrics.REMOVED_DATA_FILE_SIZE_METRIC; import static org.assertj.core.api.Assertions.assertThat; +import java.time.Instant; import java.util.List; import java.util.stream.StreamSupport; import org.apache.flink.streaming.api.graph.StreamGraphGenerator; @@ -529,6 +530,57 @@ void testRewriteWithFilter() throws Exception { createRecord(4, "d"))); } + /** + * By verifying that the creation time of the data content in the builder is later than the + * 
creation time of the filter condition — if the filter condition is actually created in the + * planner, then all files can be compacted; otherwise, not all files can be compacted — we can + * confirm whether the filter condition is actually created in the planner. + */ + @Test + void testRewriteWithFilterSupplier() throws Exception { + Table table = createTable(); + + appendRewriteDataFiles( + RewriteDataFiles.builder() + .parallelism(2) + .deleteFileThreshold(10) + .targetFileSizeBytes(1_000_000L) + .maxFileGroupSizeBytes(10_000_000L) + .maxFileSizeBytes(2_000_000L) + .minFileSizeBytes(500_000L) + .minInputFiles(2) + // Rewrite data files where id is less than current timestamp in planner + .filter(() -> Expressions.lessThan("id", (int) Instant.now().getEpochSecond())) + .partialProgressEnabled(true) + .partialProgressMaxCommits(1) + .maxRewriteBytes(100_000L) + .rewriteAll(false)); + + insert(table, 1, "a"); + insert(table, 2, "b"); + insert(table, 3, "c"); + + int epochSecond = (int) Instant.now().getEpochSecond(); + insert(table, epochSecond, "d"); + + assertFileNum(table, 4, 0); + + Thread.sleep(1_000L); + runAndWaitForSuccess(infra.env(), infra.source(), infra.sink()); + + // There is four files, only id is less than current timestamp will be rewritten. so expect 2 + // files. 
+ assertFileNum(table, 1, 0); + + SimpleDataUtil.assertTableRecords( + table, + ImmutableList.of( + createRecord(1, "a"), + createRecord(2, "b"), + createRecord(3, "c"), + createRecord(epochSecond, "d"))); + } + @Test void testBranch() throws Exception { Table table = createTable(); diff --git a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/OperatorTestBase.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/OperatorTestBase.java index b9422a63d646..6dd6cda84f27 100644 --- a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/OperatorTestBase.java +++ b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/OperatorTestBase.java @@ -24,7 +24,10 @@ import java.io.File; import java.io.IOException; import java.nio.file.Path; +import java.time.LocalDateTime; +import java.time.ZoneOffset; import java.util.List; +import java.util.concurrent.TimeUnit; import javax.annotation.Nullable; import org.apache.flink.configuration.Configuration; import org.apache.flink.configuration.MetricOptions; @@ -79,6 +82,12 @@ public class OperatorTestBase { ImmutableMap.of(), ImmutableSet.of(SimpleDataUtil.SCHEMA.columns().get(0).fieldId())); + private static final Schema SCHEMA_WITH_TIMESTAMP_WITHOUT_ZONE = + new Schema( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get()), + Types.NestedField.optional(3, "ts", Types.TimestampType.withoutZone())); + protected static final String UID_SUFFIX = "UID-Dummy"; protected static final String SLOT_SHARING_GROUP = "SlotSharingGroup"; protected static final TriggerLockFactory LOCK_FACTORY = new MemoryLockFactory(); @@ -142,6 +151,21 @@ protected static Table createTable(int formatVersion) { "100000")); } + protected static Table createTableWithTimestampWithoutZone() { + return CATALOG_EXTENSION + .catalog() + .createTable( + TestFixtures.TABLE_IDENTIFIER, + 
SCHEMA_WITH_TIMESTAMP_WITHOUT_ZONE, + PartitionSpec.builderFor(SCHEMA_WITH_TIMESTAMP_WITHOUT_ZONE).identity("ts").build(), + null, + ImmutableMap.of( + TableProperties.FORMAT_VERSION, + "2", + "flink.max-continuous-empty-commits", + "100000")); + } + protected static Table createTableWithDelete() { return createTableWithDelete(2); } @@ -194,6 +218,20 @@ protected void insert(Table table, Integer id, String data, String extra) throws table.refresh(); } + protected void insertWithTimestampWithoutZone( + Table table, Integer id, String data, LocalDateTime ts) throws IOException { + GenericRecord record = GenericRecord.create(SCHEMA_WITH_TIMESTAMP_WITHOUT_ZONE); + record.setField("id", id); + record.setField("data", data); + record.setField("ts", ts); + long tsMicros = + TimeUnit.SECONDS.toMicros(ts.toEpochSecond(ZoneOffset.UTC)) + + TimeUnit.NANOSECONDS.toMicros(ts.getNano()); + new GenericAppenderHelper(table, FileFormat.PARQUET, warehouseDir) + .appendToTable(TestHelpers.Row.of(tsMicros), Lists.newArrayList(record)); + table.refresh(); + } + /** * For the same identifier column id this methods simulate the following row operations:

*
  • add an equality delete on oldData diff --git a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/RewriteUtil.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/RewriteUtil.java index 8a8a2fa194d4..7b8f638b7e2f 100644 --- a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/RewriteUtil.java +++ b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/RewriteUtil.java @@ -57,7 +57,7 @@ static List planDataFileRewrite( 11, 10_000_000L, rewriterOptions, - Expressions.alwaysTrue(), + Expressions::alwaysTrue, SnapshotRef.MAIN_BRANCH))) { testHarness.open(); diff --git a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewritePlanner.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewritePlanner.java index 16d524f05cf7..8300df8c94eb 100644 --- a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewritePlanner.java +++ b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewritePlanner.java @@ -24,6 +24,9 @@ import static org.apache.iceberg.flink.maintenance.operator.RewriteUtil.planDataFileRewrite; import static org.assertj.core.api.Assertions.assertThat; +import java.time.Duration; +import java.time.LocalDateTime; +import java.time.ZoneOffset; import java.util.List; import java.util.Set; import java.util.stream.Collectors; @@ -107,7 +110,7 @@ void testError() throws Exception { 11, 1L, ImmutableMap.of(MIN_INPUT_FILES, "2"), - Expressions.alwaysTrue(), + Expressions::alwaysTrue, SnapshotRef.MAIN_BRANCH))) { testHarness.open(); @@ -174,7 +177,7 @@ void testMaxRewriteBytes() throws Exception { 11, maxRewriteBytes, ImmutableMap.of(MIN_INPUT_FILES, "2"), - Expressions.alwaysTrue(), + Expressions::alwaysTrue, SnapshotRef.MAIN_BRANCH))) { testHarness.open(); @@ -228,7 +231,7 @@ void testBranch() throws 
Exception { 11, 10_000_000L, ImmutableMap.of(MIN_INPUT_FILES, "2"), - Expressions.alwaysTrue(), + Expressions::alwaysTrue, branchName))) { testHarness.open(); @@ -243,6 +246,46 @@ void testBranch() throws Exception { } } + @Test + void testFilterSupplierWithTimestamp() throws Exception { + Table table = createTableWithTimestampWithoutZone(); + + LocalDateTime oldTs = LocalDateTime.now().minusDays(10); + insertWithTimestampWithoutZone(table, 1, "old_a", oldTs); + insertWithTimestampWithoutZone(table, 2, "old_b", oldTs); + + LocalDateTime recentTs = LocalDateTime.now().minusHours(1); + insertWithTimestampWithoutZone(table, 3, "new_a", recentTs); + insertWithTimestampWithoutZone(table, 4, "new_b", recentTs); + + try (OneInputStreamOperatorTestHarness + testHarness = + ProcessFunctionTestHarnesses.forProcessFunction( + new DataFileRewritePlanner( + OperatorTestBase.DUMMY_TABLE_NAME, + OperatorTestBase.DUMMY_TABLE_NAME, + 0, + tableLoader(), + 11, + 10_000_000L, + ImmutableMap.of(MIN_INPUT_FILES, "2"), + () -> + Expressions.greaterThanOrEqual( + "ts", + LocalDateTime.now(ZoneOffset.UTC).minus(Duration.ofDays(3)).toString()), + SnapshotRef.MAIN_BRANCH))) { + testHarness.open(); + + trigger(testHarness); + + assertThat(testHarness.getSideOutput(TaskResultAggregator.ERROR_STREAM)).isNull(); + List planned = testHarness.extractOutputValues(); + + assertThat(planned).hasSize(1); + assertThat(planned.get(0).group().fileScanTasks()).hasSize(2); + } + } + void assertRewriteFileGroup( DataFileRewritePlanner.PlannedGroup plannedGroup, Table table, Set files) { assertThat(plannedGroup.table().currentSnapshot().snapshotId()) diff --git a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewriteRunner.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewriteRunner.java index 9202a1df92af..62b29e7c017a 100644 --- 
a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewriteRunner.java +++ b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewriteRunner.java @@ -309,7 +309,7 @@ void testSplitSize() throws Exception { "2", TARGET_FILE_SIZE_BYTES, String.valueOf(targetFileSize)), - Expressions.alwaysTrue(), + Expressions::alwaysTrue, SnapshotRef.MAIN_BRANCH))) { testHarness.open(); From b0f022ff29945fe70f51485a1f1cda3d35eed8ca Mon Sep 17 00:00:00 2001 From: GuoYu <511955993@qq.com> Date: Tue, 28 Apr 2026 04:52:26 +0800 Subject: [PATCH 116/197] Flink:Backport RewriteDataFile support dynamic filter (#16132) --- .../maintenance/api/RewriteDataFiles.java | 30 +++++++++-- .../operator/DataFileRewritePlanner.java | 9 ++-- .../maintenance/api/TestRewriteDataFiles.java | 52 +++++++++++++++++++ .../operator/OperatorTestBase.java | 38 ++++++++++++++ .../maintenance/operator/RewriteUtil.java | 2 +- .../operator/TestDataFileRewritePlanner.java | 49 +++++++++++++++-- .../operator/TestDataFileRewriteRunner.java | 2 +- .../maintenance/api/RewriteDataFiles.java | 30 +++++++++-- .../operator/DataFileRewritePlanner.java | 9 ++-- .../maintenance/api/TestRewriteDataFiles.java | 52 +++++++++++++++++++ .../operator/OperatorTestBase.java | 38 ++++++++++++++ .../maintenance/operator/RewriteUtil.java | 2 +- .../operator/TestDataFileRewritePlanner.java | 49 +++++++++++++++-- .../operator/TestDataFileRewriteRunner.java | 2 +- 14 files changed, 340 insertions(+), 24 deletions(-) diff --git a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/RewriteDataFiles.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/RewriteDataFiles.java index 9aeee75b1464..f03f33a3fd81 100644 --- a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/RewriteDataFiles.java +++ b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/RewriteDataFiles.java @@ -24,6 
+24,7 @@ import org.apache.flink.streaming.api.datastream.DataStream; import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator; import org.apache.flink.streaming.api.watermark.Watermark; +import org.apache.flink.util.function.SerializableSupplier; import org.apache.iceberg.SnapshotRef; import org.apache.iceberg.actions.BinPackRewriteFilePlanner; import org.apache.iceberg.actions.SizeBasedFileRewritePlanner; @@ -59,7 +60,7 @@ public static class Builder extends MaintenanceTaskBuilder rewriteOptions = Maps.newHashMapWithExpectedSize(6); private long maxRewriteBytes = Long.MAX_VALUE; - private Expression filter = Expressions.alwaysTrue(); + private SerializableSupplier filterSupplier = Expressions::alwaysTrue; private String branch = SnapshotRef.MAIN_BRANCH; @Override @@ -214,9 +215,32 @@ public Builder maxFilesToRewrite(int maxFilesToRewrite) { * * @param newFilter the filter expression to apply * @return this for method chaining + * @deprecated will be removed in 1.12.0. Use {@link #filter(SerializableSupplier)} instead */ + @Deprecated public Builder filter(Expression newFilter) { - this.filter = newFilter; + this.filterSupplier = () -> newFilter; + return this; + } + + /** + * A user-provided supplier of a filter expression that determines which files are considered by + * the rewrite strategy. + * + *

    The supplier is evaluated by the planner on every compaction trigger, allowing a fresh + * filter to be produced for each compaction run. + * + *

    This is particularly useful for time-relative filters. For example, a supplier such as + * {@code () -> Expressions.greaterThanOrEqual("ts", + * LocalDateTime.now(ZoneOffset.UTC).minus(Duration.ofDays(3)).toString())} ensures that each + * compaction rewrites files from the last 3 days relative to the time the compaction is + * planned, rather than relative to when the job was started. + * + * @param newFilterSupplier the supplier providing the filter expression to apply + * @return this for method chaining + */ + public Builder filter(SerializableSupplier newFilterSupplier) { + this.filterSupplier = newFilterSupplier; return this; } @@ -276,7 +300,7 @@ DataStream append(DataStream trigger) { partialProgressEnabled ? partialProgressMaxCommits : 1, maxRewriteBytes, rewriteOptions, - filter, + filterSupplier, branch)) .name(operatorName(PLANNER_TASK_NAME)) .uid(PLANNER_TASK_NAME + uidSuffix()) diff --git a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DataFileRewritePlanner.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DataFileRewritePlanner.java index feb2dd26c807..9c3b44b9d544 100644 --- a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DataFileRewritePlanner.java +++ b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DataFileRewritePlanner.java @@ -26,6 +26,7 @@ import org.apache.flink.metrics.Counter; import org.apache.flink.streaming.api.functions.ProcessFunction; import org.apache.flink.util.Collector; +import org.apache.flink.util.function.SerializableSupplier; import org.apache.iceberg.DataFile; import org.apache.iceberg.FileScanTask; import org.apache.iceberg.SerializableTable; @@ -62,8 +63,8 @@ public class DataFileRewritePlanner private final long maxRewriteBytes; private final Map rewriterOptions; private transient Counter errorCounter; - private final Expression filter; private final String branch; + private final 
SerializableSupplier filterSupplier; public DataFileRewritePlanner( String tableName, @@ -73,7 +74,7 @@ public DataFileRewritePlanner( int newPartialProgressMaxCommits, long maxRewriteBytes, Map rewriterOptions, - Expression filter, + SerializableSupplier filterSupplier, String branch) { Preconditions.checkNotNull(tableName, "Table name should no be null"); @@ -89,8 +90,8 @@ public DataFileRewritePlanner( this.partialProgressMaxCommits = newPartialProgressMaxCommits; this.maxRewriteBytes = maxRewriteBytes; this.rewriterOptions = rewriterOptions; - this.filter = filter; this.branch = branch; + this.filterSupplier = filterSupplier; } @Override @@ -125,7 +126,7 @@ public void processElement(Trigger value, Context ctx, Collector o } BinPackRewriteFilePlanner planner = - new BinPackRewriteFilePlanner(table, filter, snapshot.snapshotId(), false); + new BinPackRewriteFilePlanner(table, filterSupplier.get(), snapshot.snapshotId(), false); planner.init(rewriterOptions); FileRewritePlan diff --git a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestRewriteDataFiles.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestRewriteDataFiles.java index bb53b5265655..97b8b6786545 100644 --- a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestRewriteDataFiles.java +++ b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestRewriteDataFiles.java @@ -29,6 +29,7 @@ import static org.apache.iceberg.flink.maintenance.operator.TableMaintenanceMetrics.REMOVED_DATA_FILE_SIZE_METRIC; import static org.assertj.core.api.Assertions.assertThat; +import java.time.Instant; import java.util.List; import java.util.stream.StreamSupport; import org.apache.flink.streaming.api.graph.StreamGraphGenerator; @@ -529,6 +530,57 @@ void testRewriteWithFilter() throws Exception { createRecord(4, "d"))); } + /** + * By verifying that the creation time of the data content in the builder is later than the + * 
creation time of the filter condition — if the filter condition is actually created in the + * planner, then all files can be compacted; otherwise, not all files can be compacted — we can + * confirm whether the filter condition is actually created in the planner. + */ + @Test + void testRewriteWithFilterSupplier() throws Exception { + Table table = createTable(); + + appendRewriteDataFiles( + RewriteDataFiles.builder() + .parallelism(2) + .deleteFileThreshold(10) + .targetFileSizeBytes(1_000_000L) + .maxFileGroupSizeBytes(10_000_000L) + .maxFileSizeBytes(2_000_000L) + .minFileSizeBytes(500_000L) + .minInputFiles(2) + // Rewrite data files where id is less than current timestamp in planner + .filter(() -> Expressions.lessThan("id", (int) Instant.now().getEpochSecond())) + .partialProgressEnabled(true) + .partialProgressMaxCommits(1) + .maxRewriteBytes(100_000L) + .rewriteAll(false)); + + insert(table, 1, "a"); + insert(table, 2, "b"); + insert(table, 3, "c"); + + int epochSecond = (int) Instant.now().getEpochSecond(); + insert(table, epochSecond, "d"); + + assertFileNum(table, 4, 0); + + Thread.sleep(1_000L); + runAndWaitForSuccess(infra.env(), infra.source(), infra.sink()); + + // There is four files, only id is less than current timestamp will be rewritten. so expect 2 + // files. 
+ assertFileNum(table, 1, 0); + + SimpleDataUtil.assertTableRecords( + table, + ImmutableList.of( + createRecord(1, "a"), + createRecord(2, "b"), + createRecord(3, "c"), + createRecord(epochSecond, "d"))); + } + @Test void testBranch() throws Exception { Table table = createTable(); diff --git a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/OperatorTestBase.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/OperatorTestBase.java index 5eecc5a803d3..93291e8cc29a 100644 --- a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/OperatorTestBase.java +++ b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/OperatorTestBase.java @@ -24,7 +24,10 @@ import java.io.File; import java.io.IOException; import java.nio.file.Path; +import java.time.LocalDateTime; +import java.time.ZoneOffset; import java.util.List; +import java.util.concurrent.TimeUnit; import org.apache.flink.configuration.Configuration; import org.apache.flink.configuration.MetricOptions; import org.apache.flink.core.execution.JobClient; @@ -79,6 +82,12 @@ public class OperatorTestBase { ImmutableMap.of(), ImmutableSet.of(SimpleDataUtil.SCHEMA.columns().get(0).fieldId())); + private static final Schema SCHEMA_WITH_TIMESTAMP_WITHOUT_ZONE = + new Schema( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get()), + Types.NestedField.optional(3, "ts", Types.TimestampType.withoutZone())); + protected static final String UID_SUFFIX = "UID-Dummy"; protected static final String SLOT_SHARING_GROUP = "SlotSharingGroup"; protected static final TriggerLockFactory LOCK_FACTORY = new MemoryLockFactory(); @@ -142,6 +151,21 @@ protected static Table createTable(int formatVersion) { "100000")); } + protected static Table createTableWithTimestampWithoutZone() { + return CATALOG_EXTENSION + .catalog() + .createTable( + 
TestFixtures.TABLE_IDENTIFIER, + SCHEMA_WITH_TIMESTAMP_WITHOUT_ZONE, + PartitionSpec.builderFor(SCHEMA_WITH_TIMESTAMP_WITHOUT_ZONE).identity("ts").build(), + null, + ImmutableMap.of( + TableProperties.FORMAT_VERSION, + "2", + "flink.max-continuous-empty-commits", + "100000")); + } + protected static Table createTableWithDelete() { return createTableWithDelete(2); } @@ -194,6 +218,20 @@ protected void insert(Table table, Integer id, String data, String extra) throws table.refresh(); } + protected void insertWithTimestampWithoutZone( + Table table, Integer id, String data, LocalDateTime ts) throws IOException { + GenericRecord record = GenericRecord.create(SCHEMA_WITH_TIMESTAMP_WITHOUT_ZONE); + record.setField("id", id); + record.setField("data", data); + record.setField("ts", ts); + long tsMicros = + TimeUnit.SECONDS.toMicros(ts.toEpochSecond(ZoneOffset.UTC)) + + TimeUnit.NANOSECONDS.toMicros(ts.getNano()); + new GenericAppenderHelper(table, FileFormat.PARQUET, warehouseDir) + .appendToTable(TestHelpers.Row.of(tsMicros), Lists.newArrayList(record)); + table.refresh(); + } + /** * For the same identifier column id this methods simulate the following row operations:

  • *
  • add an equality delete on oldData diff --git a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/RewriteUtil.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/RewriteUtil.java index 8a8a2fa194d4..7b8f638b7e2f 100644 --- a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/RewriteUtil.java +++ b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/RewriteUtil.java @@ -57,7 +57,7 @@ static List planDataFileRewrite( 11, 10_000_000L, rewriterOptions, - Expressions.alwaysTrue(), + Expressions::alwaysTrue, SnapshotRef.MAIN_BRANCH))) { testHarness.open(); diff --git a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewritePlanner.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewritePlanner.java index 16d524f05cf7..8300df8c94eb 100644 --- a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewritePlanner.java +++ b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewritePlanner.java @@ -24,6 +24,9 @@ import static org.apache.iceberg.flink.maintenance.operator.RewriteUtil.planDataFileRewrite; import static org.assertj.core.api.Assertions.assertThat; +import java.time.Duration; +import java.time.LocalDateTime; +import java.time.ZoneOffset; import java.util.List; import java.util.Set; import java.util.stream.Collectors; @@ -107,7 +110,7 @@ void testError() throws Exception { 11, 1L, ImmutableMap.of(MIN_INPUT_FILES, "2"), - Expressions.alwaysTrue(), + Expressions::alwaysTrue, SnapshotRef.MAIN_BRANCH))) { testHarness.open(); @@ -174,7 +177,7 @@ void testMaxRewriteBytes() throws Exception { 11, maxRewriteBytes, ImmutableMap.of(MIN_INPUT_FILES, "2"), - Expressions.alwaysTrue(), + Expressions::alwaysTrue, SnapshotRef.MAIN_BRANCH))) { testHarness.open(); @@ -228,7 +231,7 @@ void testBranch() 
throws Exception { 11, 10_000_000L, ImmutableMap.of(MIN_INPUT_FILES, "2"), - Expressions.alwaysTrue(), + Expressions::alwaysTrue, branchName))) { testHarness.open(); @@ -243,6 +246,46 @@ void testBranch() throws Exception { } } + @Test + void testFilterSupplierWithTimestamp() throws Exception { + Table table = createTableWithTimestampWithoutZone(); + + LocalDateTime oldTs = LocalDateTime.now().minusDays(10); + insertWithTimestampWithoutZone(table, 1, "old_a", oldTs); + insertWithTimestampWithoutZone(table, 2, "old_b", oldTs); + + LocalDateTime recentTs = LocalDateTime.now().minusHours(1); + insertWithTimestampWithoutZone(table, 3, "new_a", recentTs); + insertWithTimestampWithoutZone(table, 4, "new_b", recentTs); + + try (OneInputStreamOperatorTestHarness + testHarness = + ProcessFunctionTestHarnesses.forProcessFunction( + new DataFileRewritePlanner( + OperatorTestBase.DUMMY_TABLE_NAME, + OperatorTestBase.DUMMY_TABLE_NAME, + 0, + tableLoader(), + 11, + 10_000_000L, + ImmutableMap.of(MIN_INPUT_FILES, "2"), + () -> + Expressions.greaterThanOrEqual( + "ts", + LocalDateTime.now(ZoneOffset.UTC).minus(Duration.ofDays(3)).toString()), + SnapshotRef.MAIN_BRANCH))) { + testHarness.open(); + + trigger(testHarness); + + assertThat(testHarness.getSideOutput(TaskResultAggregator.ERROR_STREAM)).isNull(); + List planned = testHarness.extractOutputValues(); + + assertThat(planned).hasSize(1); + assertThat(planned.get(0).group().fileScanTasks()).hasSize(2); + } + } + void assertRewriteFileGroup( DataFileRewritePlanner.PlannedGroup plannedGroup, Table table, Set files) { assertThat(plannedGroup.table().currentSnapshot().snapshotId()) diff --git a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewriteRunner.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewriteRunner.java index 9202a1df92af..62b29e7c017a 100644 --- 
a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewriteRunner.java +++ b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewriteRunner.java @@ -309,7 +309,7 @@ void testSplitSize() throws Exception { "2", TARGET_FILE_SIZE_BYTES, String.valueOf(targetFileSize)), - Expressions.alwaysTrue(), + Expressions::alwaysTrue, SnapshotRef.MAIN_BRANCH))) { testHarness.open(); diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/RewriteDataFiles.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/RewriteDataFiles.java index 9aeee75b1464..f03f33a3fd81 100644 --- a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/RewriteDataFiles.java +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/RewriteDataFiles.java @@ -24,6 +24,7 @@ import org.apache.flink.streaming.api.datastream.DataStream; import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator; import org.apache.flink.streaming.api.watermark.Watermark; +import org.apache.flink.util.function.SerializableSupplier; import org.apache.iceberg.SnapshotRef; import org.apache.iceberg.actions.BinPackRewriteFilePlanner; import org.apache.iceberg.actions.SizeBasedFileRewritePlanner; @@ -59,7 +60,7 @@ public static class Builder extends MaintenanceTaskBuilder rewriteOptions = Maps.newHashMapWithExpectedSize(6); private long maxRewriteBytes = Long.MAX_VALUE; - private Expression filter = Expressions.alwaysTrue(); + private SerializableSupplier filterSupplier = Expressions::alwaysTrue; private String branch = SnapshotRef.MAIN_BRANCH; @Override @@ -214,9 +215,32 @@ public Builder maxFilesToRewrite(int maxFilesToRewrite) { * * @param newFilter the filter expression to apply * @return this for method chaining + * @deprecated will be removed in 1.12.0. 
Use {@link #filter(SerializableSupplier)} instead */ + @Deprecated public Builder filter(Expression newFilter) { - this.filter = newFilter; + this.filterSupplier = () -> newFilter; + return this; + } + + /** + * A user-provided supplier of a filter expression that determines which files are considered by + * the rewrite strategy. + * + *

    The supplier is evaluated by the planner on every compaction trigger, allowing a fresh + * filter to be produced for each compaction run. + * + *

    This is particularly useful for time-relative filters. For example, a supplier such as + * {@code () -> Expressions.greaterThanOrEqual("ts", + * LocalDateTime.now(ZoneOffset.UTC).minus(Duration.ofDays(3)).toString())} ensures that each + * compaction rewrites files from the last 3 days relative to the time the compaction is + * planned, rather than relative to when the job was started. + * + * @param newFilterSupplier the supplier providing the filter expression to apply + * @return this for method chaining + */ + public Builder filter(SerializableSupplier newFilterSupplier) { + this.filterSupplier = newFilterSupplier; return this; } @@ -276,7 +300,7 @@ DataStream append(DataStream trigger) { partialProgressEnabled ? partialProgressMaxCommits : 1, maxRewriteBytes, rewriteOptions, - filter, + filterSupplier, branch)) .name(operatorName(PLANNER_TASK_NAME)) .uid(PLANNER_TASK_NAME + uidSuffix()) diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DataFileRewritePlanner.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DataFileRewritePlanner.java index a9360374df28..b78c602c647f 100644 --- a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DataFileRewritePlanner.java +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/operator/DataFileRewritePlanner.java @@ -26,6 +26,7 @@ import org.apache.flink.metrics.Counter; import org.apache.flink.streaming.api.functions.ProcessFunction; import org.apache.flink.util.Collector; +import org.apache.flink.util.function.SerializableSupplier; import org.apache.iceberg.DataFile; import org.apache.iceberg.FileScanTask; import org.apache.iceberg.SerializableTable; @@ -62,8 +63,8 @@ public class DataFileRewritePlanner private final long maxRewriteBytes; private final Map rewriterOptions; private transient Counter errorCounter; - private final Expression filter; private final String branch; + private final 
SerializableSupplier filterSupplier; public DataFileRewritePlanner( String tableName, @@ -73,7 +74,7 @@ public DataFileRewritePlanner( int newPartialProgressMaxCommits, long maxRewriteBytes, Map rewriterOptions, - Expression filter, + SerializableSupplier filterSupplier, String branch) { Preconditions.checkNotNull(tableName, "Table name should no be null"); @@ -89,8 +90,8 @@ public DataFileRewritePlanner( this.partialProgressMaxCommits = newPartialProgressMaxCommits; this.maxRewriteBytes = maxRewriteBytes; this.rewriterOptions = rewriterOptions; - this.filter = filter; this.branch = branch; + this.filterSupplier = filterSupplier; } @Override @@ -125,7 +126,7 @@ public void processElement(Trigger value, Context ctx, Collector o } BinPackRewriteFilePlanner planner = - new BinPackRewriteFilePlanner(table, filter, snapshot.snapshotId(), false); + new BinPackRewriteFilePlanner(table, filterSupplier.get(), snapshot.snapshotId(), false); planner.init(rewriterOptions); FileRewritePlan diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestRewriteDataFiles.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestRewriteDataFiles.java index bb53b5265655..97b8b6786545 100644 --- a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestRewriteDataFiles.java +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestRewriteDataFiles.java @@ -29,6 +29,7 @@ import static org.apache.iceberg.flink.maintenance.operator.TableMaintenanceMetrics.REMOVED_DATA_FILE_SIZE_METRIC; import static org.assertj.core.api.Assertions.assertThat; +import java.time.Instant; import java.util.List; import java.util.stream.StreamSupport; import org.apache.flink.streaming.api.graph.StreamGraphGenerator; @@ -529,6 +530,57 @@ void testRewriteWithFilter() throws Exception { createRecord(4, "d"))); } + /** + * By verifying that the creation time of the data content in the builder is later than the + * 
creation time of the filter condition — if the filter condition is actually created in the + * planner, then all files can be compacted; otherwise, not all files can be compacted — we can + * confirm whether the filter condition is actually created in the planner. + */ + @Test + void testRewriteWithFilterSupplier() throws Exception { + Table table = createTable(); + + appendRewriteDataFiles( + RewriteDataFiles.builder() + .parallelism(2) + .deleteFileThreshold(10) + .targetFileSizeBytes(1_000_000L) + .maxFileGroupSizeBytes(10_000_000L) + .maxFileSizeBytes(2_000_000L) + .minFileSizeBytes(500_000L) + .minInputFiles(2) + // Rewrite data files where id is less than current timestamp in planner + .filter(() -> Expressions.lessThan("id", (int) Instant.now().getEpochSecond())) + .partialProgressEnabled(true) + .partialProgressMaxCommits(1) + .maxRewriteBytes(100_000L) + .rewriteAll(false)); + + insert(table, 1, "a"); + insert(table, 2, "b"); + insert(table, 3, "c"); + + int epochSecond = (int) Instant.now().getEpochSecond(); + insert(table, epochSecond, "d"); + + assertFileNum(table, 4, 0); + + Thread.sleep(1_000L); + runAndWaitForSuccess(infra.env(), infra.source(), infra.sink()); + + // There are four files and every id is less than the timestamp evaluated in the planner, so + // all of them are expected to be rewritten.
+ assertFileNum(table, 1, 0); + + SimpleDataUtil.assertTableRecords( + table, + ImmutableList.of( + createRecord(1, "a"), + createRecord(2, "b"), + createRecord(3, "c"), + createRecord(epochSecond, "d"))); + } + @Test void testBranch() throws Exception { Table table = createTable(); diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/OperatorTestBase.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/OperatorTestBase.java index b9422a63d646..6dd6cda84f27 100644 --- a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/OperatorTestBase.java +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/OperatorTestBase.java @@ -24,7 +24,10 @@ import java.io.File; import java.io.IOException; import java.nio.file.Path; +import java.time.LocalDateTime; +import java.time.ZoneOffset; import java.util.List; +import java.util.concurrent.TimeUnit; import javax.annotation.Nullable; import org.apache.flink.configuration.Configuration; import org.apache.flink.configuration.MetricOptions; @@ -79,6 +82,12 @@ public class OperatorTestBase { ImmutableMap.of(), ImmutableSet.of(SimpleDataUtil.SCHEMA.columns().get(0).fieldId())); + private static final Schema SCHEMA_WITH_TIMESTAMP_WITHOUT_ZONE = + new Schema( + Types.NestedField.optional(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get()), + Types.NestedField.optional(3, "ts", Types.TimestampType.withoutZone())); + protected static final String UID_SUFFIX = "UID-Dummy"; protected static final String SLOT_SHARING_GROUP = "SlotSharingGroup"; protected static final TriggerLockFactory LOCK_FACTORY = new MemoryLockFactory(); @@ -142,6 +151,21 @@ protected static Table createTable(int formatVersion) { "100000")); } + protected static Table createTableWithTimestampWithoutZone() { + return CATALOG_EXTENSION + .catalog() + .createTable( + TestFixtures.TABLE_IDENTIFIER, + 
SCHEMA_WITH_TIMESTAMP_WITHOUT_ZONE, + PartitionSpec.builderFor(SCHEMA_WITH_TIMESTAMP_WITHOUT_ZONE).identity("ts").build(), + null, + ImmutableMap.of( + TableProperties.FORMAT_VERSION, + "2", + "flink.max-continuous-empty-commits", + "100000")); + } + protected static Table createTableWithDelete() { return createTableWithDelete(2); } @@ -194,6 +218,20 @@ protected void insert(Table table, Integer id, String data, String extra) throws table.refresh(); } + protected void insertWithTimestampWithoutZone( + Table table, Integer id, String data, LocalDateTime ts) throws IOException { + GenericRecord record = GenericRecord.create(SCHEMA_WITH_TIMESTAMP_WITHOUT_ZONE); + record.setField("id", id); + record.setField("data", data); + record.setField("ts", ts); + long tsMicros = + TimeUnit.SECONDS.toMicros(ts.toEpochSecond(ZoneOffset.UTC)) + + TimeUnit.NANOSECONDS.toMicros(ts.getNano()); + new GenericAppenderHelper(table, FileFormat.PARQUET, warehouseDir) + .appendToTable(TestHelpers.Row.of(tsMicros), Lists.newArrayList(record)); + table.refresh(); + } + /** * For the same identifier column id this methods simulate the following row operations:

  • *
  • add an equality delete on oldData diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/RewriteUtil.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/RewriteUtil.java index 8a8a2fa194d4..7b8f638b7e2f 100644 --- a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/RewriteUtil.java +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/RewriteUtil.java @@ -57,7 +57,7 @@ static List planDataFileRewrite( 11, 10_000_000L, rewriterOptions, - Expressions.alwaysTrue(), + Expressions::alwaysTrue, SnapshotRef.MAIN_BRANCH))) { testHarness.open(); diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewritePlanner.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewritePlanner.java index 16d524f05cf7..8300df8c94eb 100644 --- a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewritePlanner.java +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewritePlanner.java @@ -24,6 +24,9 @@ import static org.apache.iceberg.flink.maintenance.operator.RewriteUtil.planDataFileRewrite; import static org.assertj.core.api.Assertions.assertThat; +import java.time.Duration; +import java.time.LocalDateTime; +import java.time.ZoneOffset; import java.util.List; import java.util.Set; import java.util.stream.Collectors; @@ -107,7 +110,7 @@ void testError() throws Exception { 11, 1L, ImmutableMap.of(MIN_INPUT_FILES, "2"), - Expressions.alwaysTrue(), + Expressions::alwaysTrue, SnapshotRef.MAIN_BRANCH))) { testHarness.open(); @@ -174,7 +177,7 @@ void testMaxRewriteBytes() throws Exception { 11, maxRewriteBytes, ImmutableMap.of(MIN_INPUT_FILES, "2"), - Expressions.alwaysTrue(), + Expressions::alwaysTrue, SnapshotRef.MAIN_BRANCH))) { testHarness.open(); @@ -228,7 +231,7 @@ void testBranch() throws 
Exception { 11, 10_000_000L, ImmutableMap.of(MIN_INPUT_FILES, "2"), - Expressions.alwaysTrue(), + Expressions::alwaysTrue, branchName))) { testHarness.open(); @@ -243,6 +246,46 @@ void testBranch() throws Exception { } } + @Test + void testFilterSupplierWithTimestamp() throws Exception { + Table table = createTableWithTimestampWithoutZone(); + + LocalDateTime oldTs = LocalDateTime.now().minusDays(10); + insertWithTimestampWithoutZone(table, 1, "old_a", oldTs); + insertWithTimestampWithoutZone(table, 2, "old_b", oldTs); + + LocalDateTime recentTs = LocalDateTime.now().minusHours(1); + insertWithTimestampWithoutZone(table, 3, "new_a", recentTs); + insertWithTimestampWithoutZone(table, 4, "new_b", recentTs); + + try (OneInputStreamOperatorTestHarness + testHarness = + ProcessFunctionTestHarnesses.forProcessFunction( + new DataFileRewritePlanner( + OperatorTestBase.DUMMY_TABLE_NAME, + OperatorTestBase.DUMMY_TABLE_NAME, + 0, + tableLoader(), + 11, + 10_000_000L, + ImmutableMap.of(MIN_INPUT_FILES, "2"), + () -> + Expressions.greaterThanOrEqual( + "ts", + LocalDateTime.now(ZoneOffset.UTC).minus(Duration.ofDays(3)).toString()), + SnapshotRef.MAIN_BRANCH))) { + testHarness.open(); + + trigger(testHarness); + + assertThat(testHarness.getSideOutput(TaskResultAggregator.ERROR_STREAM)).isNull(); + List planned = testHarness.extractOutputValues(); + + assertThat(planned).hasSize(1); + assertThat(planned.get(0).group().fileScanTasks()).hasSize(2); + } + } + void assertRewriteFileGroup( DataFileRewritePlanner.PlannedGroup plannedGroup, Table table, Set files) { assertThat(plannedGroup.table().currentSnapshot().snapshotId()) diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewriteRunner.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewriteRunner.java index 9202a1df92af..62b29e7c017a 100644 --- 
a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewriteRunner.java +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/TestDataFileRewriteRunner.java @@ -309,7 +309,7 @@ void testSplitSize() throws Exception { "2", TARGET_FILE_SIZE_BYTES, String.valueOf(targetFileSize)), - Expressions.alwaysTrue(), + Expressions::alwaysTrue, SnapshotRef.MAIN_BRANCH))) { testHarness.open(); From 4e118e3ca2e5a19d42797bb2518eabefcccbd201 Mon Sep 17 00:00:00 2001 From: Ryan Blue Date: Mon, 27 Apr 2026 17:54:17 -0700 Subject: [PATCH 117/197] Spark 4.1: Update LICENSE and NOTICE for 1.11. (#16104) * Spark 4.1: Update LICENSE and NOTICE for 1.11. * Spark 4.1: Fix accidental merge of Commons and HttpComponents. * Spark 4.1: Update LICENSE to include ORC bundled deps. --- spark/v4.1/spark-runtime/LICENSE | 276 +++---------------------------- spark/v4.1/spark-runtime/NOTICE | 101 ----------- 2 files changed, 20 insertions(+), 357 deletions(-) diff --git a/spark/v4.1/spark-runtime/LICENSE b/spark/v4.1/spark-runtime/LICENSE index a67296eb412c..24a9e3706d17 100644 --- a/spark/v4.1/spark-runtime/LICENSE +++ b/spark/v4.1/spark-runtime/LICENSE @@ -227,7 +227,7 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Apache Thrift. +This product bundles Apache Thrift (bundled by Parquet). Copyright: 2006-2017 The Apache Software Foundation. Project URL: https://thrift.apache.org/ @@ -235,7 +235,7 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product includes code from Daniel Lemire's JavaFastPFOR project. +This product includes code from Daniel Lemire's JavaFastPFOR project (bundled by Parquet). 
Copyright: 2013 Daniel Lemire Project URL: https://github.com/lemire/JavaFastPFOR @@ -243,7 +243,7 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles fastutil. +This product bundles fastutil (bundled by Parquet). Copyright: 2002-2014 Sebastiano Vigna Project URL: http://fastutil.di.unimi.it/ @@ -251,6 +251,13 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- +This product bundles Zero-Allocation Hashing (bundled by Parquet). + +Project URL: https://github.com/OpenHFT/Zero-Allocation-Hashing +License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + This product bundles Apache ORC. Copyright: 2013 and onwards The Apache Software Foundation. @@ -259,7 +266,7 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Apache Hive. +This product bundles Apache Hive's Storage API (bundled by ORC). Copyright: 2008-2020 The Apache Software Foundation Project URL: https://hive.apache.org/ @@ -267,11 +274,12 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Google protobuf. +This product bundles Google protobuf (bundled by ORC). Copyright: 2008 Google Inc. Project URL: https://developers.google.com/protocol-buffers License: BSD 3-Clause + | Copyright 2008 Google Inc. All rights reserved. 
| | Redistribution and use in source and binary forms, with or without @@ -390,20 +398,6 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Perfmark. - -Project URL: https://github.com/perfmark/perfmark -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles org.json. - -Project URL: https://github.com/douglascrockford/JSON-java -License: Public Domain - https://github.com/stleary/JSON-java/blob/master/LICENSE - --------------------------------------------------------------------------------- - This product bundles Apache Arrow. Copyright: 2016-2019 The Apache Software Foundation. @@ -428,42 +422,6 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles ThreeTen BP. - -Project URL: https://www.threeten.org/threetenbp -License: BSD 3-Clause -| Copyright (c) 2007-present, Stephen Colebourne & Michael Nascimento Santos. -| -| All rights reserved. -| -| Redistribution and use in source and binary forms, with or without -| modification, are permitted provided that the following conditions are met: -| -| * Redistributions of source code must retain the above copyright notice, -| this list of conditions and the following disclaimer. -| -| * Redistributions in binary form must reproduce the above copyright notice, -| this list of conditions and the following disclaimer in the documentation -| and/or other materials provided with the distribution. -| -| * Neither the name of JSR-310 nor the names of its contributors -| may be used to endorse or promote products derived from this software -| without specific prior written permission. 
-| -| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -| CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -| EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -| PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -| PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -| LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -| NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -| SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - --------------------------------------------------------------------------------- - This product bundles ThreeTen Extra. Copyright: 2007-present, Stephen Colebourne & Michael Nascimento Santos. @@ -540,19 +498,11 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Apache HttpComponents (client and core). - -Copyright: 1999-2022 The Apache Software Foundation. -Project URL: https://hc.apache.org/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product includes code from Apache HttpComponents Client. +This product bundles and includes code from Apache HttpComponents (core/client). * retry and error handling logic in ExponentialHttpRequestRetryStrategy.java -Copyright: 1999-2022 The Apache Software Foundation. 
+Copyright: 1999-2022 The Apache Software Foundation Project URL: https://hc.apache.org/ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 @@ -573,16 +523,16 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Apache Datasketches. +This product bundles Eclipse Collections. -Project URL: https://datasketches.apache.org -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 +Project URL: https://github.com/eclipse-collections/eclipse-collections +License: Eclipse Public License v. 1.0 - https://www.eclipse.org/legal/epl-v10.html -------------------------------------------------------------------------------- -This product bundles Zero-Allocation Hashing. +This product bundles Apache Datasketches. -Project URL: https://github.com/OpenHFT/Zero-Allocation-Hashing +Project URL: https://datasketches.apache.org License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 -------------------------------------------------------------------------------- @@ -594,195 +544,9 @@ License: Eclipse Public License v. 2.0 - https://www.eclipse.org/org/documents/e -------------------------------------------------------------------------------- -This product bundles JSpecify. - -Project URL: https://github.com/jspecify/jspecify -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Animal Sniffer Annotations. - -License: MIT -| The MIT License -| -| Copyright (c) 2009 codehaus.org. 
-| -| Permission is hereby granted, free of charge, to any person obtaining a copy -| of this software and associated documentation files (the "Software"), to deal -| in the Software without restriction, including without limitation the rights -| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -| copies of the Software, and to permit persons to whom the Software is -| furnished to do so, subject to the following conditions: -| -| The above copyright notice and this permission notice shall be included in -| all copies or substantial portions of the Software. -| -| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -| THE SOFTWARE. - --------------------------------------------------------------------------------- - -This product bundles Android Annotations. - -Project URL: http://source.android.com/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Conscrypt (openjdk-uber). - -Project URL: https://conscrypt.org/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Google Cloud BigQuery Client for Java. 
- -Project URL: https://github.com/googleapis/java-bigquery -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Google Cloud Java Client Libraries. - -Project URL: https://github.com/googleapis/google-cloud-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - This product bundles RoaringBitmap. Copyright: (c) 2013-... the RoaringBitmap authors Project URL: https://github.com/RoaringBitmap/RoaringBitmap License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 --------------------------------------------------------------------------------- - -This product bundles Alibaba Cloud Credentials for Java. - -Project URL: https://github.com/aliyun/credentials-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Alibaba Cloud Tea for Java. - -Project URL: https://github.com/aliyun/tea-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Google GAX. - -Project URL: https://github.com/googleapis/gax-java -License: BSD 3-Clause -| Copyright 2016, Google Inc. All rights reserved. -| -| Redistribution and use in source and binary forms, with or without -| modification, are permitted provided that the following conditions are -| met: -| -| * Redistributions of source code must retain the above copyright -| notice, this list of conditions and the following disclaimer. 
-| * Redistributions in binary form must reproduce the above -| copyright notice, this list of conditions and the following disclaimer -| in the documentation and/or other materials provided with the -| distribution. -| * Neither the name of Google Inc. nor the names of its -| contributors may be used to endorse or promote products derived from -| this software without specific prior written permission. -| -| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -| OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -| SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -| LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -| DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -| THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -| (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - --------------------------------------------------------------------------------- - -This product bundles Google Gson. - -Project URL: https://github.com/google/gson/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles gRPC. - -Project URL: https://github.com/grpc/grpc-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles OpenCensus. 
- -Project URL: https://github.com/census-instrumentation/opencensus-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles OkHttp. - -Project URL: https://github.com/square/okhttp -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Google Auto Value. - -Project URL: https://github.com/google/auto/tree/master/value -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles JaCoCo. - -Project URL: https://github.com/jacoco/jacoco/ -License: Eclipse Public License v. 2.0 - https://www.eclipse.org/org/documents/epl-2.0/EPL-2.0.txt - --------------------------------------------------------------------------------- - -This product bundles OpenTelemetry. - -Project URL: https://opentelemetry.io/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Kotlin. - -Project URL: https://github.com/JetBrains/kotlin -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles JAXB. - -Project URL: http://jaxb.java.net/ -License: CDDL 1.1 - https://glassfish.java.net/public/CDDL+GPL_1_1.html - --------------------------------------------------------------------------------- - -This product bundles EMMA runtime. 
- -Project URL: https://github.com/ehelms/Emma/ -License: Common Public License - v 1.0 - --------------------------------------------------------------------------------- - -This product bundles Google j2objc. - -Project URL: https://github.com/google/j2objc/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 diff --git a/spark/v4.1/spark-runtime/NOTICE b/spark/v4.1/spark-runtime/NOTICE index 68abd73906b1..17989b43a371 100644 --- a/spark/v4.1/spark-runtime/NOTICE +++ b/spark/v4.1/spark-runtime/NOTICE @@ -66,42 +66,6 @@ This product bundles Airlift Aircompressor with the following in its NOTICE file -------------------------------------------------------------------------------- -This product bundles Google Protobuf with the following in its NOTICE file: -| Copyright 2008 Google Inc. All rights reserved. -| -| Redistribution and use in source and binary forms, with or without -| modification, are permitted provided that the following conditions are -| met: -| -| * Redistributions of source code must retain the above copyright -| notice, this list of conditions and the following disclaimer. -| * Redistributions in binary form must reproduce the above -| copyright notice, this list of conditions and the following disclaimer -| in the documentation and/or other materials provided with the -| distribution. -| * Neither the name of Google Inc. nor the names of its -| contributors may be used to endorse or promote products derived from -| this software without specific prior written permission. -| -| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -| A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -| OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -| SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -| LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -| DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -| THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -| (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -| -| Code generated by the Protocol Buffer compiler is owned by the owner -| of the input file used when generating it. This code is not -| standalone and requires a support library to be linked with it. This -| support library is itself covered by the above license. - --------------------------------------------------------------------------------- - This product bundles Netty with the following in its NOTICE file: | The Netty Project | ================= @@ -392,68 +356,3 @@ This product bundles Eclipse MicroProfile OpenAPI with the following in its NOTI | Arthur De Magalhaes arthurdm@ca.ibm.com | --------------------------------------------------------------------------------- - -This product bundles gRPC with the following in its NOTICE file: -| Copyright 2014 The gRPC Authors -| -| Licensed under the Apache License, Version 2.0 (the "License"); -| you may not use this file except in compliance with the License. -| You may obtain a copy of the License at -| -| http://www.apache.org/licenses/LICENSE-2.0 -| -| Unless required by applicable law or agreed to in writing, software -| distributed under the License is distributed on an "AS IS" BASIS, -| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -| See the License for the specific language governing permissions and -| limitations under the License. 
-| -| ----------------------------------------------------------------------- -| -| This product contains a modified portion of 'OkHttp', an open source -| HTTP & SPDY client for Android and Java applications, which can be obtained -| at: -| -| * LICENSE: -| * okhttp/third_party/okhttp/LICENSE (Apache License 2.0) -| * HOMEPAGE: -| * https://github.com/square/okhttp -| * LOCATION_IN_GRPC: -| * okhttp/third_party/okhttp -| -| This product contains a modified portion of 'Envoy', an open source -| cloud-native high-performance edge/middle/service proxy, which can be -| obtained at: -| -| * LICENSE: -| * xds/third_party/envoy/LICENSE (Apache License 2.0) -| * NOTICE: -| * xds/third_party/envoy/NOTICE -| * HOMEPAGE: -| * https://www.envoyproxy.io -| * LOCATION_IN_GRPC: -| * xds/third_party/envoy -| -| This product contains a modified portion of 'protoc-gen-validate (PGV)', -| an open source protoc plugin to generate polyglot message validators, -| which can be obtained at: -| -| * LICENSE: -| * xds/third_party/protoc-gen-validate/LICENSE (Apache License 2.0) -| * NOTICE: -| * xds/third_party/protoc-gen-validate/NOTICE -| * HOMEPAGE: -| * https://github.com/envoyproxy/protoc-gen-validate -| * LOCATION_IN_GRPC: -| * xds/third_party/protoc-gen-validate -| -| This product contains a modified portion of 'udpa', -| an open source universal data plane API, which can be obtained at: -| -| * LICENSE: -| * xds/third_party/udpa/LICENSE (Apache License 2.0) -| * HOMEPAGE: -| * https://github.com/cncf/udpa -| * LOCATION_IN_GRPC: -| * xds/third_party/udpa From 9bd214e5063cccb41dbc724462ad0a3f8fed46f2 Mon Sep 17 00:00:00 2001 From: drexler-sky Date: Mon, 27 Apr 2026 19:33:55 -0700 Subject: [PATCH 118/197] Arrow: Align vectorized reader handling of unsigned Parquet integers with BaseParquetReaders (#16006) * Arrow reader: reject unsigned Parquet integer columns with clear error The vectorized Arrow reader was silently reading unsigned Parquet integer columns (uint8, uint16, uint32, 
uint64) as signed, producing incorrect values for any value exceeding the signed maximum for that bit width. Since Iceberg has no unsigned integer type, throw UnsupportedOperationException when the Arrow reader encounters an unsigned integer logical type annotation, consistent with how the schema conversion layer already rejects uint64. Fixes #14547 * Apply spotless formatting * address comments * change to ParameterizedTest and also reuse common code --------- Co-authored-by: Evan Wu --- .../vectorized/VectorizedArrowReader.java | 8 ++ .../arrow/vectorized/TestArrowReader.java | 110 ++++++++++++++++++ 2 files changed, 118 insertions(+) diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedArrowReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedArrowReader.java index 2cc7cde4541a..e9ebed2826f4 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedArrowReader.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedArrowReader.java @@ -588,10 +588,18 @@ public Optional visit( int bitWidth = intLogicalType.getBitWidth(); if (bitWidth == 8 || bitWidth == 16 || bitWidth == 32) { + // Iceberg has no unsigned integer type. Reading UINT32 into a 32-bit signed value would + // silently produce negative results for inputs above Integer.MAX_VALUE. UINT8 and UINT16 + // both fit losslessly in a signed int32 and are allowed, matching the policy in + // BaseParquetReaders for the non-vectorized path. 
+ Preconditions.checkArgument( + intLogicalType.isSigned() || bitWidth < 32, "Cannot read UINT32 as an int value"); ((IntVector) vector).allocateNew(batchSize); return Optional.of( new LogicalTypeVisitorResult(vector, ReadType.INT, (int) IntVector.TYPE_WIDTH)); } else if (bitWidth == 64) { + Preconditions.checkArgument( + intLogicalType.isSigned(), "Cannot read UINT64 as a long value"); ((BigIntVector) vector).allocateNew(batchSize); return Optional.of( new LogicalTypeVisitorResult(vector, ReadType.LONG, (int) BigIntVector.TYPE_WIDTH)); diff --git a/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/TestArrowReader.java b/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/TestArrowReader.java index 34e83de15207..cf3eb2700265 100644 --- a/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/TestArrowReader.java +++ b/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/TestArrowReader.java @@ -21,6 +21,7 @@ import static org.apache.iceberg.Files.localInput; import static org.apache.parquet.schema.Types.primitive; import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; import java.io.File; import java.io.IOException; @@ -41,6 +42,7 @@ import java.util.concurrent.TimeUnit; import java.util.function.BiFunction; import java.util.stream.Collectors; +import java.util.stream.Stream; import org.apache.arrow.vector.BigIntVector; import org.apache.arrow.vector.BitVector; import org.apache.arrow.vector.DateDayVector; @@ -101,6 +103,9 @@ import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; /** * Test cases for {@link ArrowReader}. 
@@ -383,6 +388,111 @@ public void testTimestampMillisAreReadCorrectly() throws Exception { assertThat(totalRowsRead).as("Should read all rows").isEqualTo(millisValues.size()); } + @ParameterizedTest + @MethodSource("rejectedUnsignedIntegerCases") + public void testUnsignedIntegerColumnThrowsException( + int unsignedBitWidth, + PrimitiveType.PrimitiveTypeName physicalType, + Schema schema, + String expectedMessage) + throws Exception { + Table table = createSingleRowUnsignedIntTable(schema, physicalType, unsignedBitWidth, 100L); + + assertThatThrownBy( + () -> { + try (VectorizedTableScanIterable vectorizedReader = + new VectorizedTableScanIterable(table.newScan(), 1024, false)) { + for (ColumnarBatch batch : vectorizedReader) { + batch.createVectorSchemaRootFromVectors().close(); + } + } + }) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining(expectedMessage); + } + + @ParameterizedTest + @MethodSource("acceptedUnsignedSmallIntegerCases") + public void testUnsignedSmallIntegerColumnRoundtrips(int unsignedBitWidth, int value) + throws Exception { + Schema schema = new Schema(Types.NestedField.optional(1, "col", Types.IntegerType.get())); + Table table = + createSingleRowUnsignedIntTable( + schema, PrimitiveType.PrimitiveTypeName.INT32, unsignedBitWidth, value); + + int totalRows = 0; + try (VectorizedTableScanIterable vectorizedReader = + new VectorizedTableScanIterable(table.newScan(), 1024, false)) { + for (ColumnarBatch batch : vectorizedReader) { + VectorSchemaRoot root = batch.createVectorSchemaRootFromVectors(); + assertThat(((IntVector) root.getVector("col")).get(0)) + .as("UINT%d value should round-trip through int", unsignedBitWidth) + .isEqualTo(value); + totalRows += root.getRowCount(); + root.close(); + } + } + + assertThat(totalRows).isEqualTo(1); + } + + private static Stream rejectedUnsignedIntegerCases() { + return Stream.of( + Arguments.of( + 32, + PrimitiveType.PrimitiveTypeName.INT32, + new Schema(Types.NestedField.optional(1, 
"col", Types.IntegerType.get())), + "Cannot read UINT32 as an int value"), + Arguments.of( + 64, + PrimitiveType.PrimitiveTypeName.INT64, + new Schema(Types.NestedField.optional(1, "col", Types.LongType.get())), + "Cannot read UINT64 as a long value")); + } + + private static Stream acceptedUnsignedSmallIntegerCases() { + return Stream.of(Arguments.of(8, 250), Arguments.of(16, 50000)); + } + + private Table createSingleRowUnsignedIntTable( + Schema schema, PrimitiveType.PrimitiveTypeName physicalType, int unsignedBitWidth, long value) + throws IOException { + tables = new HadoopTables(); + Table table = tables.create(schema, tempDir.toURI() + "/uint" + unsignedBitWidth); + + MessageType parquetSchema = + new MessageType( + "test", + primitive(physicalType, Type.Repetition.OPTIONAL) + .as(LogicalTypeAnnotation.intType(unsignedBitWidth, false)) + .id(1) + .named("col")); + + File testFile = + new File(tempDir, "unsigned-int" + unsignedBitWidth + "-" + System.nanoTime() + ".parquet"); + try (ParquetWriter writer = + ExampleParquetWriter.builder(new Path(testFile.toURI())).withType(parquetSchema).build()) { + SimpleGroupFactory factory = new SimpleGroupFactory(parquetSchema); + Group group = factory.newGroup(); + if (physicalType == PrimitiveType.PrimitiveTypeName.INT64) { + group.add("col", value); + } else { + group.add("col", (int) value); + } + writer.write(group); + } + + DataFile dataFile = + DataFiles.builder(PartitionSpec.unpartitioned()) + .withPath(testFile.getAbsolutePath()) + .withFileSizeInBytes(testFile.length()) + .withFormat(FileFormat.PARQUET) + .withRecordCount(1) + .build(); + table.newAppend().appendFile(dataFile).commit(); + return table; + } + /** * Run the following verifications: * From 57409faedb184eb3b410a9cbbab52a4e5c0b6f0a Mon Sep 17 00:00:00 2001 From: Bharath Krishna Date: Mon, 27 Apr 2026 23:49:07 -0700 Subject: [PATCH 119/197] Core: Fix child AuthSession inheriting parent's expiresAtMillis (#15999) --- 
.../apache/iceberg/rest/auth/OAuth2Util.java | 3 + .../iceberg/rest/auth/TestOAuth2Util.java | 97 +++++++++++++++++++ 2 files changed, 100 insertions(+) diff --git a/core/src/main/java/org/apache/iceberg/rest/auth/OAuth2Util.java b/core/src/main/java/org/apache/iceberg/rest/auth/OAuth2Util.java index c2b47e6e944f..7a244bff70f6 100644 --- a/core/src/main/java/org/apache/iceberg/rest/auth/OAuth2Util.java +++ b/core/src/main/java/org/apache/iceberg/rest/auth/OAuth2Util.java @@ -529,6 +529,7 @@ public Pair refresh(RESTClient client) { .from(config()) .token(response.token()) .tokenType(response.issuedTokenType()) + .expiresAtMillis(OAuth2Util.expiresAtMillis(response.token())) .build(); Map currentHeaders = this.headers; this.headers = RESTUtil.merge(currentHeaders, authHeaders(config.token())); @@ -618,6 +619,7 @@ public static AuthSession fromAccessToken( .from(parent.config()) .token(token) .tokenType(OAuth2Properties.ACCESS_TOKEN_TYPE) + .expiresAtMillis(OAuth2Util.expiresAtMillis(token)) .build()); long startTimeMillis = System.currentTimeMillis(); @@ -699,6 +701,7 @@ private static AuthSession fromTokenResponse( .token(response.token()) .tokenType(issuedTokenType) .credential(credential) + .expiresAtMillis(OAuth2Util.expiresAtMillis(response.token())) .build()); Long expiresAtMillis = session.expiresAtMillis(); diff --git a/core/src/test/java/org/apache/iceberg/rest/auth/TestOAuth2Util.java b/core/src/test/java/org/apache/iceberg/rest/auth/TestOAuth2Util.java index fbcb87fb06e2..4a6fbf7a1cdc 100644 --- a/core/src/test/java/org/apache/iceberg/rest/auth/TestOAuth2Util.java +++ b/core/src/test/java/org/apache/iceberg/rest/auth/TestOAuth2Util.java @@ -26,9 +26,13 @@ import static org.mockito.ArgumentMatchers.anyMap; import static org.mockito.ArgumentMatchers.argThat; +import com.nimbusds.jwt.JWTClaimsSet; +import com.nimbusds.jwt.PlainJWT; import java.io.IOException; import java.util.Map; +import java.util.concurrent.TimeUnit; import 
org.apache.iceberg.rest.RESTClient; +import org.apache.iceberg.rest.auth.OAuth2Util.AuthSession; import org.apache.iceberg.rest.responses.OAuthTokenResponse; import org.junit.jupiter.api.Test; import org.mockito.Mockito; @@ -135,4 +139,97 @@ public void testCredentialFlowForSessionRefresh() throws IOException { any()); } } + + @Test + void fromTokenResponseUsesChildTokenExpiry() { + AuthSession parent = parentSession(7200); + OAuthTokenResponse response = childTokenResponse(tokenWithExp(300), 300); + + AuthSession child = + AuthSession.fromTokenResponse(null, null, response, System.currentTimeMillis(), parent); + assertThat(child.expiresAtMillis()) + .as("Child session should use the child token's exp, not the parent's") + .isEqualTo(TimeUnit.SECONDS.toMillis(300)); + } + + @Test + void fromTokenResponseOpaqueTokenDoesNotInheritParentExpiry() { + AuthSession parent = parentSession(7200); + OAuthTokenResponse response = childTokenResponse("opaque-access-token", 600); + + AuthSession child = + AuthSession.fromTokenResponse(null, null, response, System.currentTimeMillis(), parent); + + assertThat(child.expiresAtMillis()) + .as("Child session with opaque token should not inherit parent's expiresAtMillis") + .isNull(); + } + + @Test + void fromAccessTokenUsesChildTokenExpiry() { + AuthSession parent = parentSession(7200); + String childToken = tokenWithExp(300); + + AuthSession child = AuthSession.fromAccessToken(null, null, childToken, null, parent); + assertThat(child.expiresAtMillis()) + .as("Child session should use the child token's exp, not the parent's") + .isEqualTo(TimeUnit.SECONDS.toMillis(300)); + } + + @Test + void refreshUsesRefreshedTokenExpiry() throws IOException { + String parentToken = tokenWithExp(7200); + String refreshedToken = tokenWithExp(500); + + AuthConfig authConfig = + AuthConfig.builder() + .token(parentToken) + .tokenType(OAuth2Properties.ACCESS_TOKEN_TYPE) + .keepRefreshed(true) + .credential("testClientId:testClientSecret") + 
.oauth2ServerUri("/v1/token") + .expiresAtMillis(OAuth2Util.expiresAtMillis(parentToken)) + .build(); + + OAuthTokenResponse response = childTokenResponse(refreshedToken, 500); + + try (RESTClient client = Mockito.mock(RESTClient.class); + AuthSession session = new AuthSession(Map.of(), authConfig)) { + Mockito.when(client.postForm(any(), anyMap(), any(), anyMap(), any())).thenReturn(response); + + session.refresh(client); + + assertThat(session.expiresAtMillis()) + .as("After refresh, session should use the refreshed token's exp") + .isEqualTo(TimeUnit.SECONDS.toMillis(500)); + } + } + + private static AuthSession parentSession(long expSeconds) { + String parentToken = tokenWithExp(expSeconds); + AuthConfig parentConfig = + AuthConfig.builder() + .token(parentToken) + .tokenType(OAuth2Properties.ACCESS_TOKEN_TYPE) + .keepRefreshed(false) + .build(); + AuthSession parent = new AuthSession(Map.of(), parentConfig); + assertThat(parent.expiresAtMillis()).isEqualTo(TimeUnit.SECONDS.toMillis(expSeconds)); + return parent; + } + + private static OAuthTokenResponse childTokenResponse(String token, int expiresInSeconds) { + return OAuthTokenResponse.builder() + .withToken(token) + .withTokenType(BEARER) + .withIssuedTokenType(OAuth2Properties.ACCESS_TOKEN_TYPE) + .setExpirationInSeconds(expiresInSeconds) + .build(); + } + + private static String tokenWithExp(long expSeconds) { + JWTClaimsSet claimsSet = + new JWTClaimsSet.Builder().subject("test").claim("exp", expSeconds).build(); + return new PlainJWT(claimsSet).serialize(); + } } From d71583f58859131668cbf85b0df293cd32ef165a Mon Sep 17 00:00:00 2001 From: Neelesh Salian Date: Tue, 28 Apr 2026 06:21:17 -0700 Subject: [PATCH 120/197] Spark, Hive: Fix snapshot procedure for tables with Variant columns (#15964) --- .../apache/iceberg/hive/HiveSchemaUtil.java | 2 + .../iceberg/hive/TestHiveSchemaUtil.java | 7 +++ .../TestSnapshotTableProcedure.java | 50 +++++++++++++++++++ .../apache/iceberg/spark/SparkTableUtil.java | 40 
+++++++++++---- .../TestSnapshotTableProcedure.java | 50 +++++++++++++++++++ .../apache/iceberg/spark/SparkTableUtil.java | 40 +++++++++++---- 6 files changed, 169 insertions(+), 20 deletions(-) diff --git a/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveSchemaUtil.java b/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveSchemaUtil.java index 20f9eb7f616e..d1ff5db66ad4 100644 --- a/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveSchemaUtil.java +++ b/hive-metastore/src/main/java/org/apache/iceberg/hive/HiveSchemaUtil.java @@ -167,6 +167,8 @@ private static String convertToTypeString(Type type) { case FIXED: case BINARY: return "binary"; + case VARIANT: + return "unknown"; case DECIMAL: final Types.DecimalType decimalType = (Types.DecimalType) type; return String.format("decimal(%s,%s)", decimalType.precision(), decimalType.scale()); diff --git a/hive-metastore/src/test/java/org/apache/iceberg/hive/TestHiveSchemaUtil.java b/hive-metastore/src/test/java/org/apache/iceberg/hive/TestHiveSchemaUtil.java index 1592a3461b40..59c19a5d095d 100644 --- a/hive-metastore/src/test/java/org/apache/iceberg/hive/TestHiveSchemaUtil.java +++ b/hive-metastore/src/test/java/org/apache/iceberg/hive/TestHiveSchemaUtil.java @@ -205,6 +205,13 @@ public void testConversionWithoutLastComment() { assertThat(schema.asStruct()).isEqualTo(expected.asStruct()); } + @Test + public void testVariantTypeConvertToHiveSchema() { + Schema schema = new Schema(optional(0, "variant_field", Types.VariantType.get())); + List hiveSchema = HiveSchemaUtil.convert(schema); + assertThat(hiveSchema).containsExactly(new FieldSchema("variant_field", "unknown", null)); + } + protected List getSupportedFieldSchemas() { List fields = Lists.newArrayListWithCapacity(10); fields.add(new FieldSchema("c_float", serdeConstants.FLOAT_TYPE_NAME, "float comment")); diff --git a/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestSnapshotTableProcedure.java 
b/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestSnapshotTableProcedure.java index 19800c2f4666..3f8b574126ba 100644 --- a/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestSnapshotTableProcedure.java +++ b/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestSnapshotTableProcedure.java @@ -375,4 +375,54 @@ public void testSnapshotPartitionedV1() throws IOException { } } } + + @TestTemplate + public void testSnapshotWithVariant() throws IOException { + assumeThat(catalogName) + .as("Variant type requires Hive 4 which is not yet supported") + .isNotEqualTo("testhive") + .isNotEqualTo("spark_catalog"); + String location = Files.createTempDirectory(temp, "junit").toFile().toString(); + sql( + "CREATE TABLE %s (id bigint NOT NULL, data variant) USING parquet LOCATION '%s'", + SOURCE_NAME, location); + sql("INSERT INTO TABLE %s VALUES (1, parse_json('{\"key\": 123}'))", SOURCE_NAME); + + Object result = + scalarSql( + "CALL %s.system.snapshot('%s', '%s', properties => map('format-version','3'))", + catalogName, SOURCE_NAME, tableName); + assertThat(result).as("Should have added one file").isEqualTo(1L); + + assertEquals( + "Should have expected rows", + ImmutableList.of(row(1L, 123)), + sql("SELECT id, variant_get(data, '$.key', 'int') FROM %s", tableName)); + } + + @TestTemplate + public void testSnapshotPartitionedWithVariant() throws IOException { + assumeThat(catalogName) + .as("Variant type requires Hive 4 which is not yet supported") + .isNotEqualTo("testhive") + .isNotEqualTo("spark_catalog"); + String location = Files.createTempDirectory(temp, "junit").toFile().toString(); + sql( + "CREATE TABLE %s (id bigint NOT NULL, data variant) USING parquet PARTITIONED BY (id) LOCATION '%s'", + SOURCE_NAME, location); + sql( + "INSERT INTO TABLE %s (id, data) VALUES (1, parse_json('{\"key\": 123}')), (2, parse_json('{\"key\": 456}'))", + SOURCE_NAME); + + Object result = + 
scalarSql( + "CALL %s.system.snapshot('%s', '%s', properties => map('format-version','3'))", + catalogName, SOURCE_NAME, tableName); + assertThat(result).as("Should have added two files").isEqualTo(2L); + + assertEquals( + "Should have expected rows", + ImmutableList.of(row(123, 1L), row(456, 2L)), + sql("SELECT variant_get(data, '$.key', 'int'), id FROM %s ORDER BY id", tableName)); + } } diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkTableUtil.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkTableUtil.java index 3b4fc8f48786..0e9edac3fbd5 100644 --- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkTableUtil.java +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkTableUtil.java @@ -330,15 +330,15 @@ private static List listPartition( private static SparkPartition toSparkPartition( CatalogTablePartition partition, CatalogTable table) { Option locationUri = partition.storage().locationUri(); - Option serde = partition.storage().serde(); + Option partitionSerde = partition.storage().serde(); Preconditions.checkArgument(locationUri.nonEmpty(), "Partition URI should be defined"); Preconditions.checkArgument( - serde.nonEmpty() || table.provider().nonEmpty(), "Partition format should be defined"); + partitionSerde.nonEmpty() || table.provider().nonEmpty(), + "Partition format should be defined"); String uri = Util.uriToString(locationUri.get()); - String format = serde.nonEmpty() ? 
serde.get() : table.provider().get(); - + String format = resolveFileFormat(partitionSerde.getOrElse(() -> null), table); Map partitionSpec = JavaConverters.mapAsJavaMapConverter(partition.spec()).asJava(); return new SparkPartition(partitionSpec, uri, format); @@ -683,11 +683,7 @@ private static void importUnpartitionedSparkTable( ExecutorService service) { try { CatalogTable sourceTable = spark.sessionState().catalog().getTableMetadata(sourceTableIdent); - Option format = - sourceTable.storage().serde().nonEmpty() - ? sourceTable.storage().serde() - : sourceTable.provider(); - Preconditions.checkArgument(format.nonEmpty(), "Could not determine table format"); + String format = resolveFileFormat(null, sourceTable); Map partition = Collections.emptyMap(); PartitionSpec spec = PartitionSpec.unpartitioned(); @@ -701,7 +697,7 @@ private static void importUnpartitionedSparkTable( TableMigrationUtil.listPartition( partition, Util.uriToString(sourceTable.location()), - format.get(), + format, spec, conf, metricsConfig, @@ -1051,6 +1047,30 @@ public static boolean wapEnabled(Table table) { Boolean.parseBoolean(TableProperties.WRITE_AUDIT_PUBLISH_ENABLED_DEFAULT)); } + private static String resolveFileFormat(String partitionSerde, CatalogTable table) { + if (partitionSerde != null && isKnownFileFormat(partitionSerde)) { + return partitionSerde; + } + + Option serde = table.storage().serde(); + if (serde.nonEmpty() && isKnownFileFormat(serde.get())) { + return serde.get(); + } + + Preconditions.checkArgument( + table.provider().nonEmpty(), + "Could not determine table format from serde %s and no provider set", + serde.getOrElse(() -> "unknown")); + return table.provider().get(); + } + + private static boolean isKnownFileFormat(String serde) { + String lowerSerde = serde.toLowerCase(Locale.ROOT); + return lowerSerde.contains("parquet") + || lowerSerde.contains("avro") + || lowerSerde.contains("orc"); + } + /** Class representing a table partition. 
*/ public static class SparkPartition implements Serializable { private final Map values; diff --git a/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestSnapshotTableProcedure.java b/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestSnapshotTableProcedure.java index 19800c2f4666..3f8b574126ba 100644 --- a/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestSnapshotTableProcedure.java +++ b/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestSnapshotTableProcedure.java @@ -375,4 +375,54 @@ public void testSnapshotPartitionedV1() throws IOException { } } } + + @TestTemplate + public void testSnapshotWithVariant() throws IOException { + assumeThat(catalogName) + .as("Variant type requires Hive 4 which is not yet supported") + .isNotEqualTo("testhive") + .isNotEqualTo("spark_catalog"); + String location = Files.createTempDirectory(temp, "junit").toFile().toString(); + sql( + "CREATE TABLE %s (id bigint NOT NULL, data variant) USING parquet LOCATION '%s'", + SOURCE_NAME, location); + sql("INSERT INTO TABLE %s VALUES (1, parse_json('{\"key\": 123}'))", SOURCE_NAME); + + Object result = + scalarSql( + "CALL %s.system.snapshot('%s', '%s', properties => map('format-version','3'))", + catalogName, SOURCE_NAME, tableName); + assertThat(result).as("Should have added one file").isEqualTo(1L); + + assertEquals( + "Should have expected rows", + ImmutableList.of(row(1L, 123)), + sql("SELECT id, variant_get(data, '$.key', 'int') FROM %s", tableName)); + } + + @TestTemplate + public void testSnapshotPartitionedWithVariant() throws IOException { + assumeThat(catalogName) + .as("Variant type requires Hive 4 which is not yet supported") + .isNotEqualTo("testhive") + .isNotEqualTo("spark_catalog"); + String location = Files.createTempDirectory(temp, "junit").toFile().toString(); + sql( + "CREATE TABLE %s (id bigint NOT NULL, data variant) USING parquet PARTITIONED 
BY (id) LOCATION '%s'", + SOURCE_NAME, location); + sql( + "INSERT INTO TABLE %s (id, data) VALUES (1, parse_json('{\"key\": 123}')), (2, parse_json('{\"key\": 456}'))", + SOURCE_NAME); + + Object result = + scalarSql( + "CALL %s.system.snapshot('%s', '%s', properties => map('format-version','3'))", + catalogName, SOURCE_NAME, tableName); + assertThat(result).as("Should have added two files").isEqualTo(2L); + + assertEquals( + "Should have expected rows", + ImmutableList.of(row(123, 1L), row(456, 2L)), + sql("SELECT variant_get(data, '$.key', 'int'), id FROM %s ORDER BY id", tableName)); + } } diff --git a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/SparkTableUtil.java b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/SparkTableUtil.java index 04c47f49596d..96499184cab3 100644 --- a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/SparkTableUtil.java +++ b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/SparkTableUtil.java @@ -329,15 +329,15 @@ private static List listPartition( private static SparkPartition toSparkPartition( CatalogTablePartition partition, CatalogTable table) { Option locationUri = partition.storage().locationUri(); - Option serde = partition.storage().serde(); + Option partitionSerde = partition.storage().serde(); Preconditions.checkArgument(locationUri.nonEmpty(), "Partition URI should be defined"); Preconditions.checkArgument( - serde.nonEmpty() || table.provider().nonEmpty(), "Partition format should be defined"); + partitionSerde.nonEmpty() || table.provider().nonEmpty(), + "Partition format should be defined"); String uri = Util.uriToString(locationUri.get()); - String format = serde.nonEmpty() ? 
serde.get() : table.provider().get(); - + String format = resolveFileFormat(partitionSerde.getOrElse(() -> null), table); Map partitionSpec = JavaConverters.mapAsJavaMapConverter(partition.spec()).asJava(); return new SparkPartition(partitionSpec, uri, format); @@ -682,11 +682,7 @@ private static void importUnpartitionedSparkTable( ExecutorService service) { try { CatalogTable sourceTable = spark.sessionState().catalog().getTableMetadata(sourceTableIdent); - Option format = - sourceTable.storage().serde().nonEmpty() - ? sourceTable.storage().serde() - : sourceTable.provider(); - Preconditions.checkArgument(format.nonEmpty(), "Could not determine table format"); + String format = resolveFileFormat(null, sourceTable); Map partition = Collections.emptyMap(); PartitionSpec spec = PartitionSpec.unpartitioned(); @@ -700,7 +696,7 @@ private static void importUnpartitionedSparkTable( TableMigrationUtil.listPartition( partition, Util.uriToString(sourceTable.location()), - format.get(), + format, spec, conf, metricsConfig, @@ -1143,6 +1139,30 @@ private static boolean wapEnabled(Table table) { Boolean.parseBoolean(TableProperties.WRITE_AUDIT_PUBLISH_ENABLED_DEFAULT)); } + private static String resolveFileFormat(String partitionSerde, CatalogTable table) { + if (partitionSerde != null && isKnownFileFormat(partitionSerde)) { + return partitionSerde; + } + + Option serde = table.storage().serde(); + if (serde.nonEmpty() && isKnownFileFormat(serde.get())) { + return serde.get(); + } + + Preconditions.checkArgument( + table.provider().nonEmpty(), + "Could not determine table format from serde %s and no provider set", + serde.getOrElse(() -> "unknown")); + return table.provider().get(); + } + + private static boolean isKnownFileFormat(String serde) { + String lowerSerde = serde.toLowerCase(Locale.ROOT); + return lowerSerde.contains("parquet") + || lowerSerde.contains("avro") + || lowerSerde.contains("orc"); + } + /** Class representing a table partition. 
*/ public static class SparkPartition implements Serializable { private final Map values; From dd45bd926bed9bfdca28aa1221d05d39e3dcfd59 Mon Sep 17 00:00:00 2001 From: Maximilian Michels Date: Tue, 28 Apr 2026 18:06:08 +0200 Subject: [PATCH 121/197] Flink: Bundle flink-metrics-dropwizard in runtime jar (#16126) Iceberg uses Dropwizard metrics for Hisograms. Flink does not ship this optional dependency by default. In order for histograms to continue to work, we should add back the runtime dependency removed in #16093. --- docs/docs/flink-writes.md | 4 ++++ flink/v2.1/build.gradle | 5 ++++- flink/v2.1/flink-runtime/LICENSE | 10 +++++++++- flink/v2.1/flink-runtime/runtime-deps.txt | 2 ++ 4 files changed, 19 insertions(+), 2 deletions(-) diff --git a/docs/docs/flink-writes.md b/docs/docs/flink-writes.md index 09fa22b640c7..c904635d0dbb 100644 --- a/docs/docs/flink-writes.md +++ b/docs/docs/flink-writes.md @@ -207,6 +207,10 @@ They should have the following key-value tags. | dataFilesSizeHistogram | Histogram | Histogram distribution of data file sizes (in bytes). | | deleteFilesSizeHistogram | Histogram | Histogram distribution of delete file sizes (in bytes). | +The `Histogram` metrics above require `flink-metrics-dropwizard` on the classpath, which is not shipped +by Flink by default. When using `iceberg-flink-runtime`, this dependency is already bundled. When using +the `iceberg-flink` artifact directly, add `org.apache.flink:flink-metrics-dropwizard` as a dependency. + Committer metrics are added under the sub group of `IcebergFilesCommitter`. They should have the following key-value tags. 
diff --git a/flink/v2.1/build.gradle b/flink/v2.1/build.gradle index 53f87f27aa67..6dc373e6b566 100644 --- a/flink/v2.1/build.gradle +++ b/flink/v2.1/build.gradle @@ -33,7 +33,7 @@ project(":iceberg-flink:iceberg-flink-${flinkMajorVersion}") { implementation project(':iceberg-hive-metastore') compileOnly libs.flink21.avro - // for dropwizard histogram metrics implementation + // dropwizard histogram metrics (optional in Flink) compileOnly libs.flink21.metrics.dropwizard compileOnly libs.flink21.streaming.java compileOnly "${libs.flink21.streaming.java.get().module}:${libs.flink21.streaming.java.get().getVersion()}:tests" @@ -169,6 +169,9 @@ project(":iceberg-flink:iceberg-flink-runtime-${flinkMajorVersion}") { exclude group: 'com.google.code.findbugs', module: 'jsr305' } + // To support dropwizard histogram metrics (not shipped by Flink by default) + implementation libs.flink21.metrics.dropwizard + // for integration testing with the flink-runtime-jar // all of those dependencies are required because the integration test extends FlinkTestBase integrationCompileOnly project(':iceberg-api') diff --git a/flink/v2.1/flink-runtime/LICENSE b/flink/v2.1/flink-runtime/LICENSE index 36a03cb4fcf9..e8c4c4a0bdf7 100644 --- a/flink/v2.1/flink-runtime/LICENSE +++ b/flink/v2.1/flink-runtime/LICENSE @@ -556,7 +556,7 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Codahale Metrics. +This product bundles Dropwizard Metrics. Copyright: (c) 2010-2013 Coda Hale, Yammer.com, 2014-2021 Dropwizard Team Project URL: https://github.com/dropwizard/metrics @@ -564,6 +564,14 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- +This product bundles Apache Flink's optional support for Dropwizard Metrics. 
+ +Copyright: 2014-2026 The Apache Software Foundation +Project URL: https://flink.apache.org/ +License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + This product bundles RoaringBitmap. Copyright: (c) 2013-... the RoaringBitmap authors diff --git a/flink/v2.1/flink-runtime/runtime-deps.txt b/flink/v2.1/flink-runtime/runtime-deps.txt index 00c53ed388d0..3dfc56f15ea9 100644 --- a/flink/v2.1/flink-runtime/runtime-deps.txt +++ b/flink/v2.1/flink-runtime/runtime-deps.txt @@ -6,9 +6,11 @@ com.github.luben:zstd-jni:1.5.7-3 com.google.errorprone:error_prone_annotations:2.10.0 dev.failsafe:failsafe:3.3.2 io.airlift:aircompressor:2.0.3 +io.dropwizard.metrics:metrics-core:3.2.6 org.apache.avro:avro:1.12.1 org.apache.datasketches:datasketches-java:6.2.0 org.apache.datasketches:datasketches-memory:3.0.2 +org.apache.flink:flink-metrics-dropwizard:2.1.0 org.apache.httpcomponents.client5:httpclient5:5.6 org.apache.httpcomponents.core5:httpcore5-h2:5.4 org.apache.httpcomponents.core5:httpcore5:5.4 From 4880f5bc3ec2db8610ab671250caf7a4182c9e7d Mon Sep 17 00:00:00 2001 From: Ryan Blue Date: Tue, 28 Apr 2026 10:20:40 -0700 Subject: [PATCH 122/197] Flink 2.1: Update LICENSE for 1.11. (#16102) * Flink 2.1: Update LICENSE for 1.11. * Flink 2.1: Update NOTICE following LICENSE changes. * Flink 2.1: Add source license updates from Parquet. * Flink 2.1: Add Hive storage API and protobuf to LICENSE. 
--- flink/v2.1/flink-runtime/LICENSE | 406 +++---------------------------- flink/v2.1/flink-runtime/NOTICE | 122 ---------- 2 files changed, 39 insertions(+), 489 deletions(-) diff --git a/flink/v2.1/flink-runtime/LICENSE b/flink/v2.1/flink-runtime/LICENSE index e8c4c4a0bdf7..11460c3307c8 100644 --- a/flink/v2.1/flink-runtime/LICENSE +++ b/flink/v2.1/flink-runtime/LICENSE @@ -227,7 +227,7 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Apache Thrift. +This product bundles Apache Thrift (bundled by Parquet). Copyright: 2006-2010 The Apache Software Foundation. Project URL: https://thrift.apache.org/ @@ -235,55 +235,57 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Fastutil. +This product includes code from Daniel Lemire's JavaFastPFOR project (bundled by Parquet). -Copyright: 2002-2014 Sebastiano Vigna -Project URL: http://fastutil.di.unimi.it/ +Copyright: 2013 Daniel Lemire +Project URL: https://github.com/lemire/JavaFastPFOR License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 -------------------------------------------------------------------------------- -This product bundles Apache ORC. +This product bundles Fastutil (bundled by Parquet). -Copyright: 2013-2020 The Apache Software Foundation. -Project URL: https://orc.apache.org/ +Copyright: 2002-2014 Sebastiano Vigna +Project URL: http://fastutil.di.unimi.it/ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 -------------------------------------------------------------------------------- -This product bundles Apache Datasketches. +This product bundles Zero-Allocation Hashing (bundled by Parquet). 
-Project URL: https://datasketches.apache.org/ +Project URL: https://github.com/OpenHFT/Zero-Allocation-Hashing License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 -------------------------------------------------------------------------------- -This product bundles Apache Hive. +This product bundles Apache ORC. Copyright: 2013-2020 The Apache Software Foundation. -Project URL: https://hive.apache.org/ +Project URL: https://orc.apache.org/ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 -------------------------------------------------------------------------------- -This product bundles Airlift Aircompressor. +This product bundles Apache Hive's Storage API (bundled by ORC). -Copyright: 2011-2020 Aircompressor authors. -Project URL: https://github.com/airlift/aircompressor +Copyright: 2008-2020 The Apache Software Foundation. +Project URL: https://hive.apache.org/ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 -------------------------------------------------------------------------------- -This product bundles Google GAX. +This product bundles Google protobuf (bundled by ORC). -Project URL: https://github.com/googleapis/gax-java +Copyright: 2008 Google Inc. +Project URL: https://developers.google.com/protocol-buffers License: BSD 3-Clause -| Copyright 2016, Google Inc. All rights reserved. -| + +| Copyright 2008 Google Inc. All rights reserved. +| | Redistribution and use in source and binary forms, with or without | modification, are permitted provided that the following conditions are | met: -| +| | * Redistributions of source code must retain the above copyright | notice, this list of conditions and the following disclaimer. | * Redistributions in binary form must reproduce the above @@ -293,7 +295,7 @@ License: BSD 3-Clause | * Neither the name of Google Inc. 
nor the names of its | contributors may be used to endorse or promote products derived from | this software without specific prior written permission. -| +| | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -305,40 +307,26 @@ License: BSD 3-Clause | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +| +| Code generated by the Protocol Buffer compiler is owned by the owner +| of the input file used when generating it. This code is not +| standalone and requires a support library to be linked with it. This +| support library is itself covered by the above license. -------------------------------------------------------------------------------- -This product bundles Google Auth Library. +This product bundles Apache Datasketches. -License: BSD 3-Clause -| Copyright 2014, Google Inc. All rights reserved. -| -| Redistribution and use in source and binary forms, with or without -| modification, are permitted provided that the following conditions are -| met: -| -| * Redistributions of source code must retain the above copyright -| notice, this list of conditions and the following disclaimer. -| * Redistributions in binary form must reproduce the above -| copyright notice, this list of conditions and the following disclaimer -| in the documentation and/or other materials provided with the -| distribution. -| -| * Neither the name of Google Inc. nor the names of its -| contributors may be used to endorse or promote products derived from -| this software without specific prior written permission. 
-| -| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -| OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -| SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -| LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -| DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -| THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -| (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +Project URL: https://datasketches.apache.org/ +License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +This product bundles Airlift Aircompressor. + +Copyright: 2011-2020 Aircompressor authors. +Project URL: https://github.com/airlift/aircompressor +License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 -------------------------------------------------------------------------------- @@ -401,82 +389,6 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Google protobuf. - -Copyright: 2008 Google Inc. -Project URL: https://developers.google.com/protocol-buffers -License: BSD 3-Clause -| Copyright 2008 Google Inc. All rights reserved. -| -| Redistribution and use in source and binary forms, with or without -| modification, are permitted provided that the following conditions are -| met: -| -| * Redistributions of source code must retain the above copyright -| notice, this list of conditions and the following disclaimer. 
-| * Redistributions in binary form must reproduce the above -| copyright notice, this list of conditions and the following disclaimer -| in the documentation and/or other materials provided with the -| distribution. -| * Neither the name of Google Inc. nor the names of its -| contributors may be used to endorse or promote products derived from -| this software without specific prior written permission. -| -| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -| OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -| SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -| LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -| DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -| THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -| (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -| -| Code generated by the Protocol Buffer compiler is owned by the owner -| of the input file used when generating it. This code is not -| standalone and requires a support library to be linked with it. This -| support library is itself covered by the above license. - --------------------------------------------------------------------------------- - -This product bundles ThreeTen BP. - -Project URL: https://www.threeten.org/threetenbp -License: BSD 3-Clause -| Copyright (c) 2007-present, Stephen Colebourne & Michael Nascimento Santos. -| -| All rights reserved. 
-| -| Redistribution and use in source and binary forms, with or without -| modification, are permitted provided that the following conditions are met: -| -| * Redistributions of source code must retain the above copyright notice, -| this list of conditions and the following disclaimer. -| -| * Redistributions in binary form must reproduce the above copyright notice, -| this list of conditions and the following disclaimer in the documentation -| and/or other materials provided with the distribution. -| -| * Neither the name of JSR-310 nor the names of its contributors -| may be used to endorse or promote products derived from this software -| without specific prior written permission. -| -| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -| CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -| EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -| PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -| PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -| LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -| NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -| SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - --------------------------------------------------------------------------------- - This product bundles ThreeTen Extra. Copyright: 2007-present, Stephen Colebourne & Michael Nascimento Santos. @@ -530,15 +442,7 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Apache HttpComponents (core and client). - -Copyright: 1999-2022 The Apache Software Foundation. 
-Project URL: https://hc.apache.org/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product includes code from Apache HttpComponents Client. +This product bundles and includes code from Apache HttpComponents (core/client). * retry and error handling logic in ExponentialHttpRequestRetryStrategy.java @@ -622,143 +526,6 @@ License: BSD 2-Clause -------------------------------------------------------------------------------- -This product bundles Google Cloud BigQuery Client for Java. - -Project URL: https://github.com/googleapis/java-bigquery -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Alibaba Cloud Credentials for Java. - -Project URL: https://github.com/aliyun/credentials-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Alibaba Cloud Tea for Java. - -Project URL: https://github.com/aliyun/tea-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Google Gson. - -Project URL: https://github.com/google/gson/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Google API Common. - -License: BSD 3-Clause -| Copyright 2016, Google Inc. 
-| Redistribution and use in source and binary forms, with or without -| modification, are permitted provided that the following conditions are -| met: -| -| * Redistributions of source code must retain the above copyright -| notice, this list of conditions and the following disclaimer. -| * Redistributions in binary form must reproduce the above -| copyright notice, this list of conditions and the following disclaimer -| in the documentation and/or other materials provided with the -| distribution. -| * Neither the name of Google Inc. nor the names of its -| contributors may be used to endorse or promote products derived from -| this software without specific prior written permission. -| -| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -| OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -| SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -| LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -| DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -| THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -| (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - --------------------------------------------------------------------------------- - -This product bundles Google Http Client. - -Project URL: https://www.google.com/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles OpenCensus. 
- -Project URL: https://github.com/census-instrumentation/opencensus-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles JaCoCo. - -Project URL: https://github.com/jacoco/jacoco/ -License: Eclipse Public License v. 2.0 - https://www.eclipse.org/org/documents/epl-2.0/EPL-2.0.txt - --------------------------------------------------------------------------------- - -This product bundles JAXB. - -Project URL: http://jaxb.java.net/ -License: CDDL 1.1 - https://glassfish.java.net/public/CDDL+GPL_1_1.html - --------------------------------------------------------------------------------- - -This product bundles Okio. - -Project URL: https://github.com/square/okio -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles OkHttp. - -Project URL: https://github.com/square/okhttp -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Google Auto Value. - -Project URL: https://github.com/google/auto/tree/master/value -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles gRPC. - -Project URL: https://github.com/grpc/grpc-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Netty. 
- -Project URL: https://netty.io/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Google APIs. - -Project URL: https://github.com/googleapis/googleapis -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Google Cloud APIs for Java. - -Project URL: https://github.com/googleapis/google-cloud-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - This product bundles JTS Topology Suite. Project URL: https://github.com/locationtech/jts @@ -766,98 +533,3 @@ License: Eclipse Public License v. 2.0 - https://www.eclipse.org/org/documents/e -------------------------------------------------------------------------------- -This product bundles javax.annotation-api. - -Project URL: https://javaee.github.io/glassfish -Project URL: http://jcp.org/en/jsr/detail?id=250 -License: CDDL - https://github.com/javaee/javax.annotation/blob/master/LICENSE - --------------------------------------------------------------------------------- - -This product bundles JSpecify. - -Project URL: https://github.com/jspecify/jspecify -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Animal Sniffer Annotations. - -License: MIT -| The MIT License -| -| Copyright (c) 2009 codehaus.org. 
-| -| Permission is hereby granted, free of charge, to any person obtaining a copy -| of this software and associated documentation files (the "Software"), to deal -| in the Software without restriction, including without limitation the rights -| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -| copies of the Software, and to permit persons to whom the Software is -| furnished to do so, subject to the following conditions: -| -| The above copyright notice and this permission notice shall be included in -| all copies or substantial portions of the Software. -| -| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -| THE SOFTWARE. - --------------------------------------------------------------------------------- - -This product bundles Android Annotations. - -Project URL: http://source.android.com/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Conscrypt (openjdk-uber). - -Project URL: https://conscrypt.org/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Perfmark. - -Project URL: https://github.com/perfmark/perfmark -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles org.json. 
- -Project URL: https://github.com/douglascrockford/JSON-java -License: Public Domain - https://github.com/stleary/JSON-java/blob/master/LICENSE - --------------------------------------------------------------------------------- - -This product bundles Apache Arrow. - -Project URL: https://github.com/apache/arrow -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Google flatbuffers. - -Project URL: https://github.com/google/flatbuffers -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles OpenTelemetry. - -Project URL: https://opentelemetry.io/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Kotlin. - -Project URL: https://github.com/JetBrains/kotlin -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 diff --git a/flink/v2.1/flink-runtime/NOTICE b/flink/v2.1/flink-runtime/NOTICE index 72916788b5e4..61b02129d0e1 100644 --- a/flink/v2.1/flink-runtime/NOTICE +++ b/flink/v2.1/flink-runtime/NOTICE @@ -356,128 +356,6 @@ This product bundles Eclipse Microprofile OpenAPI with the following in its NOTI -------------------------------------------------------------------------------- -This product bundles gRPC with the following in its NOTICE file: -| Copyright 2014 The gRPC Authors -| -| Licensed under the Apache License, Version 2.0 (the "License"); -| you may not use this file except in compliance with the License. 
-| You may obtain a copy of the License at -| -| http://www.apache.org/licenses/LICENSE-2.0 -| -| Unless required by applicable law or agreed to in writing, software -| distributed under the License is distributed on an "AS IS" BASIS, -| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -| See the License for the specific language governing permissions and -| limitations under the License. -| -| ----------------------------------------------------------------------- -| -| This product contains a modified portion of 'OkHttp', an open source -| HTTP & SPDY client for Android and Java applications, which can be obtained -| at: -| -| * LICENSE: -| * okhttp/third_party/okhttp/LICENSE (Apache License 2.0) -| * HOMEPAGE: -| * https://github.com/square/okhttp -| * LOCATION_IN_GRPC: -| * okhttp/third_party/okhttp -| -| This product contains a modified portion of 'Envoy', an open source -| cloud-native high-performance edge/middle/service proxy, which can be -| obtained at: -| -| * LICENSE: -| * xds/third_party/envoy/LICENSE (Apache License 2.0) -| * NOTICE: -| * xds/third_party/envoy/NOTICE -| * HOMEPAGE: -| * https://www.envoyproxy.io -| * LOCATION_IN_GRPC: -| * xds/third_party/envoy -| -| This product contains a modified portion of 'protoc-gen-validate (PGV)', -| an open source protoc plugin to generate polyglot message validators, -| which can be obtained at: -| -| * LICENSE: -| * xds/third_party/protoc-gen-validate/LICENSE (Apache License 2.0) -| * NOTICE: -| * xds/third_party/protoc-gen-validate/NOTICE -| * HOMEPAGE: -| * https://github.com/envoyproxy/protoc-gen-validate -| * LOCATION_IN_GRPC: -| * xds/third_party/protoc-gen-validate -| -| This product contains a modified portion of 'udpa', -| an open source universal data plane API, which can be obtained at: -| -| * LICENSE: -| * xds/third_party/udpa/LICENSE (Apache License 2.0) -| * HOMEPAGE: -| * https://github.com/cncf/udpa -| * LOCATION_IN_GRPC: -| * xds/third_party/udpa - 
--------------------------------------------------------------------------------- - -This product bundles Netty with the following in its NOTICE file: -| -| The Netty Project -| ================= -| -| Please visit the Netty web site for more information: -| -| * http://netty.io/ -| -| Copyright 2016 The Netty Project -| -| The Netty Project licenses this file to you under the Apache License, -| version 2.0 (the "License"); you may not use this file except in compliance -| with the License. You may obtain a copy of the License at: -| -| http://www.apache.org/licenses/LICENSE-2.0 -| -| Unless required by applicable law or agreed to in writing, software -| distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -| WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -| License for the specific language governing permissions and limitations -| under the License. -| -| ------------------------------------------------------------------------------- -| This product contains a forked and modified version of Tomcat Native -| -| * LICENSE: -| * license/LICENSE.tomcat-native.txt (Apache License 2.0) -| * HOMEPAGE: -| * http://tomcat.apache.org/native-doc/ -| * https://svn.apache.org/repos/asf/tomcat/native/ -| -| This product contains the Maven wrapper scripts from 'Maven Wrapper', that provides an easy way to ensure a user has everything necessary to run the Maven build. -| -| * LICENSE: -| * license/LICENSE.mvn-wrapper.txt (Apache License 2.0) -| * HOMEPAGE: -| * https://github.com/takari/maven-wrapper -| -| This product contains small piece of code to support AIX, taken from netbsd. -| -| * LICENSE: -| * license/LICENSE.aix-netbsd.txt (OpenSSL License) -| * HOMEPAGE: -| * https://ftp.netbsd.org/pub/NetBSD/NetBSD-current/src/crypto/external/bsd/openssl/dist -| -| -| This product contains code from boringssl. 
-| -| * LICENSE (Combination ISC and OpenSSL license) -| * license/LICENSE.boringssl.txt (Combination ISC and OpenSSL license) -| * HOMEPAGE: -| * https://boringssl.googlesource.com/boringssl/ - --------------------------------------------------------------------------------- - This product bundles Jackson JSON Processor with the following in its NOTICE file: | # Jackson JSON processor | From 9b139c99d8ee1bb4991025ab95539eb2274952d0 Mon Sep 17 00:00:00 2001 From: Ryan Blue Date: Tue, 28 Apr 2026 10:44:07 -0700 Subject: [PATCH 123/197] Spark: Carry over changes to LICENSE and NOTICE in older Spark versions. (#16142) --- spark/v3.4/spark-runtime/LICENSE | 276 +++---------------------------- spark/v3.4/spark-runtime/NOTICE | 101 ----------- spark/v3.5/spark-runtime/LICENSE | 276 +++---------------------------- spark/v3.5/spark-runtime/NOTICE | 101 ----------- spark/v4.0/spark-runtime/LICENSE | 276 +++---------------------------- spark/v4.0/spark-runtime/NOTICE | 101 ----------- 6 files changed, 60 insertions(+), 1071 deletions(-) diff --git a/spark/v3.4/spark-runtime/LICENSE b/spark/v3.4/spark-runtime/LICENSE index a67296eb412c..24a9e3706d17 100644 --- a/spark/v3.4/spark-runtime/LICENSE +++ b/spark/v3.4/spark-runtime/LICENSE @@ -227,7 +227,7 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Apache Thrift. +This product bundles Apache Thrift (bundled by Parquet). Copyright: 2006-2017 The Apache Software Foundation. Project URL: https://thrift.apache.org/ @@ -235,7 +235,7 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product includes code from Daniel Lemire's JavaFastPFOR project. +This product includes code from Daniel Lemire's JavaFastPFOR project (bundled by Parquet). 
Copyright: 2013 Daniel Lemire Project URL: https://github.com/lemire/JavaFastPFOR @@ -243,7 +243,7 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles fastutil. +This product bundles fastutil (bundled by Parquet). Copyright: 2002-2014 Sebastiano Vigna Project URL: http://fastutil.di.unimi.it/ @@ -251,6 +251,13 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- +This product bundles Zero-Allocation Hashing (bundled by Parquet). + +Project URL: https://github.com/OpenHFT/Zero-Allocation-Hashing +License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + This product bundles Apache ORC. Copyright: 2013 and onwards The Apache Software Foundation. @@ -259,7 +266,7 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Apache Hive. +This product bundles Apache Hive's Storage API (bundled by ORC). Copyright: 2008-2020 The Apache Software Foundation Project URL: https://hive.apache.org/ @@ -267,11 +274,12 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Google protobuf. +This product bundles Google protobuf (bundled by ORC). Copyright: 2008 Google Inc. Project URL: https://developers.google.com/protocol-buffers License: BSD 3-Clause + | Copyright 2008 Google Inc. All rights reserved. 
| | Redistribution and use in source and binary forms, with or without @@ -390,20 +398,6 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Perfmark. - -Project URL: https://github.com/perfmark/perfmark -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles org.json. - -Project URL: https://github.com/douglascrockford/JSON-java -License: Public Domain - https://github.com/stleary/JSON-java/blob/master/LICENSE - --------------------------------------------------------------------------------- - This product bundles Apache Arrow. Copyright: 2016-2019 The Apache Software Foundation. @@ -428,42 +422,6 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles ThreeTen BP. - -Project URL: https://www.threeten.org/threetenbp -License: BSD 3-Clause -| Copyright (c) 2007-present, Stephen Colebourne & Michael Nascimento Santos. -| -| All rights reserved. -| -| Redistribution and use in source and binary forms, with or without -| modification, are permitted provided that the following conditions are met: -| -| * Redistributions of source code must retain the above copyright notice, -| this list of conditions and the following disclaimer. -| -| * Redistributions in binary form must reproduce the above copyright notice, -| this list of conditions and the following disclaimer in the documentation -| and/or other materials provided with the distribution. -| -| * Neither the name of JSR-310 nor the names of its contributors -| may be used to endorse or promote products derived from this software -| without specific prior written permission. 
-| -| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -| CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -| EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -| PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -| PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -| LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -| NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -| SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - --------------------------------------------------------------------------------- - This product bundles ThreeTen Extra. Copyright: 2007-present, Stephen Colebourne & Michael Nascimento Santos. @@ -540,19 +498,11 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Apache HttpComponents (client and core). - -Copyright: 1999-2022 The Apache Software Foundation. -Project URL: https://hc.apache.org/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product includes code from Apache HttpComponents Client. +This product bundles and includes code from Apache HttpComponents (core/client). * retry and error handling logic in ExponentialHttpRequestRetryStrategy.java -Copyright: 1999-2022 The Apache Software Foundation. 
+Copyright: 1999-2022 The Apache Software Foundation Project URL: https://hc.apache.org/ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 @@ -573,16 +523,16 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Apache Datasketches. +This product bundles Eclipse Collections. -Project URL: https://datasketches.apache.org -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 +Project URL: https://github.com/eclipse-collections/eclipse-collections +License: Eclipse Public License v. 1.0 - https://www.eclipse.org/legal/epl-v10.html -------------------------------------------------------------------------------- -This product bundles Zero-Allocation Hashing. +This product bundles Apache Datasketches. -Project URL: https://github.com/OpenHFT/Zero-Allocation-Hashing +Project URL: https://datasketches.apache.org License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 -------------------------------------------------------------------------------- @@ -594,195 +544,9 @@ License: Eclipse Public License v. 2.0 - https://www.eclipse.org/org/documents/e -------------------------------------------------------------------------------- -This product bundles JSpecify. - -Project URL: https://github.com/jspecify/jspecify -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Animal Sniffer Annotations. - -License: MIT -| The MIT License -| -| Copyright (c) 2009 codehaus.org. 
-| -| Permission is hereby granted, free of charge, to any person obtaining a copy -| of this software and associated documentation files (the "Software"), to deal -| in the Software without restriction, including without limitation the rights -| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -| copies of the Software, and to permit persons to whom the Software is -| furnished to do so, subject to the following conditions: -| -| The above copyright notice and this permission notice shall be included in -| all copies or substantial portions of the Software. -| -| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -| THE SOFTWARE. - --------------------------------------------------------------------------------- - -This product bundles Android Annotations. - -Project URL: http://source.android.com/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Conscrypt (openjdk-uber). - -Project URL: https://conscrypt.org/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Google Cloud BigQuery Client for Java. 
- -Project URL: https://github.com/googleapis/java-bigquery -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Google Cloud Java Client Libraries. - -Project URL: https://github.com/googleapis/google-cloud-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - This product bundles RoaringBitmap. Copyright: (c) 2013-... the RoaringBitmap authors Project URL: https://github.com/RoaringBitmap/RoaringBitmap License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 --------------------------------------------------------------------------------- - -This product bundles Alibaba Cloud Credentials for Java. - -Project URL: https://github.com/aliyun/credentials-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Alibaba Cloud Tea for Java. - -Project URL: https://github.com/aliyun/tea-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Google GAX. - -Project URL: https://github.com/googleapis/gax-java -License: BSD 3-Clause -| Copyright 2016, Google Inc. All rights reserved. -| -| Redistribution and use in source and binary forms, with or without -| modification, are permitted provided that the following conditions are -| met: -| -| * Redistributions of source code must retain the above copyright -| notice, this list of conditions and the following disclaimer. 
-| * Redistributions in binary form must reproduce the above -| copyright notice, this list of conditions and the following disclaimer -| in the documentation and/or other materials provided with the -| distribution. -| * Neither the name of Google Inc. nor the names of its -| contributors may be used to endorse or promote products derived from -| this software without specific prior written permission. -| -| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -| OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -| SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -| LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -| DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -| THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -| (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - --------------------------------------------------------------------------------- - -This product bundles Google Gson. - -Project URL: https://github.com/google/gson/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles gRPC. - -Project URL: https://github.com/grpc/grpc-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles OpenCensus. 
- -Project URL: https://github.com/census-instrumentation/opencensus-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles OkHttp. - -Project URL: https://github.com/square/okhttp -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Google Auto Value. - -Project URL: https://github.com/google/auto/tree/master/value -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles JaCoCo. - -Project URL: https://github.com/jacoco/jacoco/ -License: Eclipse Public License v. 2.0 - https://www.eclipse.org/org/documents/epl-2.0/EPL-2.0.txt - --------------------------------------------------------------------------------- - -This product bundles OpenTelemetry. - -Project URL: https://opentelemetry.io/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Kotlin. - -Project URL: https://github.com/JetBrains/kotlin -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles JAXB. - -Project URL: http://jaxb.java.net/ -License: CDDL 1.1 - https://glassfish.java.net/public/CDDL+GPL_1_1.html - --------------------------------------------------------------------------------- - -This product bundles EMMA runtime. 
- -Project URL: https://github.com/ehelms/Emma/ -License: Common Public License - v 1.0 - --------------------------------------------------------------------------------- - -This product bundles Google j2objc. - -Project URL: https://github.com/google/j2objc/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 diff --git a/spark/v3.4/spark-runtime/NOTICE b/spark/v3.4/spark-runtime/NOTICE index 68abd73906b1..17989b43a371 100644 --- a/spark/v3.4/spark-runtime/NOTICE +++ b/spark/v3.4/spark-runtime/NOTICE @@ -66,42 +66,6 @@ This product bundles Airlift Aircompressor with the following in its NOTICE file -------------------------------------------------------------------------------- -This product bundles Google Protobuf with the following in its NOTICE file: -| Copyright 2008 Google Inc. All rights reserved. -| -| Redistribution and use in source and binary forms, with or without -| modification, are permitted provided that the following conditions are -| met: -| -| * Redistributions of source code must retain the above copyright -| notice, this list of conditions and the following disclaimer. -| * Redistributions in binary form must reproduce the above -| copyright notice, this list of conditions and the following disclaimer -| in the documentation and/or other materials provided with the -| distribution. -| * Neither the name of Google Inc. nor the names of its -| contributors may be used to endorse or promote products derived from -| this software without specific prior written permission. -| -| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -| A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -| OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -| SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -| LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -| DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -| THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -| (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -| -| Code generated by the Protocol Buffer compiler is owned by the owner -| of the input file used when generating it. This code is not -| standalone and requires a support library to be linked with it. This -| support library is itself covered by the above license. - --------------------------------------------------------------------------------- - This product bundles Netty with the following in its NOTICE file: | The Netty Project | ================= @@ -392,68 +356,3 @@ This product bundles Eclipse MicroProfile OpenAPI with the following in its NOTI | Arthur De Magalhaes arthurdm@ca.ibm.com | --------------------------------------------------------------------------------- - -This product bundles gRPC with the following in its NOTICE file: -| Copyright 2014 The gRPC Authors -| -| Licensed under the Apache License, Version 2.0 (the "License"); -| you may not use this file except in compliance with the License. -| You may obtain a copy of the License at -| -| http://www.apache.org/licenses/LICENSE-2.0 -| -| Unless required by applicable law or agreed to in writing, software -| distributed under the License is distributed on an "AS IS" BASIS, -| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -| See the License for the specific language governing permissions and -| limitations under the License. 
-| -| ----------------------------------------------------------------------- -| -| This product contains a modified portion of 'OkHttp', an open source -| HTTP & SPDY client for Android and Java applications, which can be obtained -| at: -| -| * LICENSE: -| * okhttp/third_party/okhttp/LICENSE (Apache License 2.0) -| * HOMEPAGE: -| * https://github.com/square/okhttp -| * LOCATION_IN_GRPC: -| * okhttp/third_party/okhttp -| -| This product contains a modified portion of 'Envoy', an open source -| cloud-native high-performance edge/middle/service proxy, which can be -| obtained at: -| -| * LICENSE: -| * xds/third_party/envoy/LICENSE (Apache License 2.0) -| * NOTICE: -| * xds/third_party/envoy/NOTICE -| * HOMEPAGE: -| * https://www.envoyproxy.io -| * LOCATION_IN_GRPC: -| * xds/third_party/envoy -| -| This product contains a modified portion of 'protoc-gen-validate (PGV)', -| an open source protoc plugin to generate polyglot message validators, -| which can be obtained at: -| -| * LICENSE: -| * xds/third_party/protoc-gen-validate/LICENSE (Apache License 2.0) -| * NOTICE: -| * xds/third_party/protoc-gen-validate/NOTICE -| * HOMEPAGE: -| * https://github.com/envoyproxy/protoc-gen-validate -| * LOCATION_IN_GRPC: -| * xds/third_party/protoc-gen-validate -| -| This product contains a modified portion of 'udpa', -| an open source universal data plane API, which can be obtained at: -| -| * LICENSE: -| * xds/third_party/udpa/LICENSE (Apache License 2.0) -| * HOMEPAGE: -| * https://github.com/cncf/udpa -| * LOCATION_IN_GRPC: -| * xds/third_party/udpa diff --git a/spark/v3.5/spark-runtime/LICENSE b/spark/v3.5/spark-runtime/LICENSE index a67296eb412c..24a9e3706d17 100644 --- a/spark/v3.5/spark-runtime/LICENSE +++ b/spark/v3.5/spark-runtime/LICENSE @@ -227,7 +227,7 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Apache Thrift. 
+This product bundles Apache Thrift (bundled by Parquet). Copyright: 2006-2017 The Apache Software Foundation. Project URL: https://thrift.apache.org/ @@ -235,7 +235,7 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product includes code from Daniel Lemire's JavaFastPFOR project. +This product includes code from Daniel Lemire's JavaFastPFOR project (bundled by Parquet). Copyright: 2013 Daniel Lemire Project URL: https://github.com/lemire/JavaFastPFOR @@ -243,7 +243,7 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles fastutil. +This product bundles fastutil (bundled by Parquet). Copyright: 2002-2014 Sebastiano Vigna Project URL: http://fastutil.di.unimi.it/ @@ -251,6 +251,13 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- +This product bundles Zero-Allocation Hashing (bundled by Parquet). + +Project URL: https://github.com/OpenHFT/Zero-Allocation-Hashing +License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + This product bundles Apache ORC. Copyright: 2013 and onwards The Apache Software Foundation. @@ -259,7 +266,7 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Apache Hive. +This product bundles Apache Hive's Storage API (bundled by ORC). 
Copyright: 2008-2020 The Apache Software Foundation Project URL: https://hive.apache.org/ @@ -267,11 +274,12 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Google protobuf. +This product bundles Google protobuf (bundled by ORC). Copyright: 2008 Google Inc. Project URL: https://developers.google.com/protocol-buffers License: BSD 3-Clause + | Copyright 2008 Google Inc. All rights reserved. | | Redistribution and use in source and binary forms, with or without @@ -390,20 +398,6 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Perfmark. - -Project URL: https://github.com/perfmark/perfmark -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles org.json. - -Project URL: https://github.com/douglascrockford/JSON-java -License: Public Domain - https://github.com/stleary/JSON-java/blob/master/LICENSE - --------------------------------------------------------------------------------- - This product bundles Apache Arrow. Copyright: 2016-2019 The Apache Software Foundation. @@ -428,42 +422,6 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles ThreeTen BP. - -Project URL: https://www.threeten.org/threetenbp -License: BSD 3-Clause -| Copyright (c) 2007-present, Stephen Colebourne & Michael Nascimento Santos. -| -| All rights reserved. 
-| -| Redistribution and use in source and binary forms, with or without -| modification, are permitted provided that the following conditions are met: -| -| * Redistributions of source code must retain the above copyright notice, -| this list of conditions and the following disclaimer. -| -| * Redistributions in binary form must reproduce the above copyright notice, -| this list of conditions and the following disclaimer in the documentation -| and/or other materials provided with the distribution. -| -| * Neither the name of JSR-310 nor the names of its contributors -| may be used to endorse or promote products derived from this software -| without specific prior written permission. -| -| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -| CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -| EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -| PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -| PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -| LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -| NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -| SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - --------------------------------------------------------------------------------- - This product bundles ThreeTen Extra. Copyright: 2007-present, Stephen Colebourne & Michael Nascimento Santos. @@ -540,19 +498,11 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Apache HttpComponents (client and core). - -Copyright: 1999-2022 The Apache Software Foundation. 
-Project URL: https://hc.apache.org/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product includes code from Apache HttpComponents Client. +This product bundles and includes code from Apache HttpComponents (core/client). * retry and error handling logic in ExponentialHttpRequestRetryStrategy.java -Copyright: 1999-2022 The Apache Software Foundation. +Copyright: 1999-2022 The Apache Software Foundation Project URL: https://hc.apache.org/ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 @@ -573,16 +523,16 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Apache Datasketches. +This product bundles Eclipse Collections. -Project URL: https://datasketches.apache.org -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 +Project URL: https://github.com/eclipse-collections/eclipse-collections +License: Eclipse Public License v. 1.0 - https://www.eclipse.org/legal/epl-v10.html -------------------------------------------------------------------------------- -This product bundles Zero-Allocation Hashing. +This product bundles Apache Datasketches. -Project URL: https://github.com/OpenHFT/Zero-Allocation-Hashing +Project URL: https://datasketches.apache.org License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 -------------------------------------------------------------------------------- @@ -594,195 +544,9 @@ License: Eclipse Public License v. 2.0 - https://www.eclipse.org/org/documents/e -------------------------------------------------------------------------------- -This product bundles JSpecify. 
- -Project URL: https://github.com/jspecify/jspecify -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Animal Sniffer Annotations. - -License: MIT -| The MIT License -| -| Copyright (c) 2009 codehaus.org. -| -| Permission is hereby granted, free of charge, to any person obtaining a copy -| of this software and associated documentation files (the "Software"), to deal -| in the Software without restriction, including without limitation the rights -| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -| copies of the Software, and to permit persons to whom the Software is -| furnished to do so, subject to the following conditions: -| -| The above copyright notice and this permission notice shall be included in -| all copies or substantial portions of the Software. -| -| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -| THE SOFTWARE. - --------------------------------------------------------------------------------- - -This product bundles Android Annotations. - -Project URL: http://source.android.com/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Conscrypt (openjdk-uber). 
- -Project URL: https://conscrypt.org/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Google Cloud BigQuery Client for Java. - -Project URL: https://github.com/googleapis/java-bigquery -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Google Cloud Java Client Libraries. - -Project URL: https://github.com/googleapis/google-cloud-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - This product bundles RoaringBitmap. Copyright: (c) 2013-... the RoaringBitmap authors Project URL: https://github.com/RoaringBitmap/RoaringBitmap License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 --------------------------------------------------------------------------------- - -This product bundles Alibaba Cloud Credentials for Java. - -Project URL: https://github.com/aliyun/credentials-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Alibaba Cloud Tea for Java. - -Project URL: https://github.com/aliyun/tea-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Google GAX. - -Project URL: https://github.com/googleapis/gax-java -License: BSD 3-Clause -| Copyright 2016, Google Inc. All rights reserved. 
-| -| Redistribution and use in source and binary forms, with or without -| modification, are permitted provided that the following conditions are -| met: -| -| * Redistributions of source code must retain the above copyright -| notice, this list of conditions and the following disclaimer. -| * Redistributions in binary form must reproduce the above -| copyright notice, this list of conditions and the following disclaimer -| in the documentation and/or other materials provided with the -| distribution. -| * Neither the name of Google Inc. nor the names of its -| contributors may be used to endorse or promote products derived from -| this software without specific prior written permission. -| -| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -| OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -| SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -| LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -| DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -| THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -| (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - --------------------------------------------------------------------------------- - -This product bundles Google Gson. - -Project URL: https://github.com/google/gson/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles gRPC. 
- -Project URL: https://github.com/grpc/grpc-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles OpenCensus. - -Project URL: https://github.com/census-instrumentation/opencensus-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles OkHttp. - -Project URL: https://github.com/square/okhttp -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Google Auto Value. - -Project URL: https://github.com/google/auto/tree/master/value -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles JaCoCo. - -Project URL: https://github.com/jacoco/jacoco/ -License: Eclipse Public License v. 2.0 - https://www.eclipse.org/org/documents/epl-2.0/EPL-2.0.txt - --------------------------------------------------------------------------------- - -This product bundles OpenTelemetry. - -Project URL: https://opentelemetry.io/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Kotlin. - -Project URL: https://github.com/JetBrains/kotlin -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles JAXB. 
- -Project URL: http://jaxb.java.net/ -License: CDDL 1.1 - https://glassfish.java.net/public/CDDL+GPL_1_1.html - --------------------------------------------------------------------------------- - -This product bundles EMMA runtime. - -Project URL: https://github.com/ehelms/Emma/ -License: Common Public License - v 1.0 - --------------------------------------------------------------------------------- - -This product bundles Google j2objc. - -Project URL: https://github.com/google/j2objc/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 diff --git a/spark/v3.5/spark-runtime/NOTICE b/spark/v3.5/spark-runtime/NOTICE index 68abd73906b1..17989b43a371 100644 --- a/spark/v3.5/spark-runtime/NOTICE +++ b/spark/v3.5/spark-runtime/NOTICE @@ -66,42 +66,6 @@ This product bundles Airlift Aircompressor with the following in its NOTICE file -------------------------------------------------------------------------------- -This product bundles Google Protobuf with the following in its NOTICE file: -| Copyright 2008 Google Inc. All rights reserved. -| -| Redistribution and use in source and binary forms, with or without -| modification, are permitted provided that the following conditions are -| met: -| -| * Redistributions of source code must retain the above copyright -| notice, this list of conditions and the following disclaimer. -| * Redistributions in binary form must reproduce the above -| copyright notice, this list of conditions and the following disclaimer -| in the documentation and/or other materials provided with the -| distribution. -| * Neither the name of Google Inc. nor the names of its -| contributors may be used to endorse or promote products derived from -| this software without specific prior written permission. 
-| -| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -| OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -| SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -| LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -| DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -| THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -| (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -| -| Code generated by the Protocol Buffer compiler is owned by the owner -| of the input file used when generating it. This code is not -| standalone and requires a support library to be linked with it. This -| support library is itself covered by the above license. - --------------------------------------------------------------------------------- - This product bundles Netty with the following in its NOTICE file: | The Netty Project | ================= @@ -392,68 +356,3 @@ This product bundles Eclipse MicroProfile OpenAPI with the following in its NOTI | Arthur De Magalhaes arthurdm@ca.ibm.com | --------------------------------------------------------------------------------- - -This product bundles gRPC with the following in its NOTICE file: -| Copyright 2014 The gRPC Authors -| -| Licensed under the Apache License, Version 2.0 (the "License"); -| you may not use this file except in compliance with the License. 
-| You may obtain a copy of the License at -| -| http://www.apache.org/licenses/LICENSE-2.0 -| -| Unless required by applicable law or agreed to in writing, software -| distributed under the License is distributed on an "AS IS" BASIS, -| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -| See the License for the specific language governing permissions and -| limitations under the License. -| -| ----------------------------------------------------------------------- -| -| This product contains a modified portion of 'OkHttp', an open source -| HTTP & SPDY client for Android and Java applications, which can be obtained -| at: -| -| * LICENSE: -| * okhttp/third_party/okhttp/LICENSE (Apache License 2.0) -| * HOMEPAGE: -| * https://github.com/square/okhttp -| * LOCATION_IN_GRPC: -| * okhttp/third_party/okhttp -| -| This product contains a modified portion of 'Envoy', an open source -| cloud-native high-performance edge/middle/service proxy, which can be -| obtained at: -| -| * LICENSE: -| * xds/third_party/envoy/LICENSE (Apache License 2.0) -| * NOTICE: -| * xds/third_party/envoy/NOTICE -| * HOMEPAGE: -| * https://www.envoyproxy.io -| * LOCATION_IN_GRPC: -| * xds/third_party/envoy -| -| This product contains a modified portion of 'protoc-gen-validate (PGV)', -| an open source protoc plugin to generate polyglot message validators, -| which can be obtained at: -| -| * LICENSE: -| * xds/third_party/protoc-gen-validate/LICENSE (Apache License 2.0) -| * NOTICE: -| * xds/third_party/protoc-gen-validate/NOTICE -| * HOMEPAGE: -| * https://github.com/envoyproxy/protoc-gen-validate -| * LOCATION_IN_GRPC: -| * xds/third_party/protoc-gen-validate -| -| This product contains a modified portion of 'udpa', -| an open source universal data plane API, which can be obtained at: -| -| * LICENSE: -| * xds/third_party/udpa/LICENSE (Apache License 2.0) -| * HOMEPAGE: -| * https://github.com/cncf/udpa -| * LOCATION_IN_GRPC: -| * xds/third_party/udpa diff --git 
a/spark/v4.0/spark-runtime/LICENSE b/spark/v4.0/spark-runtime/LICENSE index a67296eb412c..24a9e3706d17 100644 --- a/spark/v4.0/spark-runtime/LICENSE +++ b/spark/v4.0/spark-runtime/LICENSE @@ -227,7 +227,7 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Apache Thrift. +This product bundles Apache Thrift (bundled by Parquet). Copyright: 2006-2017 The Apache Software Foundation. Project URL: https://thrift.apache.org/ @@ -235,7 +235,7 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product includes code from Daniel Lemire's JavaFastPFOR project. +This product includes code from Daniel Lemire's JavaFastPFOR project (bundled by Parquet). Copyright: 2013 Daniel Lemire Project URL: https://github.com/lemire/JavaFastPFOR @@ -243,7 +243,7 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles fastutil. +This product bundles fastutil (bundled by Parquet). Copyright: 2002-2014 Sebastiano Vigna Project URL: http://fastutil.di.unimi.it/ @@ -251,6 +251,13 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- +This product bundles Zero-Allocation Hashing (bundled by Parquet). + +Project URL: https://github.com/OpenHFT/Zero-Allocation-Hashing +License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + This product bundles Apache ORC. Copyright: 2013 and onwards The Apache Software Foundation. 
@@ -259,7 +266,7 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Apache Hive. +This product bundles Apache Hive's Storage API (bundled by ORC). Copyright: 2008-2020 The Apache Software Foundation Project URL: https://hive.apache.org/ @@ -267,11 +274,12 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Google protobuf. +This product bundles Google protobuf (bundled by ORC). Copyright: 2008 Google Inc. Project URL: https://developers.google.com/protocol-buffers License: BSD 3-Clause + | Copyright 2008 Google Inc. All rights reserved. | | Redistribution and use in source and binary forms, with or without @@ -390,20 +398,6 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Perfmark. - -Project URL: https://github.com/perfmark/perfmark -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles org.json. - -Project URL: https://github.com/douglascrockford/JSON-java -License: Public Domain - https://github.com/stleary/JSON-java/blob/master/LICENSE - --------------------------------------------------------------------------------- - This product bundles Apache Arrow. Copyright: 2016-2019 The Apache Software Foundation. @@ -428,42 +422,6 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles ThreeTen BP. 
- -Project URL: https://www.threeten.org/threetenbp -License: BSD 3-Clause -| Copyright (c) 2007-present, Stephen Colebourne & Michael Nascimento Santos. -| -| All rights reserved. -| -| Redistribution and use in source and binary forms, with or without -| modification, are permitted provided that the following conditions are met: -| -| * Redistributions of source code must retain the above copyright notice, -| this list of conditions and the following disclaimer. -| -| * Redistributions in binary form must reproduce the above copyright notice, -| this list of conditions and the following disclaimer in the documentation -| and/or other materials provided with the distribution. -| -| * Neither the name of JSR-310 nor the names of its contributors -| may be used to endorse or promote products derived from this software -| without specific prior written permission. -| -| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -| CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -| EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -| PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -| PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -| LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -| NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -| SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - --------------------------------------------------------------------------------- - This product bundles ThreeTen Extra. Copyright: 2007-present, Stephen Colebourne & Michael Nascimento Santos. 
@@ -540,19 +498,11 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Apache HttpComponents (client and core). - -Copyright: 1999-2022 The Apache Software Foundation. -Project URL: https://hc.apache.org/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product includes code from Apache HttpComponents Client. +This product bundles and includes code from Apache HttpComponents (core/client). * retry and error handling logic in ExponentialHttpRequestRetryStrategy.java -Copyright: 1999-2022 The Apache Software Foundation. +Copyright: 1999-2022 The Apache Software Foundation Project URL: https://hc.apache.org/ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 @@ -573,16 +523,16 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Apache Datasketches. +This product bundles Eclipse Collections. -Project URL: https://datasketches.apache.org -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 +Project URL: https://github.com/eclipse-collections/eclipse-collections +License: Eclipse Public License v. 1.0 - https://www.eclipse.org/legal/epl-v10.html -------------------------------------------------------------------------------- -This product bundles Zero-Allocation Hashing. +This product bundles Apache Datasketches. 
-Project URL: https://github.com/OpenHFT/Zero-Allocation-Hashing +Project URL: https://datasketches.apache.org License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 -------------------------------------------------------------------------------- @@ -594,195 +544,9 @@ License: Eclipse Public License v. 2.0 - https://www.eclipse.org/org/documents/e -------------------------------------------------------------------------------- -This product bundles JSpecify. - -Project URL: https://github.com/jspecify/jspecify -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Animal Sniffer Annotations. - -License: MIT -| The MIT License -| -| Copyright (c) 2009 codehaus.org. -| -| Permission is hereby granted, free of charge, to any person obtaining a copy -| of this software and associated documentation files (the "Software"), to deal -| in the Software without restriction, including without limitation the rights -| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -| copies of the Software, and to permit persons to whom the Software is -| furnished to do so, subject to the following conditions: -| -| The above copyright notice and this permission notice shall be included in -| all copies or substantial portions of the Software. -| -| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -| THE SOFTWARE. 
- --------------------------------------------------------------------------------- - -This product bundles Android Annotations. - -Project URL: http://source.android.com/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Conscrypt (openjdk-uber). - -Project URL: https://conscrypt.org/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Google Cloud BigQuery Client for Java. - -Project URL: https://github.com/googleapis/java-bigquery -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Google Cloud Java Client Libraries. - -Project URL: https://github.com/googleapis/google-cloud-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - This product bundles RoaringBitmap. Copyright: (c) 2013-... the RoaringBitmap authors Project URL: https://github.com/RoaringBitmap/RoaringBitmap License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 --------------------------------------------------------------------------------- - -This product bundles Alibaba Cloud Credentials for Java. - -Project URL: https://github.com/aliyun/credentials-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Alibaba Cloud Tea for Java. 
- -Project URL: https://github.com/aliyun/tea-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Google GAX. - -Project URL: https://github.com/googleapis/gax-java -License: BSD 3-Clause -| Copyright 2016, Google Inc. All rights reserved. -| -| Redistribution and use in source and binary forms, with or without -| modification, are permitted provided that the following conditions are -| met: -| -| * Redistributions of source code must retain the above copyright -| notice, this list of conditions and the following disclaimer. -| * Redistributions in binary form must reproduce the above -| copyright notice, this list of conditions and the following disclaimer -| in the documentation and/or other materials provided with the -| distribution. -| * Neither the name of Google Inc. nor the names of its -| contributors may be used to endorse or promote products derived from -| this software without specific prior written permission. -| -| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -| OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -| SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -| LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -| DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -| THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -| (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - --------------------------------------------------------------------------------- - -This product bundles Google Gson. 
- -Project URL: https://github.com/google/gson/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles gRPC. - -Project URL: https://github.com/grpc/grpc-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles OpenCensus. - -Project URL: https://github.com/census-instrumentation/opencensus-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles OkHttp. - -Project URL: https://github.com/square/okhttp -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Google Auto Value. - -Project URL: https://github.com/google/auto/tree/master/value -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles JaCoCo. - -Project URL: https://github.com/jacoco/jacoco/ -License: Eclipse Public License v. 2.0 - https://www.eclipse.org/org/documents/epl-2.0/EPL-2.0.txt - --------------------------------------------------------------------------------- - -This product bundles OpenTelemetry. - -Project URL: https://opentelemetry.io/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Kotlin. 
- -Project URL: https://github.com/JetBrains/kotlin -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles JAXB. - -Project URL: http://jaxb.java.net/ -License: CDDL 1.1 - https://glassfish.java.net/public/CDDL+GPL_1_1.html - --------------------------------------------------------------------------------- - -This product bundles EMMA runtime. - -Project URL: https://github.com/ehelms/Emma/ -License: Common Public License - v 1.0 - --------------------------------------------------------------------------------- - -This product bundles Google j2objc. - -Project URL: https://github.com/google/j2objc/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 diff --git a/spark/v4.0/spark-runtime/NOTICE b/spark/v4.0/spark-runtime/NOTICE index 68abd73906b1..17989b43a371 100644 --- a/spark/v4.0/spark-runtime/NOTICE +++ b/spark/v4.0/spark-runtime/NOTICE @@ -66,42 +66,6 @@ This product bundles Airlift Aircompressor with the following in its NOTICE file -------------------------------------------------------------------------------- -This product bundles Google Protobuf with the following in its NOTICE file: -| Copyright 2008 Google Inc. All rights reserved. -| -| Redistribution and use in source and binary forms, with or without -| modification, are permitted provided that the following conditions are -| met: -| -| * Redistributions of source code must retain the above copyright -| notice, this list of conditions and the following disclaimer. -| * Redistributions in binary form must reproduce the above -| copyright notice, this list of conditions and the following disclaimer -| in the documentation and/or other materials provided with the -| distribution. -| * Neither the name of Google Inc. 
nor the names of its -| contributors may be used to endorse or promote products derived from -| this software without specific prior written permission. -| -| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -| OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -| SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -| LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -| DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -| THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -| (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -| -| Code generated by the Protocol Buffer compiler is owned by the owner -| of the input file used when generating it. This code is not -| standalone and requires a support library to be linked with it. This -| support library is itself covered by the above license. - --------------------------------------------------------------------------------- - This product bundles Netty with the following in its NOTICE file: | The Netty Project | ================= @@ -392,68 +356,3 @@ This product bundles Eclipse MicroProfile OpenAPI with the following in its NOTI | Arthur De Magalhaes arthurdm@ca.ibm.com | --------------------------------------------------------------------------------- - -This product bundles gRPC with the following in its NOTICE file: -| Copyright 2014 The gRPC Authors -| -| Licensed under the Apache License, Version 2.0 (the "License"); -| you may not use this file except in compliance with the License. 
-| You may obtain a copy of the License at -| -| http://www.apache.org/licenses/LICENSE-2.0 -| -| Unless required by applicable law or agreed to in writing, software -| distributed under the License is distributed on an "AS IS" BASIS, -| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -| See the License for the specific language governing permissions and -| limitations under the License. -| -| ----------------------------------------------------------------------- -| -| This product contains a modified portion of 'OkHttp', an open source -| HTTP & SPDY client for Android and Java applications, which can be obtained -| at: -| -| * LICENSE: -| * okhttp/third_party/okhttp/LICENSE (Apache License 2.0) -| * HOMEPAGE: -| * https://github.com/square/okhttp -| * LOCATION_IN_GRPC: -| * okhttp/third_party/okhttp -| -| This product contains a modified portion of 'Envoy', an open source -| cloud-native high-performance edge/middle/service proxy, which can be -| obtained at: -| -| * LICENSE: -| * xds/third_party/envoy/LICENSE (Apache License 2.0) -| * NOTICE: -| * xds/third_party/envoy/NOTICE -| * HOMEPAGE: -| * https://www.envoyproxy.io -| * LOCATION_IN_GRPC: -| * xds/third_party/envoy -| -| This product contains a modified portion of 'protoc-gen-validate (PGV)', -| an open source protoc plugin to generate polyglot message validators, -| which can be obtained at: -| -| * LICENSE: -| * xds/third_party/protoc-gen-validate/LICENSE (Apache License 2.0) -| * NOTICE: -| * xds/third_party/protoc-gen-validate/NOTICE -| * HOMEPAGE: -| * https://github.com/envoyproxy/protoc-gen-validate -| * LOCATION_IN_GRPC: -| * xds/third_party/protoc-gen-validate -| -| This product contains a modified portion of 'udpa', -| an open source universal data plane API, which can be obtained at: -| -| * LICENSE: -| * xds/third_party/udpa/LICENSE (Apache License 2.0) -| * HOMEPAGE: -| * https://github.com/cncf/udpa -| * LOCATION_IN_GRPC: -| * xds/third_party/udpa From 
b0df3ca01d61b2f7ae7143ac660c6b16e33b6e46 Mon Sep 17 00:00:00 2001 From: Yuya Ebihara Date: Wed, 29 Apr 2026 11:05:57 +0900 Subject: [PATCH 124/197] Build: Bump software.amazon.awssdk:bom from 2.42.33 to 2.42.36 (#16151) Bumps software.amazon.awssdk:bom from 2.42.33 to 2.42.36. --- updated-dependencies: - dependency-name: software.amazon.awssdk:bom dependency-version: 2.42.36 dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- aws-bundle/runtime-deps.txt | 90 +++++++++---------- gradle/libs.versions.toml | 2 +- .../kafka-connect-runtime/runtime-deps.txt | 80 ++++++++--------- 3 files changed, 86 insertions(+), 86 deletions(-) diff --git a/aws-bundle/runtime-deps.txt b/aws-bundle/runtime-deps.txt index fc2514d5373d..730259fca97b 100644 --- a/aws-bundle/runtime-deps.txt +++ b/aws-bundle/runtime-deps.txt @@ -16,51 +16,51 @@ org.apache.httpcomponents:httpclient:4.5.13 org.apache.httpcomponents:httpcore:4.4.16 org.checkerframework:checker-qual:3.19.0 org.reactivestreams:reactive-streams:1.0.4 -software.amazon.awssdk.crt:aws-crt:0.43.9 -software.amazon.awssdk:annotations:2.42.33 -software.amazon.awssdk:apache-client:2.42.33 -software.amazon.awssdk:arns:2.42.33 -software.amazon.awssdk:auth:2.42.33 -software.amazon.awssdk:aws-core:2.42.33 -software.amazon.awssdk:aws-json-protocol:2.42.33 -software.amazon.awssdk:aws-query-protocol:2.42.33 -software.amazon.awssdk:aws-xml-protocol:2.42.33 -software.amazon.awssdk:checksums-spi:2.42.33 -software.amazon.awssdk:checksums:2.42.33 -software.amazon.awssdk:cloudwatch-metric-publisher:2.42.33 -software.amazon.awssdk:cloudwatch:2.42.33 -software.amazon.awssdk:crt-core:2.42.33 -software.amazon.awssdk:dynamodb:2.42.33 -software.amazon.awssdk:endpoints-spi:2.42.33 -software.amazon.awssdk:glue:2.42.33 -software.amazon.awssdk:http-auth-aws-crt:2.42.33 
-software.amazon.awssdk:http-auth-aws-eventstream:2.42.33 -software.amazon.awssdk:http-auth-aws:2.42.33 -software.amazon.awssdk:http-auth-spi:2.42.33 -software.amazon.awssdk:http-auth:2.42.33 -software.amazon.awssdk:http-client-spi:2.42.33 -software.amazon.awssdk:iam:2.42.33 -software.amazon.awssdk:identity-spi:2.42.33 -software.amazon.awssdk:json-utils:2.42.33 -software.amazon.awssdk:kms:2.42.33 -software.amazon.awssdk:lakeformation:2.42.33 -software.amazon.awssdk:metrics-spi:2.42.33 -software.amazon.awssdk:netty-nio-client:2.42.33 -software.amazon.awssdk:profiles:2.42.33 -software.amazon.awssdk:protocol-core:2.42.33 -software.amazon.awssdk:regions:2.42.33 -software.amazon.awssdk:retries-spi:2.42.33 -software.amazon.awssdk:retries:2.42.33 -software.amazon.awssdk:s3:2.42.33 -software.amazon.awssdk:s3control:2.42.33 -software.amazon.awssdk:sdk-core:2.42.33 -software.amazon.awssdk:smithy-rpcv2-protocol:2.42.33 -software.amazon.awssdk:sso:2.42.33 -software.amazon.awssdk:sts:2.42.33 -software.amazon.awssdk:third-party-jackson-core:2.42.33 -software.amazon.awssdk:third-party-jackson-dataformat-cbor:2.42.33 -software.amazon.awssdk:utils-lite:2.42.33 -software.amazon.awssdk:utils:2.42.33 +software.amazon.awssdk.crt:aws-crt:0.44.0 +software.amazon.awssdk:annotations:2.42.36 +software.amazon.awssdk:apache-client:2.42.36 +software.amazon.awssdk:arns:2.42.36 +software.amazon.awssdk:auth:2.42.36 +software.amazon.awssdk:aws-core:2.42.36 +software.amazon.awssdk:aws-json-protocol:2.42.36 +software.amazon.awssdk:aws-query-protocol:2.42.36 +software.amazon.awssdk:aws-xml-protocol:2.42.36 +software.amazon.awssdk:checksums-spi:2.42.36 +software.amazon.awssdk:checksums:2.42.36 +software.amazon.awssdk:cloudwatch-metric-publisher:2.42.36 +software.amazon.awssdk:cloudwatch:2.42.36 +software.amazon.awssdk:crt-core:2.42.36 +software.amazon.awssdk:dynamodb:2.42.36 +software.amazon.awssdk:endpoints-spi:2.42.36 +software.amazon.awssdk:glue:2.42.36 
+software.amazon.awssdk:http-auth-aws-crt:2.42.36 +software.amazon.awssdk:http-auth-aws-eventstream:2.42.36 +software.amazon.awssdk:http-auth-aws:2.42.36 +software.amazon.awssdk:http-auth-spi:2.42.36 +software.amazon.awssdk:http-auth:2.42.36 +software.amazon.awssdk:http-client-spi:2.42.36 +software.amazon.awssdk:iam:2.42.36 +software.amazon.awssdk:identity-spi:2.42.36 +software.amazon.awssdk:json-utils:2.42.36 +software.amazon.awssdk:kms:2.42.36 +software.amazon.awssdk:lakeformation:2.42.36 +software.amazon.awssdk:metrics-spi:2.42.36 +software.amazon.awssdk:netty-nio-client:2.42.36 +software.amazon.awssdk:profiles:2.42.36 +software.amazon.awssdk:protocol-core:2.42.36 +software.amazon.awssdk:regions:2.42.36 +software.amazon.awssdk:retries-spi:2.42.36 +software.amazon.awssdk:retries:2.42.36 +software.amazon.awssdk:s3:2.42.36 +software.amazon.awssdk:s3control:2.42.36 +software.amazon.awssdk:sdk-core:2.42.36 +software.amazon.awssdk:smithy-rpcv2-protocol:2.42.36 +software.amazon.awssdk:sso:2.42.36 +software.amazon.awssdk:sts:2.42.36 +software.amazon.awssdk:third-party-jackson-core:2.42.36 +software.amazon.awssdk:third-party-jackson-dataformat-cbor:2.42.36 +software.amazon.awssdk:utils-lite:2.42.36 +software.amazon.awssdk:utils:2.42.36 software.amazon.eventstream:eventstream:1.0.1 software.amazon.s3.accessgrants:aws-s3-accessgrants-java-plugin:2.4.1 software.amazon.s3.analyticsaccelerator:analyticsaccelerator-s3:1.3.1 diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index fad41cf94a23..7b579affa39b 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -33,7 +33,7 @@ arrow = "15.0.2" avro = "1.12.1" assertj-core = "3.27.7" awaitility = "4.3.0" -awssdk-bom = "2.42.33" +awssdk-bom = "2.42.36" azuresdk-bom = "1.3.6" awssdk-s3accessgrants = "2.4.1" bouncycastle = "1.84" diff --git a/kafka-connect/kafka-connect-runtime/runtime-deps.txt b/kafka-connect/kafka-connect-runtime/runtime-deps.txt index 98b7ced14217..3da8c1fe8e3e 100644 --- 
a/kafka-connect/kafka-connect-runtime/runtime-deps.txt +++ b/kafka-connect/kafka-connect-runtime/runtime-deps.txt @@ -190,44 +190,44 @@ org.slf4j:slf4j-api:2.0.17 org.threeten:threeten-extra:1.8.0 org.threeten:threetenbp:1.7.0 org.xerial.snappy:snappy-java:1.1.10.8 -software.amazon.awssdk.crt:aws-crt:0.43.9 -software.amazon.awssdk:annotations:2.42.33 -software.amazon.awssdk:apache-client:2.42.33 -software.amazon.awssdk:arns:2.42.33 -software.amazon.awssdk:auth:2.42.33 -software.amazon.awssdk:aws-core:2.42.33 -software.amazon.awssdk:aws-json-protocol:2.42.33 -software.amazon.awssdk:aws-query-protocol:2.42.33 -software.amazon.awssdk:aws-xml-protocol:2.42.33 -software.amazon.awssdk:checksums-spi:2.42.33 -software.amazon.awssdk:checksums:2.42.33 -software.amazon.awssdk:crt-core:2.42.33 -software.amazon.awssdk:dynamodb:2.42.33 -software.amazon.awssdk:endpoints-spi:2.42.33 -software.amazon.awssdk:glue:2.42.33 -software.amazon.awssdk:http-auth-aws-crt:2.42.33 -software.amazon.awssdk:http-auth-aws-eventstream:2.42.33 -software.amazon.awssdk:http-auth-aws:2.42.33 -software.amazon.awssdk:http-auth-spi:2.42.33 -software.amazon.awssdk:http-auth:2.42.33 -software.amazon.awssdk:http-client-spi:2.42.33 -software.amazon.awssdk:iam:2.42.33 -software.amazon.awssdk:identity-spi:2.42.33 -software.amazon.awssdk:json-utils:2.42.33 -software.amazon.awssdk:kms:2.42.33 -software.amazon.awssdk:lakeformation:2.42.33 -software.amazon.awssdk:metrics-spi:2.42.33 -software.amazon.awssdk:netty-nio-client:2.42.33 -software.amazon.awssdk:profiles:2.42.33 -software.amazon.awssdk:protocol-core:2.42.33 -software.amazon.awssdk:regions:2.42.33 -software.amazon.awssdk:retries-spi:2.42.33 -software.amazon.awssdk:retries:2.42.33 -software.amazon.awssdk:s3:2.42.33 -software.amazon.awssdk:sdk-core:2.42.33 -software.amazon.awssdk:sso:2.42.33 -software.amazon.awssdk:sts:2.42.33 -software.amazon.awssdk:third-party-jackson-core:2.42.33 -software.amazon.awssdk:utils-lite:2.42.33 
-software.amazon.awssdk:utils:2.42.33 +software.amazon.awssdk.crt:aws-crt:0.44.0 +software.amazon.awssdk:annotations:2.42.36 +software.amazon.awssdk:apache-client:2.42.36 +software.amazon.awssdk:arns:2.42.36 +software.amazon.awssdk:auth:2.42.36 +software.amazon.awssdk:aws-core:2.42.36 +software.amazon.awssdk:aws-json-protocol:2.42.36 +software.amazon.awssdk:aws-query-protocol:2.42.36 +software.amazon.awssdk:aws-xml-protocol:2.42.36 +software.amazon.awssdk:checksums-spi:2.42.36 +software.amazon.awssdk:checksums:2.42.36 +software.amazon.awssdk:crt-core:2.42.36 +software.amazon.awssdk:dynamodb:2.42.36 +software.amazon.awssdk:endpoints-spi:2.42.36 +software.amazon.awssdk:glue:2.42.36 +software.amazon.awssdk:http-auth-aws-crt:2.42.36 +software.amazon.awssdk:http-auth-aws-eventstream:2.42.36 +software.amazon.awssdk:http-auth-aws:2.42.36 +software.amazon.awssdk:http-auth-spi:2.42.36 +software.amazon.awssdk:http-auth:2.42.36 +software.amazon.awssdk:http-client-spi:2.42.36 +software.amazon.awssdk:iam:2.42.36 +software.amazon.awssdk:identity-spi:2.42.36 +software.amazon.awssdk:json-utils:2.42.36 +software.amazon.awssdk:kms:2.42.36 +software.amazon.awssdk:lakeformation:2.42.36 +software.amazon.awssdk:metrics-spi:2.42.36 +software.amazon.awssdk:netty-nio-client:2.42.36 +software.amazon.awssdk:profiles:2.42.36 +software.amazon.awssdk:protocol-core:2.42.36 +software.amazon.awssdk:regions:2.42.36 +software.amazon.awssdk:retries-spi:2.42.36 +software.amazon.awssdk:retries:2.42.36 +software.amazon.awssdk:s3:2.42.36 +software.amazon.awssdk:sdk-core:2.42.36 +software.amazon.awssdk:sso:2.42.36 +software.amazon.awssdk:sts:2.42.36 +software.amazon.awssdk:third-party-jackson-core:2.42.36 +software.amazon.awssdk:utils-lite:2.42.36 +software.amazon.awssdk:utils:2.42.36 software.amazon.eventstream:eventstream:1.0.1 From 099ef477c8024b3bcb4d12abca18944c25b0eb45 Mon Sep 17 00:00:00 2001 From: Hongyue/Steve Zhang Date: Tue, 28 Apr 2026 20:11:36 -0700 Subject: [PATCH 125/197] Core: Validate v2 
deletes against concurrent format upgrade (#16146) * Core: validate buffered v2 deletes against concurrent format upgrade * rename to validateDeleteFilesForVersion --- .../iceberg/MergingSnapshotProducer.java | 18 +++++++++++++--- .../java/org/apache/iceberg/TestRowDelta.java | 21 +++++++++++++++++++ 2 files changed, 36 insertions(+), 3 deletions(-) diff --git a/core/src/main/java/org/apache/iceberg/MergingSnapshotProducer.java b/core/src/main/java/org/apache/iceberg/MergingSnapshotProducer.java index eed4e56dc05a..e072382543b7 100644 --- a/core/src/main/java/org/apache/iceberg/MergingSnapshotProducer.java +++ b/core/src/main/java/org/apache/iceberg/MergingSnapshotProducer.java @@ -289,7 +289,11 @@ private void addInternal(DeleteFile file) { protected void validateNewDeleteFile(DeleteFile file) { Preconditions.checkNotNull(file, "Invalid delete file: null"); - switch (formatVersion()) { + validateDeleteFileForVersion(file, formatVersion()); + } + + private static void validateDeleteFileForVersion(DeleteFile file, int formatVersion) { + switch (formatVersion) { case 1: throw new IllegalArgumentException("Deletes are supported in V2 and above"); case 2: @@ -303,11 +307,11 @@ protected void validateNewDeleteFile(DeleteFile file) { Preconditions.checkArgument( file.content() == FileContent.EQUALITY_DELETES || ContentFileUtil.isDV(file), "Must use DVs for position deletes in V%s: %s", - formatVersion(), + formatVersion, file.location()); break; default: - throw new IllegalArgumentException("Unsupported format version: " + formatVersion()); + throw new IllegalArgumentException("Unsupported format version: " + formatVersion); } } @@ -959,8 +963,16 @@ protected Map summary() { return summaryBuilder.build(); } + // guard buffered deletes against concurrent format upgrade + private void validateDeleteFilesForVersion(int currentFormatVersion) { + for (DeleteFile file : v2Deletes) { + validateDeleteFileForVersion(file, currentFormatVersion); + } + } + @Override public List 
apply(TableMetadata base, Snapshot snapshot) { + validateDeleteFilesForVersion(base.formatVersion()); // filter any existing manifests List filtered = filterManager.filterManifests( diff --git a/core/src/test/java/org/apache/iceberg/TestRowDelta.java b/core/src/test/java/org/apache/iceberg/TestRowDelta.java index c442541289a9..aaccf4122481 100644 --- a/core/src/test/java/org/apache/iceberg/TestRowDelta.java +++ b/core/src/test/java/org/apache/iceberg/TestRowDelta.java @@ -2419,6 +2419,27 @@ public void testManifestMergingAfterUpgradeToV3() { assertThat(taskDV.contentSizeInBytes()).isEqualTo(dv.contentSizeInBytes()); } + @TestTemplate + public void testV2StagedPositionDeleteCannotCommitToV3() { + assumeThat(formatVersion).isEqualTo(2); + + Snapshot initial = commit(table, table.newAppend().appendFile(FILE_A), branch); + + // Stage RowDelta at v2: position delete for FILE_A + add new data FILE_B. + RowDelta rowDelta = table.newRowDelta().addDeletes(FILE_A_DELETES).addRows(FILE_B); + + // upgrade the table + table.updateProperties().set(TableProperties.FORMAT_VERSION, "3").commit(); + + assertThatThrownBy(rowDelta::commit) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("Must use DVs for position deletes in V3"); + + table.refresh(); + assertThat(table.operations().current().formatVersion()).isEqualTo(3); + assertThat(table.snapshot(branch)).isEqualTo(initial); + } + @TestTemplate public void testInabilityToAddPositionDeleteFilesInTablesWithDVs() { assumeThat(formatVersion).isGreaterThanOrEqualTo(3); From 8ac703067a2adfae1928748712dc1d47dbc3c22b Mon Sep 17 00:00:00 2001 From: Yuya Ebihara Date: Wed, 29 Apr 2026 14:36:21 +0900 Subject: [PATCH 126/197] Build: Bump com.google.cloud:libraries-bom from 26.79.0 to 26.80.0 (#16152) Bumps [com.google.cloud:libraries-bom](https://github.com/googleapis/java-cloud-bom) from 26.79.0 to 26.80.0. 
- [Release notes](https://github.com/googleapis/java-cloud-bom/releases) - [Commits](https://github.com/googleapis/java-cloud-bom/compare/v26.79.0...v26.80.0) --- updated-dependencies: - dependency-name: com.google.cloud:libraries-bom dependency-version: 26.80.0 dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- gcp-bundle/runtime-deps.txt | 106 +++++++++--------- gradle/libs.versions.toml | 2 +- .../kafka-connect-runtime/runtime-deps.txt | 90 +++++++-------- 3 files changed, 99 insertions(+), 99 deletions(-) diff --git a/gcp-bundle/runtime-deps.txt b/gcp-bundle/runtime-deps.txt index 2c3331f5261b..9e471a7841d4 100644 --- a/gcp-bundle/runtime-deps.txt +++ b/gcp-bundle/runtime-deps.txt @@ -1,50 +1,50 @@ -com.fasterxml.jackson.core:jackson-annotations:2.18.2 -com.fasterxml.jackson.core:jackson-core:2.18.2 -com.fasterxml.jackson.core:jackson-databind:2.18.2 -com.fasterxml.jackson.dataformat:jackson-dataformat-xml:2.18.2 -com.fasterxml.jackson.datatype:jackson-datatype-jsr310:2.18.2 +com.fasterxml.jackson.core:jackson-annotations:2.18.3 +com.fasterxml.jackson.core:jackson-core:2.18.3 +com.fasterxml.jackson.core:jackson-databind:2.18.3 +com.fasterxml.jackson.dataformat:jackson-dataformat-xml:2.18.3 +com.fasterxml.jackson.datatype:jackson-datatype-jsr310:2.18.3 com.fasterxml.woodstox:woodstox-core:7.0.0 com.google.android:annotations:4.1.1.4 com.google.api-client:google-api-client:2.7.2 -com.google.api.grpc:gapic-google-cloud-storage-v2:2.64.1 -com.google.api.grpc:grpc-google-cloud-bigquerystorage-v1:3.24.0 -com.google.api.grpc:grpc-google-cloud-bigquerystorage-v1beta1:0.196.0 -com.google.api.grpc:grpc-google-cloud-bigquerystorage-v1beta2:0.196.0 -com.google.api.grpc:grpc-google-cloud-storage-v2:2.64.1 -com.google.api.grpc:proto-google-cloud-bigquerystorage-v1:3.24.0 
-com.google.api.grpc:proto-google-cloud-bigquerystorage-v1alpha:3.24.0 -com.google.api.grpc:proto-google-cloud-bigquerystorage-v1beta1:0.196.0 -com.google.api.grpc:proto-google-cloud-bigquerystorage-v1beta2:0.196.0 -com.google.api.grpc:proto-google-cloud-bigquerystorage-v1beta:3.24.0 -com.google.api.grpc:proto-google-cloud-kms-v1:0.182.0 -com.google.api.grpc:proto-google-cloud-monitoring-v3:3.89.0 -com.google.api.grpc:proto-google-cloud-storage-v2:2.64.1 -com.google.api.grpc:proto-google-common-protos:2.67.0 -com.google.api.grpc:proto-google-iam-v1:1.62.0 -com.google.api:api-common:2.59.0 -com.google.api:gax-grpc:2.76.0 -com.google.api:gax-httpjson:2.76.0 -com.google.api:gax:2.76.0 +com.google.api.grpc:gapic-google-cloud-storage-v2:2.67.0 +com.google.api.grpc:grpc-google-cloud-bigquerystorage-v1:3.27.0 +com.google.api.grpc:grpc-google-cloud-bigquerystorage-v1beta1:0.199.0 +com.google.api.grpc:grpc-google-cloud-bigquerystorage-v1beta2:0.199.0 +com.google.api.grpc:grpc-google-cloud-storage-v2:2.67.0 +com.google.api.grpc:proto-google-cloud-bigquerystorage-v1:3.27.0 +com.google.api.grpc:proto-google-cloud-bigquerystorage-v1alpha:3.27.0 +com.google.api.grpc:proto-google-cloud-bigquerystorage-v1beta1:0.199.0 +com.google.api.grpc:proto-google-cloud-bigquerystorage-v1beta2:0.199.0 +com.google.api.grpc:proto-google-cloud-bigquerystorage-v1beta:3.27.0 +com.google.api.grpc:proto-google-cloud-kms-v1:0.185.0 +com.google.api.grpc:proto-google-cloud-monitoring-v3:3.92.0 +com.google.api.grpc:proto-google-cloud-storage-v2:2.67.0 +com.google.api.grpc:proto-google-common-protos:2.70.0 +com.google.api.grpc:proto-google-iam-v1:1.65.0 +com.google.api:api-common:2.62.0 +com.google.api:gax-grpc:2.79.0 +com.google.api:gax-httpjson:2.79.0 +com.google.api:gax:2.79.0 com.google.apis:google-api-services-bigquery:v2-rev20251012-2.0.0 com.google.apis:google-api-services-storage:v1-rev20260204-2.0.0 -com.google.auth:google-auth-library-credentials:1.43.0 
-com.google.auth:google-auth-library-oauth2-http:1.43.0 +com.google.auth:google-auth-library-credentials:1.46.0 +com.google.auth:google-auth-library-oauth2-http:1.46.0 com.google.auto.value:auto-value-annotations:1.11.1 com.google.cloud.gcs.analytics:client:1.2.3 com.google.cloud.gcs.analytics:gcs-analytics-core:1.2.3 com.google.cloud.opentelemetry:detector-resources-support:0.33.0 com.google.cloud.opentelemetry:exporter-metrics:0.33.0 com.google.cloud.opentelemetry:shared-resourcemapping:0.33.0 -com.google.cloud:google-cloud-bigquery:2.62.0 -com.google.cloud:google-cloud-bigquerystorage:3.24.0 -com.google.cloud:google-cloud-core-grpc:2.66.0 -com.google.cloud:google-cloud-core-http:2.66.0 -com.google.cloud:google-cloud-core:2.66.0 -com.google.cloud:google-cloud-kms:2.91.0 -com.google.cloud:google-cloud-monitoring:3.89.0 -com.google.cloud:google-cloud-storage:2.64.1 +com.google.cloud:google-cloud-bigquery:2.65.0 +com.google.cloud:google-cloud-bigquerystorage:3.27.0 +com.google.cloud:google-cloud-core-grpc:2.69.0 +com.google.cloud:google-cloud-core-http:2.69.0 +com.google.cloud:google-cloud-core:2.69.0 +com.google.cloud:google-cloud-kms:2.94.0 +com.google.cloud:google-cloud-monitoring:3.92.0 +com.google.cloud:google-cloud-storage:2.67.0 com.google.code.gson:gson:2.12.1 -com.google.errorprone:error_prone_annotations:2.42.0 +com.google.errorprone:error_prone_annotations:2.45.0 com.google.flatbuffers:flatbuffers-java:24.3.25 com.google.guava:failureaccess:1.0.3 com.google.guava:guava:33.5.0-jre @@ -60,23 +60,23 @@ com.google.protobuf:protobuf-java-util:4.33.2 com.google.protobuf:protobuf-java:4.33.2 com.google.re2j:re2j:1.8 commons-codec:commons-codec:1.18.0 -io.grpc:grpc-alts:1.76.3 -io.grpc:grpc-api:1.76.3 -io.grpc:grpc-auth:1.76.3 -io.grpc:grpc-context:1.76.3 -io.grpc:grpc-core:1.76.3 -io.grpc:grpc-googleapis:1.76.3 -io.grpc:grpc-grpclb:1.76.3 -io.grpc:grpc-inprocess:1.76.3 -io.grpc:grpc-netty-shaded:1.76.3 -io.grpc:grpc-opentelemetry:1.76.3 
-io.grpc:grpc-protobuf-lite:1.76.3 -io.grpc:grpc-protobuf:1.76.3 -io.grpc:grpc-rls:1.76.3 -io.grpc:grpc-services:1.76.3 -io.grpc:grpc-stub:1.76.3 -io.grpc:grpc-util:1.76.3 -io.grpc:grpc-xds:1.76.3 +io.grpc:grpc-alts:1.80.0 +io.grpc:grpc-api:1.80.0 +io.grpc:grpc-auth:1.80.0 +io.grpc:grpc-context:1.80.0 +io.grpc:grpc-core:1.80.0 +io.grpc:grpc-googleapis:1.80.0 +io.grpc:grpc-grpclb:1.80.0 +io.grpc:grpc-inprocess:1.80.0 +io.grpc:grpc-netty-shaded:1.80.0 +io.grpc:grpc-opentelemetry:1.80.0 +io.grpc:grpc-protobuf-lite:1.80.0 +io.grpc:grpc-protobuf:1.80.0 +io.grpc:grpc-rls:1.80.0 +io.grpc:grpc-services:1.80.0 +io.grpc:grpc-stub:1.80.0 +io.grpc:grpc-util:1.80.0 +io.grpc:grpc-xds:1.80.0 io.netty:netty-buffer:4.1.110.Final io.netty:netty-common:4.1.110.Final io.opencensus:opencensus-api:0.31.1 @@ -103,7 +103,7 @@ org.apache.httpcomponents:httpclient:4.5.14 org.apache.httpcomponents:httpcore:4.4.16 org.checkerframework:checker-compat-qual:2.5.6 org.checkerframework:checker-qual:3.49.0 -org.codehaus.mojo:animal-sniffer-annotations:1.24 +org.codehaus.mojo:animal-sniffer-annotations:1.26 org.codehaus.woodstox:stax2-api:4.2.2 org.conscrypt:conscrypt-openjdk-uber:2.5.2 org.json:json:20250517 diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index 7b579affa39b..8b24d4727436 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -51,7 +51,7 @@ findbugs-jsr305 = "3.0.2" flink120 = { strictly = "1.20.1"} flink20 = { strictly = "2.0.0"} flink21 = { strictly = "2.1.0"} -google-libraries-bom = "26.79.0" +google-libraries-bom = "26.80.0" gcs-analytics-core = "1.2.3" guava = "33.6.0-jre" hadoop3 = "3.4.3" diff --git a/kafka-connect/kafka-connect-runtime/runtime-deps.txt b/kafka-connect/kafka-connect-runtime/runtime-deps.txt index 3da8c1fe8e3e..eb3e45769808 100644 --- a/kafka-connect/kafka-connect-runtime/runtime-deps.txt +++ b/kafka-connect/kafka-connect-runtime/runtime-deps.txt @@ -18,42 +18,42 @@ com.github.luben:zstd-jni:1.5.7-3 
com.github.pjfanning:jersey-json:1.22.0 com.google.android:annotations:4.1.1.4 com.google.api-client:google-api-client:2.7.2 -com.google.api.grpc:gapic-google-cloud-storage-v2:2.64.1 -com.google.api.grpc:grpc-google-cloud-bigquerystorage-v1:3.24.0 -com.google.api.grpc:grpc-google-cloud-bigquerystorage-v1beta1:0.196.0 -com.google.api.grpc:grpc-google-cloud-bigquerystorage-v1beta2:0.196.0 -com.google.api.grpc:grpc-google-cloud-storage-v2:2.64.1 -com.google.api.grpc:proto-google-cloud-bigquerystorage-v1:3.24.0 -com.google.api.grpc:proto-google-cloud-bigquerystorage-v1alpha:3.24.0 -com.google.api.grpc:proto-google-cloud-bigquerystorage-v1beta1:0.196.0 -com.google.api.grpc:proto-google-cloud-bigquerystorage-v1beta2:0.196.0 -com.google.api.grpc:proto-google-cloud-bigquerystorage-v1beta:3.24.0 -com.google.api.grpc:proto-google-cloud-monitoring-v3:3.89.0 -com.google.api.grpc:proto-google-cloud-storage-v2:2.64.1 -com.google.api.grpc:proto-google-common-protos:2.67.0 -com.google.api.grpc:proto-google-iam-v1:1.62.0 -com.google.api:api-common:2.59.0 -com.google.api:gax-grpc:2.76.0 -com.google.api:gax-httpjson:2.76.0 -com.google.api:gax:2.76.0 +com.google.api.grpc:gapic-google-cloud-storage-v2:2.67.0 +com.google.api.grpc:grpc-google-cloud-bigquerystorage-v1:3.27.0 +com.google.api.grpc:grpc-google-cloud-bigquerystorage-v1beta1:0.199.0 +com.google.api.grpc:grpc-google-cloud-bigquerystorage-v1beta2:0.199.0 +com.google.api.grpc:grpc-google-cloud-storage-v2:2.67.0 +com.google.api.grpc:proto-google-cloud-bigquerystorage-v1:3.27.0 +com.google.api.grpc:proto-google-cloud-bigquerystorage-v1alpha:3.27.0 +com.google.api.grpc:proto-google-cloud-bigquerystorage-v1beta1:0.199.0 +com.google.api.grpc:proto-google-cloud-bigquerystorage-v1beta2:0.199.0 +com.google.api.grpc:proto-google-cloud-bigquerystorage-v1beta:3.27.0 +com.google.api.grpc:proto-google-cloud-monitoring-v3:3.92.0 +com.google.api.grpc:proto-google-cloud-storage-v2:2.67.0 +com.google.api.grpc:proto-google-common-protos:2.70.0 
+com.google.api.grpc:proto-google-iam-v1:1.65.0 +com.google.api:api-common:2.62.0 +com.google.api:gax-grpc:2.79.0 +com.google.api:gax-httpjson:2.79.0 +com.google.api:gax:2.79.0 com.google.apis:google-api-services-bigquery:v2-rev20251012-2.0.0 com.google.apis:google-api-services-storage:v1-rev20260204-2.0.0 -com.google.auth:google-auth-library-credentials:1.43.0 -com.google.auth:google-auth-library-oauth2-http:1.43.0 +com.google.auth:google-auth-library-credentials:1.46.0 +com.google.auth:google-auth-library-oauth2-http:1.46.0 com.google.auto.value:auto-value-annotations:1.11.0 com.google.cloud.opentelemetry:detector-resources-support:0.33.0 com.google.cloud.opentelemetry:exporter-metrics:0.33.0 com.google.cloud.opentelemetry:shared-resourcemapping:0.33.0 -com.google.cloud:google-cloud-bigquery:2.62.0 -com.google.cloud:google-cloud-bigquerystorage:3.24.0 -com.google.cloud:google-cloud-core-grpc:2.66.0 -com.google.cloud:google-cloud-core-http:2.66.0 -com.google.cloud:google-cloud-core:2.66.0 -com.google.cloud:google-cloud-monitoring:3.89.0 -com.google.cloud:google-cloud-storage:2.64.1 +com.google.cloud:google-cloud-bigquery:2.65.0 +com.google.cloud:google-cloud-bigquerystorage:3.27.0 +com.google.cloud:google-cloud-core-grpc:2.69.0 +com.google.cloud:google-cloud-core-http:2.69.0 +com.google.cloud:google-cloud-core:2.69.0 +com.google.cloud:google-cloud-monitoring:3.92.0 +com.google.cloud:google-cloud-storage:2.67.0 com.google.code.findbugs:jsr305:3.0.2 com.google.code.gson:gson:2.12.1 -com.google.errorprone:error_prone_annotations:2.42.0 +com.google.errorprone:error_prone_annotations:2.45.0 com.google.flatbuffers:flatbuffers-java:24.3.25 com.google.guava:failureaccess:1.0.3 com.google.guava:guava:33.5.0-jre @@ -82,23 +82,23 @@ dev.failsafe:failsafe:3.3.2 dnsjava:dnsjava:3.6.1 io.airlift:aircompressor:2.0.3 io.dropwizard.metrics:metrics-core:3.2.4 -io.grpc:grpc-alts:1.76.3 -io.grpc:grpc-api:1.76.3 -io.grpc:grpc-auth:1.76.3 -io.grpc:grpc-context:1.76.3 
-io.grpc:grpc-core:1.76.3 -io.grpc:grpc-googleapis:1.76.3 -io.grpc:grpc-grpclb:1.76.3 -io.grpc:grpc-inprocess:1.76.3 +io.grpc:grpc-alts:1.80.0 +io.grpc:grpc-api:1.80.0 +io.grpc:grpc-auth:1.80.0 +io.grpc:grpc-context:1.80.0 +io.grpc:grpc-core:1.80.0 +io.grpc:grpc-googleapis:1.80.0 +io.grpc:grpc-grpclb:1.80.0 +io.grpc:grpc-inprocess:1.80.0 io.grpc:grpc-netty-shaded:1.80.0 -io.grpc:grpc-opentelemetry:1.76.3 -io.grpc:grpc-protobuf-lite:1.76.3 -io.grpc:grpc-protobuf:1.76.3 -io.grpc:grpc-rls:1.76.3 -io.grpc:grpc-services:1.76.3 -io.grpc:grpc-stub:1.76.3 -io.grpc:grpc-util:1.76.3 -io.grpc:grpc-xds:1.76.3 +io.grpc:grpc-opentelemetry:1.80.0 +io.grpc:grpc-protobuf-lite:1.80.0 +io.grpc:grpc-protobuf:1.80.0 +io.grpc:grpc-rls:1.80.0 +io.grpc:grpc-services:1.80.0 +io.grpc:grpc-stub:1.80.0 +io.grpc:grpc-util:1.80.0 +io.grpc:grpc-xds:1.80.0 io.netty:netty-buffer:4.1.132.Final io.netty:netty-codec-dns:4.1.128.Final io.netty:netty-codec-http2:4.1.132.Final @@ -178,7 +178,7 @@ org.bouncycastle:bcprov-jdk18on:1.82 org.checkerframework:checker-compat-qual:2.5.6 org.checkerframework:checker-qual:3.49.0 org.codehaus.jettison:jettison:1.5.5 -org.codehaus.mojo:animal-sniffer-annotations:1.24 +org.codehaus.mojo:animal-sniffer-annotations:1.26 org.codehaus.woodstox:stax2-api:4.2.2 org.conscrypt:conscrypt-openjdk-uber:2.5.2 org.json:json:20250517 From c81534fba6a521f70dd1c054c4f688622c04c09d Mon Sep 17 00:00:00 2001 From: Maximilian Michels Date: Wed, 29 Apr 2026 13:52:42 +0200 Subject: [PATCH 127/197] Flink: Backport: Bundle flink-metrics-dropwizard in runtime jar (#16141) * Flink: Backport: Bundle flink-metrics-dropwizard in runtime jar (#16126) --- flink/v1.20/build.gradle | 4 ++-- flink/v1.20/flink-runtime/LICENSE | 10 +++++++++- flink/v2.0/build.gradle | 4 ++-- flink/v2.0/flink-runtime/LICENSE | 10 +++++++++- 4 files changed, 22 insertions(+), 6 deletions(-) diff --git a/flink/v1.20/build.gradle b/flink/v1.20/build.gradle index 772133c8e1d8..2bbad1891c81 100644 --- 
a/flink/v1.20/build.gradle +++ b/flink/v1.20/build.gradle @@ -33,7 +33,7 @@ project(":iceberg-flink:iceberg-flink-${flinkMajorVersion}") { implementation project(':iceberg-hive-metastore') compileOnly libs.flink120.avro - // for dropwizard histogram metrics implementation + // dropwizard histogram metrics (optional in Flink) compileOnly libs.flink120.metrics.dropwizard compileOnly libs.flink120.streaming.java compileOnly "${libs.flink120.streaming.java.get().module}:${libs.flink120.streaming.java.get().getVersion()}:tests" @@ -169,7 +169,7 @@ project(":iceberg-flink:iceberg-flink-runtime-${flinkMajorVersion}") { exclude group: 'com.google.code.findbugs', module: 'jsr305' } - // for dropwizard histogram metrics implementation + // To support dropwizard histogram metrics (not shipped by Flink by default) implementation libs.flink120.metrics.dropwizard // for integration testing with the flink-runtime-jar diff --git a/flink/v1.20/flink-runtime/LICENSE b/flink/v1.20/flink-runtime/LICENSE index 36a03cb4fcf9..e8c4c4a0bdf7 100644 --- a/flink/v1.20/flink-runtime/LICENSE +++ b/flink/v1.20/flink-runtime/LICENSE @@ -556,7 +556,7 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Codahale Metrics. +This product bundles Dropwizard Metrics. Copyright: (c) 2010-2013 Coda Hale, Yammer.com, 2014-2021 Dropwizard Team Project URL: https://github.com/dropwizard/metrics @@ -564,6 +564,14 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- +This product bundles Apache Flink's optional support for Dropwizard Metrics. 
+ +Copyright: 2014-2026 The Apache Software Foundation +Project URL: https://flink.apache.org/ +License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + This product bundles RoaringBitmap. Copyright: (c) 2013-... the RoaringBitmap authors diff --git a/flink/v2.0/build.gradle b/flink/v2.0/build.gradle index b276cb90dd24..626cc01b28e3 100644 --- a/flink/v2.0/build.gradle +++ b/flink/v2.0/build.gradle @@ -33,7 +33,7 @@ project(":iceberg-flink:iceberg-flink-${flinkMajorVersion}") { implementation project(':iceberg-hive-metastore') compileOnly libs.flink20.avro - // for dropwizard histogram metrics implementation + // dropwizard histogram metrics (optional in Flink) compileOnly libs.flink20.metrics.dropwizard compileOnly libs.flink20.streaming.java compileOnly "${libs.flink20.streaming.java.get().module}:${libs.flink20.streaming.java.get().getVersion()}:tests" @@ -169,7 +169,7 @@ project(":iceberg-flink:iceberg-flink-runtime-${flinkMajorVersion}") { exclude group: 'com.google.code.findbugs', module: 'jsr305' } - // for dropwizard histogram metrics implementation + // To support dropwizard histogram metrics (not shipped by Flink by default) implementation libs.flink20.metrics.dropwizard // for integration testing with the flink-runtime-jar diff --git a/flink/v2.0/flink-runtime/LICENSE b/flink/v2.0/flink-runtime/LICENSE index 36a03cb4fcf9..e8c4c4a0bdf7 100644 --- a/flink/v2.0/flink-runtime/LICENSE +++ b/flink/v2.0/flink-runtime/LICENSE @@ -556,7 +556,7 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Codahale Metrics. +This product bundles Dropwizard Metrics. 
Copyright: (c) 2010-2013 Coda Hale, Yammer.com, 2014-2021 Dropwizard Team Project URL: https://github.com/dropwizard/metrics @@ -564,6 +564,14 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- +This product bundles Apache Flink's optional support for Dropwizard Metrics. + +Copyright: 2014-2026 The Apache Software Foundation +Project URL: https://flink.apache.org/ +License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + This product bundles RoaringBitmap. Copyright: (c) 2013-... the RoaringBitmap authors From f0ba022fc34ef60bbb9199ef27d29c5f34543bf2 Mon Sep 17 00:00:00 2001 From: Ruijing Li Date: Wed, 29 Apr 2026 07:43:50 -0700 Subject: [PATCH 128/197] Spark 3.5: Backport Async Micro Batch Planner to 3.5 (#15992) --- .../apache/iceberg/spark/SparkReadConf.java | 33 ++ .../iceberg/spark/SparkReadOptions.java | 15 + .../iceberg/spark/SparkSQLProperties.java | 5 + .../source/AsyncSparkMicroBatchPlanner.java | 543 ++++++++++++++++++ .../source/BaseSparkMicroBatchPlanner.java | 151 +++++ .../iceberg/spark/source/MicroBatchUtils.java | 69 +++ .../spark/source/SparkMicroBatchPlanner.java | 47 ++ .../spark/source/SparkMicroBatchStream.java | 353 +----------- .../source/SyncSparkMicroBatchPlanner.java | 249 ++++++++ .../TestAsyncSparkMicroBatchPlanner.java | 61 ++ .../source/TestMicroBatchPlanningUtils.java | 100 ++++ .../source/TestStructuredStreamingRead3.java | 283 ++++++++- 12 files changed, 1581 insertions(+), 328 deletions(-) create mode 100644 spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/AsyncSparkMicroBatchPlanner.java create mode 100644 spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/BaseSparkMicroBatchPlanner.java create mode 100644 
spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/MicroBatchUtils.java create mode 100644 spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkMicroBatchPlanner.java create mode 100644 spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SyncSparkMicroBatchPlanner.java create mode 100644 spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestAsyncSparkMicroBatchPlanner.java create mode 100644 spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestMicroBatchPlanningUtils.java diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/SparkReadConf.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/SparkReadConf.java index b38c041507bb..61b1db160457 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/SparkReadConf.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/SparkReadConf.java @@ -261,6 +261,39 @@ public int maxRecordsPerMicroBatch() { .parse(); } + public boolean asyncMicroBatchPlanningEnabled() { + return confParser + .booleanConf() + .option(SparkReadOptions.ASYNC_MICRO_BATCH_PLANNING_ENABLED) + .sessionConf(SparkSQLProperties.ASYNC_MICRO_BATCH_PLANNING_ENABLED) + .defaultValue(SparkSQLProperties.ASYNC_MICRO_BATCH_PLANNING_ENABLED_DEFAULT) + .parse(); + } + + public long streamingSnapshotPollingIntervalMs() { + return confParser + .longConf() + .option(SparkReadOptions.STREAMING_SNAPSHOT_POLLING_INTERVAL_MS) + .defaultValue(SparkReadOptions.STREAMING_SNAPSHOT_POLLING_INTERVAL_MS_DEFAULT) + .parse(); + } + + public long asyncQueuePreloadFileLimit() { + return confParser + .longConf() + .option(SparkReadOptions.ASYNC_QUEUE_PRELOAD_FILE_LIMIT) + .defaultValue(SparkReadOptions.ASYNC_QUEUE_PRELOAD_FILE_LIMIT_DEFAULT) + .parse(); + } + + public long asyncQueuePreloadRowLimit() { + return confParser + .longConf() + .option(SparkReadOptions.ASYNC_QUEUE_PRELOAD_ROW_LIMIT) + .defaultValue(SparkReadOptions.ASYNC_QUEUE_PRELOAD_ROW_LIMIT_DEFAULT) + 
.parse(); + } + public boolean preserveDataGrouping() { return confParser .booleanConf() diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/SparkReadOptions.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/SparkReadOptions.java index 17f2bfee69b8..5262310e2c5e 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/SparkReadOptions.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/SparkReadOptions.java @@ -87,6 +87,21 @@ private SparkReadOptions() {} public static final String STREAMING_MAX_ROWS_PER_MICRO_BATCH = "streaming-max-rows-per-micro-batch"; + // Enable async micro batch planning + public static final String ASYNC_MICRO_BATCH_PLANNING_ENABLED = + "async-micro-batch-planning-enabled"; + + // Polling interval for async planner to refresh table metadata (ms) + public static final String STREAMING_SNAPSHOT_POLLING_INTERVAL_MS = + "streaming-snapshot-polling-interval-ms"; + public static final long STREAMING_SNAPSHOT_POLLING_INTERVAL_MS_DEFAULT = 30000L; + + // Initial queue preload limits for async micro batch planner + public static final String ASYNC_QUEUE_PRELOAD_FILE_LIMIT = "async-queue-preload-file-limit"; + public static final long ASYNC_QUEUE_PRELOAD_FILE_LIMIT_DEFAULT = 100L; + public static final String ASYNC_QUEUE_PRELOAD_ROW_LIMIT = "async-queue-preload-row-limit"; + public static final long ASYNC_QUEUE_PRELOAD_ROW_LIMIT_DEFAULT = 100000L; + // Table path public static final String PATH = "path"; diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/SparkSQLProperties.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/SparkSQLProperties.java index e3ee288affbe..74adb0bc95da 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/SparkSQLProperties.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/SparkSQLProperties.java @@ -103,4 +103,9 @@ private SparkSQLProperties() {} // Controls whether to report available column statistics to 
Spark for query optimization. public static final String REPORT_COLUMN_STATS = "spark.sql.iceberg.report-column-stats"; public static final boolean REPORT_COLUMN_STATS_DEFAULT = true; + + // Controls whether to enable async micro batch planning for session + public static final String ASYNC_MICRO_BATCH_PLANNING_ENABLED = + "spark.sql.iceberg.async-micro-batch-planning-enabled"; + public static final boolean ASYNC_MICRO_BATCH_PLANNING_ENABLED_DEFAULT = false; } diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/AsyncSparkMicroBatchPlanner.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/AsyncSparkMicroBatchPlanner.java new file mode 100644 index 000000000000..3e442f9917d4 --- /dev/null +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/AsyncSparkMicroBatchPlanner.java @@ -0,0 +1,543 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.spark.source; + +import com.github.benmanes.caffeine.cache.Cache; +import com.github.benmanes.caffeine.cache.Caffeine; +import java.util.LinkedList; +import java.util.List; +import java.util.concurrent.Executors; +import java.util.concurrent.LinkedBlockingDeque; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicLong; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.MicroBatches; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.Table; +import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.spark.SparkReadConf; +import org.apache.iceberg.util.Pair; +import org.apache.spark.sql.connector.read.streaming.ReadAllAvailable; +import org.apache.spark.sql.connector.read.streaming.ReadLimit; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +class AsyncSparkMicroBatchPlanner extends BaseSparkMicroBatchPlanner implements AutoCloseable { + private static final Logger LOG = LoggerFactory.getLogger(AsyncSparkMicroBatchPlanner.class); + private static final int PLAN_FILES_CACHE_MAX_SIZE = 10; + private static final long QUEUE_POLL_TIMEOUT_MS = 100L; // 100 ms + + private final long minQueuedFiles; + private final long minQueuedRows; + + // Cache for planFiles results to handle duplicate calls + private final Cache, List> planFilesCache; + + // Queue to buffer pre-fetched file scan tasks + private final LinkedBlockingDeque> queue; + + // Background executor for async operations + private final ScheduledExecutorService executor; + + // Error tracking + private volatile Throwable refreshFailedThrowable; + private volatile Throwable fillQueueFailedThrowable; + + // Tracking queue state + private final AtomicLong 
queuedFileCount = new AtomicLong(0); + private final AtomicLong queuedRowCount = new AtomicLong(0); + private Snapshot lastQueuedSnapshot; + private boolean stopped; + + // Cap for Trigger.AvailableNow - don't process beyond this offset + private final StreamingOffset lastOffsetForTriggerAvailableNow; + + /** + * This class manages a queue of FileScanTask + StreamingOffset. On creation, it starts up an + * asynchronous polling process which populates the queue when a new snapshot arrives or the + * minimum amount of queued data is too low. + * + *

    Note: this will capture the state of the table when snapshots are added to the queue. If a + * snapshot is expired after being added to the queue, the job will still process it. + */ + AsyncSparkMicroBatchPlanner( + Table table, + SparkReadConf readConf, + StreamingOffset initialOffset, + StreamingOffset maybeEndOffset, + StreamingOffset lastOffsetForTriggerAvailableNow) { + super(table, readConf); + this.minQueuedFiles = readConf().maxFilesPerMicroBatch(); + this.minQueuedRows = readConf().maxRecordsPerMicroBatch(); + this.lastOffsetForTriggerAvailableNow = lastOffsetForTriggerAvailableNow; + this.planFilesCache = Caffeine.newBuilder().maximumSize(PLAN_FILES_CACHE_MAX_SIZE).build(); + this.queue = new LinkedBlockingDeque<>(); + + table().refresh(); + + // Synchronously add data to the queue to meet our initial constraints. + // For Trigger.AvailableNow, constructor-time preload is normally initialized from + // latestOffset(...) with no explicit end offset, so bounded preload must stop at + // Trigger.AvailableNow snapshot. 
+ fillQueue(initialOffset, maybeEndOffset); + + this.executor = + Executors.newSingleThreadScheduledExecutor( + r -> { + Thread thread = new Thread(r, "iceberg-async-planner-" + table().name()); + thread.setDaemon(true); + return thread; + }); + // Schedule table refresh at configured interval + long pollingIntervalMs = readConf().streamingSnapshotPollingIntervalMs(); + this.executor.scheduleWithFixedDelay( + this::refreshAndTrapException, pollingIntervalMs, pollingIntervalMs, TimeUnit.MILLISECONDS); + // Schedule queue fill to run frequently (use polling interval for tests, cap at 100ms for + // production) + long queueFillIntervalMs = Math.min(QUEUE_POLL_TIMEOUT_MS, pollingIntervalMs); + executor.scheduleWithFixedDelay( + () -> fillQueueAndTrapException(lastQueuedSnapshot), + 0, + queueFillIntervalMs, + TimeUnit.MILLISECONDS); + + LOG.info( + "Started AsyncSparkMicroBatchPlanner for {} from initialOffset: {}", + table().name(), + initialOffset); + } + + @Override + public synchronized void stop() { + Preconditions.checkArgument( + !stopped, "AsyncSparkMicroBatchPlanner for {} was already stopped", table().name()); + stopped = true; + LOG.info("Stopping AsyncSparkMicroBatchPlanner for table: {}", table().name()); + executor.shutdownNow(); + boolean terminated = false; + try { + terminated = + executor.awaitTermination( + readConf().streamingSnapshotPollingIntervalMs() * 2, TimeUnit.MILLISECONDS); + } catch (InterruptedException ignored) { + // Restore interrupt status + Thread.currentThread().interrupt(); + } + LOG.info("AsyncSparkMicroBatchPlanner for table: {}, stopped: {}", table().name(), terminated); + } + + @Override + public void close() { + stop(); + } + + /** + * Spark can call this multiple times; it should produce the same answer every time. 
+ * + * @param startOffset the starting offset of this microbatch, position is inclusive + * @param endOffset the end offset of this microbatch, position is exclusive + * @return the list of files to scan between these offsets + */ + @Override + public synchronized List planFiles( + StreamingOffset startOffset, StreamingOffset endOffset) { + return planFilesCache.get( + Pair.of(startOffset, endOffset), + key -> { + LOG.info( + "running planFiles for {}, startOffset: {}, endOffset: {}", + table().name(), + startOffset, + endOffset); + List result = new LinkedList<>(); + Pair elem; + StreamingOffset currentOffset; + boolean shouldTerminate = false; + long filesInPlan = 0; + long rowsInPlan = 0; + + do { + try { + elem = queue.pollFirst(QUEUE_POLL_TIMEOUT_MS, TimeUnit.MILLISECONDS); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + throw new RuntimeException("Interrupted while polling queue", e); + } + + if (elem != null) { + currentOffset = elem.first(); + LOG.debug("planFiles consumed: {}", currentOffset); + FileScanTask currentTask = elem.second(); + filesInPlan += 1; + long elemRows = currentTask.file().recordCount(); + rowsInPlan += elemRows; + queuedFileCount.decrementAndGet(); + queuedRowCount.addAndGet(-elemRows); + result.add(currentTask); + + // try to peek at the next entry of the queue and see if we should stop + Pair nextElem = queue.peekFirst(); + boolean endOffsetPeek = false; + if (nextElem != null) { + endOffsetPeek = endOffset.equals(nextElem.first()); + } + // end offset may be synthetic and not exist in the queue + boolean endOffsetSynthetic = + currentOffset.snapshotId() == endOffset.snapshotId() + && (currentOffset.position() + 1) == endOffset.position(); + shouldTerminate = endOffsetPeek || endOffsetSynthetic; + } else { + LOG.trace("planFiles hasn't reached {}, waiting", endOffset); + } + } while (!shouldTerminate + && refreshFailedThrowable == null + && fillQueueFailedThrowable == null); + + if 
(refreshFailedThrowable != null) { + throw new RuntimeException("Table refresh failed", refreshFailedThrowable); + } + + if (fillQueueFailedThrowable != null) { + throw new RuntimeException("Queue filling failed", fillQueueFailedThrowable); + } + + LOG.info( + "completed planFiles for {}, startOffset: {}, endOffset: {}, files: {}, rows: {}", + table().name(), + startOffset, + endOffset, + filesInPlan, + rowsInPlan); + return result; + }); + } + + /** + * This needs to be non destructive on the queue as spark could call this multiple times. Each + * time, depending on the table state it could return something different + * + * @param startOffset the starting offset of the next microbatch + * @param limit a limit for how many files/bytes/rows the next microbatch should include + * @return The end offset to use for the next microbatch, null signals that no data is available + */ + @Override + public synchronized StreamingOffset latestOffset(StreamingOffset startOffset, ReadLimit limit) { + LOG.info( + "running latestOffset for {}, startOffset: {}, limit: {}", + table().name(), + startOffset, + limit); + + if (table().currentSnapshot() == null) { + LOG.info("latestOffset returning START_OFFSET, currentSnapshot() is null"); + return StreamingOffset.START_OFFSET; + } + + if (table().currentSnapshot().timestampMillis() < readConf().streamFromTimestamp()) { + LOG.info("latestOffset returning START_OFFSET, currentSnapshot() < fromTimestamp"); + return StreamingOffset.START_OFFSET; + } + + // if any exceptions were encountered in the background process, raise them here + if (refreshFailedThrowable != null) { + throw new RuntimeException(refreshFailedThrowable); + } + if (fillQueueFailedThrowable != null) { + throw new RuntimeException(fillQueueFailedThrowable); + } + + // if we want to read all available we don't need to scan files, just snapshots + if (limit instanceof ReadAllAvailable) { + // If Trigger.AvailableNow cap is set, return it directly + if 
(this.lastOffsetForTriggerAvailableNow != null) { + return this.lastOffsetForTriggerAvailableNow; + } + Snapshot lastValidSnapshot = table().snapshot(startOffset.snapshotId()); + Snapshot nextValidSnapshot; + do { + nextValidSnapshot = nextValidSnapshot(lastValidSnapshot); + if (nextValidSnapshot != null) { + lastValidSnapshot = nextValidSnapshot; + } + } while (nextValidSnapshot != null); + return new StreamingOffset( + lastValidSnapshot.snapshotId(), + MicroBatchUtils.addedFilesCount(table(), lastValidSnapshot), + false); + } + + return computeLimitedOffset(limit); + } + + private StreamingOffset computeLimitedOffset(ReadLimit limit) { + UnpackedLimits unpackedLimits = new UnpackedLimits(limit); + long rowsSeen = 0; + long filesSeen = 0; + LOG.debug( + "latestOffset queue status, queuedFiles: {}, queuedRows: {}", + queuedFileCount.get(), + queuedRowCount.get()); + + List> queueSnapshot = Lists.newArrayList(queue); + Pair queueTail = + queueSnapshot.isEmpty() ? null : queueSnapshot.get(queueSnapshot.size() - 1); + + for (int i = 0; i < queueSnapshot.size(); i++) { + Pair elem = queueSnapshot.get(i); + long fileRows = elem.second().file().recordCount(); + + // Hard limit on files - stop BEFORE exceeding + if (filesSeen + 1 > unpackedLimits.getMaxFiles()) { + if (filesSeen == 0) { + return null; + } + LOG.debug( + "latestOffset hit file limit at {}, rows: {}, files: {}", + elem.first(), + rowsSeen, + filesSeen); + return elem.first(); + } + + // Soft limit on rows - include file FIRST, then check + rowsSeen += fileRows; + filesSeen += 1; + + // Check if we've hit the row limit after including this file + if (rowsSeen >= unpackedLimits.getMaxRows()) { + if (filesSeen == 1 && rowsSeen > unpackedLimits.getMaxRows()) { + LOG.warn( + "File {} at offset {} contains {} records, exceeding maxRecordsPerMicroBatch limit of {}. " + + "This file will be processed entirely to guarantee forward progress. 
" + + "Consider increasing the limit or writing smaller files to avoid unexpected memory usage.", + elem.second().file().location(), + elem.first(), + fileRows, + unpackedLimits.getMaxRows()); + } + // Return the offset of the NEXT element (or synthesize tail+1) + if (i + 1 < queueSnapshot.size()) { + LOG.debug( + "latestOffset hit row limit at {}, rows: {}, files: {}", + queueSnapshot.get(i + 1).first(), + rowsSeen, + filesSeen); + return queueSnapshot.get(i + 1).first(); + } else { + // This is the last element - return tail+1 + StreamingOffset current = elem.first(); + StreamingOffset result = + new StreamingOffset( + current.snapshotId(), current.position() + 1, current.shouldScanAllFiles()); + LOG.debug( + "latestOffset hit row limit at tail {}, rows: {}, files: {}", + result, + rowsSeen, + filesSeen); + return result; + } + } + } + + // if we got here there aren't enough files to exceed our limits + if (queueTail != null) { + StreamingOffset tailOffset = queueTail.first(); + // we have to increment the position by 1 since we want to include the tail in the read and + // position is non-inclusive + StreamingOffset latestOffset = + new StreamingOffset( + tailOffset.snapshotId(), tailOffset.position() + 1, tailOffset.shouldScanAllFiles()); + LOG.debug("latestOffset returning all queued data {}", latestOffset); + return latestOffset; + } + + // if we got here the queue is empty + LOG.debug("latestOffset no data, returning null"); + return null; + } + + // Background task wrapper that traps exceptions + private void refreshAndTrapException() { + try { + table().refresh(); + } catch (Throwable t) { + LOG.error("Failed to refresh table {}", table().name(), t); + refreshFailedThrowable = t; + } + } + + // Background task wrapper that traps exceptions + private void fillQueueAndTrapException(Snapshot snapshot) { + try { + fillQueue(snapshot); + } catch (Throwable t) { + LOG.error("Failed to fill queue for table {}", table().name(), t); + fillQueueFailedThrowable = t; 
+ } + } + + /** Generate a MicroBatch based on input parameters and add to the queue */ + private void addMicroBatchToQueue( + Snapshot snapshot, long startFileIndex, long endFileIndex, boolean shouldScanAllFile) { + LOG.info("Adding MicroBatch for snapshot: {} to the queue", snapshot.snapshotId()); + MicroBatches.MicroBatch microBatch = + MicroBatches.from(snapshot, table().io()) + .caseSensitive(readConf().caseSensitive()) + .specsById(table().specs()) + .generate(startFileIndex, endFileIndex, Long.MAX_VALUE, shouldScanAllFile); + + long position = startFileIndex; + for (FileScanTask task : microBatch.tasks()) { + Pair elem = + Pair.of(new StreamingOffset(microBatch.snapshotId(), position, shouldScanAllFile), task); + queuedFileCount.incrementAndGet(); + queuedRowCount.addAndGet(task.file().recordCount()); + queue.addLast(elem); + position += 1; + } + if (LOG.isDebugEnabled()) { + StringBuilder sb = new StringBuilder("\n"); + for (Pair elem : queue) { + sb.append(elem.first()).append("\n"); + } + LOG.debug(sb.toString()); + } + lastQueuedSnapshot = snapshot; + } + + private void fillQueue(StreamingOffset fromOffset, StreamingOffset toOffset) { + LOG.debug("filling queue from {}, to: {}", fromOffset, toOffset); + Snapshot currentSnapshot = table().snapshot(fromOffset.snapshotId()); + // this could be a partial snapshot so add it outside the loop + if (currentSnapshot != null) { + addMicroBatchToQueue( + currentSnapshot, + fromOffset.position(), + MicroBatchUtils.addedFilesCount(table(), currentSnapshot), + fromOffset.shouldScanAllFiles()); + } + if (toOffset != null) { + if (currentSnapshot != null) { + while (currentSnapshot.snapshotId() != toOffset.snapshotId()) { + currentSnapshot = nextValidSnapshot(currentSnapshot); + if (currentSnapshot != null) { + addMicroBatchToQueue( + currentSnapshot, + 0, + MicroBatchUtils.addedFilesCount(table(), currentSnapshot), + false); + } else { + break; + } + } + } + // toOffset snapshot already added in loop when 
currentSnapshot == toOffset + } else { + fillQueueInitialBuffer(currentSnapshot); + } + } + + private void fillQueueInitialBuffer(Snapshot startSnapshot) { + // toOffset is null - fill initial buffer to prevent queue starvation before background + // thread starts. Use configured limits to avoid loading all snapshots + // (which could cause OOM on tables with thousands of snapshots). + long targetRows = readConf().asyncQueuePreloadRowLimit(); + long targetFiles = readConf().asyncQueuePreloadFileLimit(); + + Snapshot preloadEndSnapshot = initialPreloadEndSnapshot(); + if (preloadEndSnapshot == null) { + return; // Empty table + } + + // START_OFFSET case: initialize using nextValidSnapshot which respects timestamp filtering + Snapshot current = startSnapshot; + if (current == null) { + current = nextValidSnapshot(null); + if (current != null) { + addMicroBatchToQueue(current, 0, MicroBatchUtils.addedFilesCount(table(), current), false); + } + } + + // Continue loading more snapshots within safety limits + if (current != null) { + while ((queuedRowCount.get() < targetRows || queuedFileCount.get() < targetFiles) + && current.snapshotId() != preloadEndSnapshot.snapshotId()) { + current = nextValidSnapshot(current); + if (current != null) { + addMicroBatchToQueue( + current, 0, MicroBatchUtils.addedFilesCount(table(), current), false); + } else { + break; + } + } + } + } + + private Snapshot initialPreloadEndSnapshot() { + if (lastOffsetForTriggerAvailableNow != null) { + return table().snapshot(lastOffsetForTriggerAvailableNow.snapshotId()); + } + + return table().currentSnapshot(); + } + + @VisibleForTesting + static boolean reachedAvailableNowCap( + Snapshot readFrom, StreamingOffset lastOffsetForTriggerAvailableNow) { + return lastOffsetForTriggerAvailableNow != null + && readFrom != null + && readFrom.snapshotId() == lastOffsetForTriggerAvailableNow.snapshotId(); + } + + /** Try to populate the queue with data from unread snapshots */ + private void 
fillQueue(Snapshot readFrom) { + // Don't add beyond cap for Trigger.AvailableNow + if (reachedAvailableNowCap(readFrom, lastOffsetForTriggerAvailableNow)) { + LOG.debug( + "Reached cap snapshot {}, not adding more", + this.lastOffsetForTriggerAvailableNow.snapshotId()); + return; + } + + if ((queuedRowCount.get() > minQueuedRows) || (queuedFileCount.get() > minQueuedFiles)) { + // we have enough data buffered, check back shortly + LOG.debug( + "Buffer is full, {} > {} or {} > {}", + queuedRowCount.get(), + minQueuedRows, + queuedFileCount.get(), + minQueuedFiles); + } else { + // add an entire snapshot to the queue + Snapshot nextValidSnapshot = nextValidSnapshot(readFrom); + if (nextValidSnapshot != null) { + addMicroBatchToQueue( + nextValidSnapshot, + 0, + MicroBatchUtils.addedFilesCount(table(), nextValidSnapshot), + false); + } else { + LOG.debug("No snapshots ready to be read"); + } + } + } +} diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/BaseSparkMicroBatchPlanner.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/BaseSparkMicroBatchPlanner.java new file mode 100644 index 000000000000..9298c2bbdfcc --- /dev/null +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/BaseSparkMicroBatchPlanner.java @@ -0,0 +1,151 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.spark.source; + +import java.util.Locale; +import org.apache.iceberg.DataOperations; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.Table; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.spark.SparkReadConf; +import org.apache.iceberg.spark.SparkReadOptions; +import org.apache.iceberg.util.SnapshotUtil; +import org.apache.spark.sql.connector.read.streaming.CompositeReadLimit; +import org.apache.spark.sql.connector.read.streaming.ReadLimit; +import org.apache.spark.sql.connector.read.streaming.ReadMaxFiles; +import org.apache.spark.sql.connector.read.streaming.ReadMaxRows; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +abstract class BaseSparkMicroBatchPlanner implements SparkMicroBatchPlanner { + private static final Logger LOG = LoggerFactory.getLogger(BaseSparkMicroBatchPlanner.class); + private final Table table; + private final SparkReadConf readConf; + + BaseSparkMicroBatchPlanner(Table table, SparkReadConf readConf) { + this.table = table; + this.readConf = readConf; + } + + protected Table table() { + return table; + } + + protected SparkReadConf readConf() { + return readConf; + } + + protected boolean shouldProcess(Snapshot snapshot) { + String op = snapshot.operation(); + switch (op) { + case DataOperations.APPEND: + return true; + case DataOperations.REPLACE: + return false; + case DataOperations.DELETE: + Preconditions.checkState( + readConf.streamingSkipDeleteSnapshots(), + "Cannot process delete snapshot: %s, to 
ignore deletes, set %s=true", + snapshot.snapshotId(), + SparkReadOptions.STREAMING_SKIP_DELETE_SNAPSHOTS); + return false; + case DataOperations.OVERWRITE: + Preconditions.checkState( + readConf.streamingSkipOverwriteSnapshots(), + "Cannot process overwrite snapshot: %s, to ignore overwrites, set %s=true", + snapshot.snapshotId(), + SparkReadOptions.STREAMING_SKIP_OVERWRITE_SNAPSHOTS); + return false; + default: + throw new IllegalStateException( + String.format( + "Cannot process unknown snapshot operation: %s (snapshot id %s)", + op.toLowerCase(Locale.ROOT), snapshot.snapshotId())); + } + } + + /** + * Get the next snapshot skipping over rewrite and delete snapshots. Async must handle nulls. + * + * @param curSnapshot the current snapshot + * @return the next valid snapshot (not a rewrite or delete snapshot), returns null if all + * remaining snapshots should be skipped. + */ + protected Snapshot nextValidSnapshot(Snapshot curSnapshot) { + Snapshot nextSnapshot; + // if there were no valid snapshots, check for an initialOffset again + if (curSnapshot == null) { + StreamingOffset startingOffset = + MicroBatchUtils.determineStartingOffset(table, readConf.streamFromTimestamp()); + LOG.debug("determineStartingOffset picked startingOffset: {}", startingOffset); + if (StreamingOffset.START_OFFSET.equals(startingOffset)) { + return null; + } + nextSnapshot = table.snapshot(startingOffset.snapshotId()); + } else { + if (curSnapshot.snapshotId() == table.currentSnapshot().snapshotId()) { + return null; + } + nextSnapshot = SnapshotUtil.snapshotAfter(table, curSnapshot.snapshotId()); + } + // skip over rewrite and delete snapshots + while (!shouldProcess(nextSnapshot)) { + LOG.debug("Skipping snapshot: {}", nextSnapshot); + // if the currentSnapShot was also the mostRecentSnapshot then break + // avoids snapshotAfter throwing exception since there are no more snapshots to process + if (nextSnapshot.snapshotId() == table.currentSnapshot().snapshotId()) { + return null; + } 
+ nextSnapshot = SnapshotUtil.snapshotAfter(table, nextSnapshot.snapshotId()); + } + return nextSnapshot; + } + + static class UnpackedLimits { + private long maxRows = Integer.MAX_VALUE; + private long maxFiles = Integer.MAX_VALUE; + + UnpackedLimits(ReadLimit limit) { + if (limit instanceof CompositeReadLimit) { + ReadLimit[] compositeLimits = ((CompositeReadLimit) limit).getReadLimits(); + for (ReadLimit individualLimit : compositeLimits) { + if (individualLimit instanceof ReadMaxRows) { + ReadMaxRows readMaxRows = (ReadMaxRows) individualLimit; + this.maxRows = Math.min(this.maxRows, readMaxRows.maxRows()); + } else if (individualLimit instanceof ReadMaxFiles) { + ReadMaxFiles readMaxFiles = (ReadMaxFiles) individualLimit; + this.maxFiles = Math.min(this.maxFiles, readMaxFiles.maxFiles()); + } + } + } else if (limit instanceof ReadMaxRows) { + this.maxRows = ((ReadMaxRows) limit).maxRows(); + } else if (limit instanceof ReadMaxFiles) { + this.maxFiles = ((ReadMaxFiles) limit).maxFiles(); + } + } + + public long getMaxRows() { + return maxRows; + } + + public long getMaxFiles() { + return maxFiles; + } + } +} diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/MicroBatchUtils.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/MicroBatchUtils.java new file mode 100644 index 000000000000..7c73e3f416e3 --- /dev/null +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/MicroBatchUtils.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.spark.source; + +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.SnapshotChanges; +import org.apache.iceberg.SnapshotSummary; +import org.apache.iceberg.Table; +import org.apache.iceberg.relocated.com.google.common.collect.Iterables; +import org.apache.iceberg.util.PropertyUtil; +import org.apache.iceberg.util.SnapshotUtil; + +class MicroBatchUtils { + + private MicroBatchUtils() {} + + static StreamingOffset determineStartingOffset(Table table, long fromTimestamp) { + if (table.currentSnapshot() == null) { + return StreamingOffset.START_OFFSET; + } + + if (fromTimestamp == Long.MIN_VALUE) { + // start from the oldest snapshot, since default value is MIN_VALUE + // avoids looping to find first snapshot + return new StreamingOffset(SnapshotUtil.oldestAncestor(table).snapshotId(), 0, false); + } + + if (table.currentSnapshot().timestampMillis() < fromTimestamp) { + return StreamingOffset.START_OFFSET; + } + + try { + Snapshot snapshot = SnapshotUtil.oldestAncestorAfter(table, fromTimestamp); + if (snapshot != null) { + return new StreamingOffset(snapshot.snapshotId(), 0, false); + } else { + return StreamingOffset.START_OFFSET; + } + } catch (IllegalStateException e) { + // could not determine the first snapshot after the timestamp. 
use the oldest ancestor instead + return new StreamingOffset(SnapshotUtil.oldestAncestor(table).snapshotId(), 0, false); + } + } + + static long addedFilesCount(Table table, Snapshot snapshot) { + long addedFilesCount = + PropertyUtil.propertyAsLong(snapshot.summary(), SnapshotSummary.ADDED_FILES_PROP, -1); + return addedFilesCount == -1 + ? Iterables.size( + SnapshotChanges.builderFor(table).snapshot(snapshot).build().addedDataFiles()) + : addedFilesCount; + } +} diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkMicroBatchPlanner.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkMicroBatchPlanner.java new file mode 100644 index 000000000000..1986ddac5d8e --- /dev/null +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkMicroBatchPlanner.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.spark.source; + +import java.util.List; +import org.apache.iceberg.FileScanTask; +import org.apache.spark.sql.connector.read.streaming.ReadLimit; + +interface SparkMicroBatchPlanner { + /** + * Return the {@link FileScanTask}s for data added between the start and end offsets. 
+ * + * @param startOffset the offset to start planning from + * @param endOffset the offset to plan up to + * @return file scan tasks for data in the offset range + */ + List planFiles(StreamingOffset startOffset, StreamingOffset endOffset); + + /** + * Return the latest offset the stream can advance to from {@code startOffset}, respecting the + * given {@link ReadLimit}. + * + * @param startOffset the current offset of the stream + * @param limit the read limit bounding how far ahead to advance + * @return the latest available offset, or {@code null} if no new data is available + */ + StreamingOffset latestOffset(StreamingOffset startOffset, ReadLimit limit); + + /** Stop the planner and release any resources. */ + void stop(); +} diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkMicroBatchStream.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkMicroBatchStream.java index a82583747a64..a1ff767fe2a0 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkMicroBatchStream.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkMicroBatchStream.java @@ -26,48 +26,32 @@ import java.io.UncheckedIOException; import java.nio.charset.StandardCharsets; import java.util.List; -import java.util.Locale; import java.util.function.Supplier; import org.apache.hadoop.conf.Configuration; import org.apache.iceberg.CombinedScanTask; -import org.apache.iceberg.DataOperations; import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.ManifestFile; -import org.apache.iceberg.MicroBatches; -import org.apache.iceberg.MicroBatches.MicroBatch; import org.apache.iceberg.Schema; import org.apache.iceberg.SchemaParser; import org.apache.iceberg.Snapshot; -import org.apache.iceberg.SnapshotChanges; -import org.apache.iceberg.SnapshotSummary; import org.apache.iceberg.Table; import org.apache.iceberg.hadoop.HadoopFileIO; import org.apache.iceberg.io.CloseableIterable; -import 
org.apache.iceberg.io.CloseableIterator; import org.apache.iceberg.io.FileIO; import org.apache.iceberg.io.InputFile; import org.apache.iceberg.io.OutputFile; import org.apache.iceberg.relocated.com.google.common.base.Joiner; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Iterables; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.spark.SparkReadConf; -import org.apache.iceberg.spark.SparkReadOptions; import org.apache.iceberg.types.Types; -import org.apache.iceberg.util.Pair; -import org.apache.iceberg.util.PropertyUtil; -import org.apache.iceberg.util.SnapshotUtil; import org.apache.iceberg.util.TableScanUtil; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.broadcast.Broadcast; import org.apache.spark.sql.connector.read.InputPartition; import org.apache.spark.sql.connector.read.PartitionReaderFactory; -import org.apache.spark.sql.connector.read.streaming.CompositeReadLimit; import org.apache.spark.sql.connector.read.streaming.MicroBatchStream; import org.apache.spark.sql.connector.read.streaming.Offset; import org.apache.spark.sql.connector.read.streaming.ReadLimit; -import org.apache.spark.sql.connector.read.streaming.ReadMaxFiles; -import org.apache.spark.sql.connector.read.streaming.ReadMaxRows; import org.apache.spark.sql.connector.read.streaming.SupportsTriggerAvailableNow; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -79,6 +63,7 @@ public class SparkMicroBatchStream implements MicroBatchStream, SupportsTriggerA private final Table table; private final Supplier fileIO; + private final SparkReadConf readConf; private final String branch; private final boolean caseSensitive; private final String expectedSchema; @@ -89,12 +74,11 @@ public class SparkMicroBatchStream implements MicroBatchStream, SupportsTriggerA private final long splitOpenFileCost; private final boolean 
localityPreferred; private final StreamingOffset initialOffset; - private final boolean skipDelete; - private final boolean skipOverwrite; private final long fromTimestamp; private final int maxFilesPerMicroBatch; private final int maxRecordsPerMicroBatch; private final boolean cacheDeleteFilesOnExecutors; + private SparkMicroBatchPlanner planner; private StreamingOffset lastOffsetForTriggerAvailableNow; SparkMicroBatchStream( @@ -106,6 +90,7 @@ public class SparkMicroBatchStream implements MicroBatchStream, SupportsTriggerA String checkpointLocation) { this.table = table; this.fileIO = fileIO; + this.readConf = readConf; this.branch = readConf.branch(); this.caseSensitive = readConf.caseSensitive(); this.expectedSchema = SchemaParser.toJson(expectedSchema); @@ -124,9 +109,6 @@ public class SparkMicroBatchStream implements MicroBatchStream, SupportsTriggerA new InitialOffsetStore( table, checkpointLocation, fromTimestamp, sparkContext.hadoopConfiguration()); this.initialOffset = initialOffsetStore.initialOffset(); - - this.skipDelete = readConf.streamingSkipDeleteSnapshots(); - this.skipOverwrite = readConf.streamingSkipOverwriteSnapshots(); } @Override @@ -141,8 +123,8 @@ public Offset latestOffset() { } Snapshot latestSnapshot = table.currentSnapshot(); - - return new StreamingOffset(latestSnapshot.snapshotId(), addedFilesCount(latestSnapshot), false); + return new StreamingOffset( + latestSnapshot.snapshotId(), MicroBatchUtils.addedFilesCount(table, latestSnapshot), false); } @Override @@ -161,7 +143,11 @@ public InputPartition[] planInputPartitions(Offset start, Offset end) { StreamingOffset endOffset = (StreamingOffset) end; StreamingOffset startOffset = (StreamingOffset) start; - List fileScanTasks = planFiles(startOffset, endOffset); + if (planner == null) { + initializePlanner(startOffset, endOffset); + } + + List fileScanTasks = planner.planFiles(startOffset, endOffset); CloseableIterable splitTasks = 
TableScanUtil.splitFiles(CloseableIterable.withNoopClose(fileScanTasks), splitSize); @@ -171,7 +157,6 @@ public InputPartition[] planInputPartitions(Offset start, Offset end) { String[][] locations = computePreferredLocations(combinedScanTasks); InputPartition[] partitions = new InputPartition[combinedScanTasks.size()]; - for (int index = 0; index < combinedScanTasks.size(); index++) { partitions[index] = new SparkInputPartition( @@ -214,318 +199,35 @@ public Offset deserializeOffset(String json) { public void commit(Offset end) {} @Override - public void stop() {} - - private List planFiles(StreamingOffset startOffset, StreamingOffset endOffset) { - List fileScanTasks = Lists.newArrayList(); - StreamingOffset batchStartOffset = - StreamingOffset.START_OFFSET.equals(startOffset) - ? determineStartingOffset(table, fromTimestamp) - : startOffset; - - StreamingOffset currentOffset = null; - - // [(startOffset : startFileIndex), (endOffset : endFileIndex) ) - do { - long endFileIndex; - if (currentOffset == null) { - currentOffset = batchStartOffset; - } else { - Snapshot snapshotAfter = SnapshotUtil.snapshotAfter(table, currentOffset.snapshotId()); - // it may happen that we need to read this snapshot partially in case it's equal to - // endOffset. 
- if (currentOffset.snapshotId() != endOffset.snapshotId()) { - currentOffset = new StreamingOffset(snapshotAfter.snapshotId(), 0L, false); - } else { - currentOffset = endOffset; - } - } - - Snapshot snapshot = table.snapshot(currentOffset.snapshotId()); - - validateCurrentSnapshotExists(snapshot, currentOffset); - - if (!shouldProcess(snapshot)) { - LOG.debug("Skipping snapshot: {} of table {}", currentOffset.snapshotId(), table.name()); - continue; - } - - Snapshot currentSnapshot = table.snapshot(currentOffset.snapshotId()); - if (currentOffset.snapshotId() == endOffset.snapshotId()) { - endFileIndex = endOffset.position(); - } else { - endFileIndex = addedFilesCount(currentSnapshot); - } - - MicroBatch latestMicroBatch = - MicroBatches.from(currentSnapshot, table.io()) - .caseSensitive(caseSensitive) - .specsById(table.specs()) - .generate( - currentOffset.position(), - endFileIndex, - Long.MAX_VALUE, - currentOffset.shouldScanAllFiles()); - - fileScanTasks.addAll(latestMicroBatch.tasks()); - } while (currentOffset.snapshotId() != endOffset.snapshotId()); - - return fileScanTasks; - } - - private boolean shouldProcess(Snapshot snapshot) { - String op = snapshot.operation(); - switch (op) { - case DataOperations.APPEND: - return true; - case DataOperations.REPLACE: - return false; - case DataOperations.DELETE: - Preconditions.checkState( - skipDelete, - "Cannot process delete snapshot: %s, to ignore deletes, set %s=true", - snapshot.snapshotId(), - SparkReadOptions.STREAMING_SKIP_DELETE_SNAPSHOTS); - return false; - case DataOperations.OVERWRITE: - Preconditions.checkState( - skipOverwrite, - "Cannot process overwrite snapshot: %s, to ignore overwrites, set %s=true", - snapshot.snapshotId(), - SparkReadOptions.STREAMING_SKIP_OVERWRITE_SNAPSHOTS); - return false; - default: - throw new IllegalStateException( - String.format( - "Cannot process unknown snapshot operation: %s (snapshot id %s)", - op.toLowerCase(Locale.ROOT), snapshot.snapshotId())); - } - } - - 
private static StreamingOffset determineStartingOffset(Table table, Long fromTimestamp) { - if (table.currentSnapshot() == null) { - return StreamingOffset.START_OFFSET; - } - - if (fromTimestamp == null) { - // match existing behavior and start from the oldest snapshot - return new StreamingOffset(SnapshotUtil.oldestAncestor(table).snapshotId(), 0, false); - } - - if (table.currentSnapshot().timestampMillis() < fromTimestamp) { - return StreamingOffset.START_OFFSET; - } - - try { - Snapshot snapshot = SnapshotUtil.oldestAncestorAfter(table, fromTimestamp); - if (snapshot != null) { - return new StreamingOffset(snapshot.snapshotId(), 0, false); - } else { - return StreamingOffset.START_OFFSET; - } - } catch (IllegalStateException e) { - // could not determine the first snapshot after the timestamp. use the oldest ancestor instead - return new StreamingOffset(SnapshotUtil.oldestAncestor(table).snapshotId(), 0, false); + public void stop() { + if (planner != null) { + planner.stop(); } } - private static int getMaxFiles(ReadLimit readLimit) { - if (readLimit instanceof ReadMaxFiles) { - return ((ReadMaxFiles) readLimit).maxFiles(); - } - - if (readLimit instanceof CompositeReadLimit) { - // We do not expect a CompositeReadLimit to contain a nested CompositeReadLimit. - // In fact, it should only be a composite of two or more of ReadMinRows, ReadMaxRows and - // ReadMaxFiles, with no more than one of each. 
- ReadLimit[] limits = ((CompositeReadLimit) readLimit).getReadLimits(); - for (ReadLimit limit : limits) { - if (limit instanceof ReadMaxFiles) { - return ((ReadMaxFiles) limit).maxFiles(); - } - } - } - - // there is no ReadMaxFiles, so return the default - return Integer.MAX_VALUE; - } - - private static int getMaxRows(ReadLimit readLimit) { - if (readLimit instanceof ReadMaxRows) { - long maxRows = ((ReadMaxRows) readLimit).maxRows(); - return Math.toIntExact(maxRows); - } - - if (readLimit instanceof CompositeReadLimit) { - ReadLimit[] limits = ((CompositeReadLimit) readLimit).getReadLimits(); - for (ReadLimit limit : limits) { - if (limit instanceof ReadMaxRows) { - long maxRows = ((ReadMaxRows) limit).maxRows(); - return Math.toIntExact(maxRows); - } - } + private void initializePlanner(StreamingOffset startOffset, StreamingOffset endOffset) { + if (readConf.asyncMicroBatchPlanningEnabled()) { + this.planner = + new AsyncSparkMicroBatchPlanner( + table, readConf, startOffset, endOffset, lastOffsetForTriggerAvailableNow); + } else { + this.planner = + new SyncSparkMicroBatchPlanner(table, readConf, lastOffsetForTriggerAvailableNow); } - - // There is no ReadMaxRows, so return the default - return Integer.MAX_VALUE; } @Override - @SuppressWarnings("checkstyle:CyclomaticComplexity") public Offset latestOffset(Offset startOffset, ReadLimit limit) { - // calculate end offset get snapshotId from the startOffset Preconditions.checkArgument( startOffset instanceof StreamingOffset, "Invalid start offset: %s is not a StreamingOffset", startOffset); - table.refresh(); - if (table.currentSnapshot() == null) { - return StreamingOffset.START_OFFSET; - } - - if (table.currentSnapshot().timestampMillis() < fromTimestamp) { - return StreamingOffset.START_OFFSET; + if (planner == null) { + initializePlanner((StreamingOffset) startOffset, null); } - // end offset can expand to multiple snapshots - StreamingOffset startingOffset = (StreamingOffset) startOffset; - - if 
(startOffset.equals(StreamingOffset.START_OFFSET)) { - startingOffset = determineStartingOffset(table, fromTimestamp); - } - - Snapshot curSnapshot = table.snapshot(startingOffset.snapshotId()); - validateCurrentSnapshotExists(curSnapshot, startingOffset); - - // Use the pre-computed snapshotId when Trigger.AvailableNow is enabled. - long latestSnapshotId = - lastOffsetForTriggerAvailableNow != null - ? lastOffsetForTriggerAvailableNow.snapshotId() - : table.currentSnapshot().snapshotId(); - - int startPosOfSnapOffset = (int) startingOffset.position(); - - boolean scanAllFiles = startingOffset.shouldScanAllFiles(); - - boolean shouldContinueReading = true; - int curFilesAdded = 0; - long curRecordCount = 0; - int curPos = 0; - - // Note : we produce nextOffset with pos as non-inclusive - while (shouldContinueReading) { - // generate manifest index for the curSnapshot - List> indexedManifests = - MicroBatches.skippedManifestIndexesFromSnapshot( - table.io(), curSnapshot, startPosOfSnapOffset, scanAllFiles); - // this is under assumption we will be able to add at-least 1 file in the new offset - for (int idx = 0; idx < indexedManifests.size() && shouldContinueReading; idx++) { - // be rest assured curPos >= startFileIndex - curPos = indexedManifests.get(idx).second(); - try (CloseableIterable taskIterable = - MicroBatches.openManifestFile( - table.io(), - table.specs(), - caseSensitive, - curSnapshot, - indexedManifests.get(idx).first(), - scanAllFiles); - CloseableIterator taskIter = taskIterable.iterator()) { - while (taskIter.hasNext()) { - FileScanTask task = taskIter.next(); - if (curPos >= startPosOfSnapOffset) { - if ((curFilesAdded + 1) > getMaxFiles(limit)) { - // On including the file it might happen that we might exceed, the configured - // soft limit on the number of records, since this is a soft limit its acceptable. 
- shouldContinueReading = false; - break; - } - - curFilesAdded += 1; - curRecordCount += task.file().recordCount(); - - if (curRecordCount >= getMaxRows(limit)) { - // we included the file, so increment the number of files - // read in the current snapshot. - ++curPos; - shouldContinueReading = false; - break; - } - } - ++curPos; - } - } catch (IOException ioe) { - LOG.warn("Failed to close task iterable", ioe); - } - } - // if the currentSnapShot was also the latestSnapshot then break - if (curSnapshot.snapshotId() == latestSnapshotId) { - break; - } - - // if everything was OK and we consumed complete snapshot then move to next snapshot - if (shouldContinueReading) { - Snapshot nextValid = nextValidSnapshot(curSnapshot); - if (nextValid == null) { - // nextValid implies all the remaining snapshots should be skipped. - break; - } - // we found the next available snapshot, continue from there. - curSnapshot = nextValid; - startPosOfSnapOffset = -1; - // if anyhow we are moving to next snapshot we should only scan addedFiles - scanAllFiles = false; - } - } - - StreamingOffset latestStreamingOffset = - new StreamingOffset(curSnapshot.snapshotId(), curPos, scanAllFiles); - - // if no new data arrived, then return null. - return latestStreamingOffset.equals(startingOffset) ? null : latestStreamingOffset; - } - - /** - * Get the next snapshot skiping over rewrite and delete snapshots. - * - * @param curSnapshot the current snapshot - * @return the next valid snapshot (not a rewrite or delete snapshot), returns null if all - * remaining snapshots should be skipped. 
- */ - private Snapshot nextValidSnapshot(Snapshot curSnapshot) { - Snapshot nextSnapshot = SnapshotUtil.snapshotAfter(table, curSnapshot.snapshotId()); - // skip over rewrite and delete snapshots - while (!shouldProcess(nextSnapshot)) { - LOG.debug("Skipping snapshot: {} of table {}", nextSnapshot.snapshotId(), table.name()); - // if the currentSnapShot was also the mostRecentSnapshot then break - if (nextSnapshot.snapshotId() == table.currentSnapshot().snapshotId()) { - return null; - } - nextSnapshot = SnapshotUtil.snapshotAfter(table, nextSnapshot.snapshotId()); - } - return nextSnapshot; - } - - private long addedFilesCount(Snapshot snapshot) { - long addedFilesCount = - PropertyUtil.propertyAsLong(snapshot.summary(), SnapshotSummary.ADDED_FILES_PROP, -1); - // If snapshotSummary doesn't have SnapshotSummary.ADDED_FILES_PROP, - // iterate through addedFiles iterator to find addedFilesCount. - return addedFilesCount == -1 - ? Iterables.size( - SnapshotChanges.builderFor(table).snapshot(snapshot).build().addedDataFiles()) - : addedFilesCount; - } - - private void validateCurrentSnapshotExists(Snapshot snapshot, StreamingOffset currentOffset) { - if (snapshot == null) { - throw new IllegalStateException( - String.format( - Locale.ROOT, - "Cannot load current offset at snapshot %d, the snapshot was expired or removed", - currentOffset.snapshotId())); - } + return planner.latestOffset((StreamingOffset) startOffset, limit); } @Override @@ -553,6 +255,11 @@ public void prepareForTriggerAvailableNow() { (StreamingOffset) latestOffset(initialOffset, ReadLimit.allAvailable()); LOG.info("lastOffset for Trigger.AvailableNow is {}", lastOffsetForTriggerAvailableNow.json()); + + if (planner != null) { + planner.stop(); + planner = null; + } } private static class InitialOffsetStore { @@ -576,7 +283,7 @@ public StreamingOffset initialOffset() { } table.refresh(); - StreamingOffset offset = determineStartingOffset(table, fromTimestamp); + StreamingOffset offset = 
MicroBatchUtils.determineStartingOffset(table, fromTimestamp); OutputFile outputFile = io.newOutputFile(initialOffsetLocation); writeOffset(offset, outputFile); diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SyncSparkMicroBatchPlanner.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SyncSparkMicroBatchPlanner.java new file mode 100644 index 000000000000..f1b0029c5432 --- /dev/null +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SyncSparkMicroBatchPlanner.java @@ -0,0 +1,249 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.spark.source; + +import java.io.IOException; +import java.util.List; +import java.util.Locale; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.ManifestFile; +import org.apache.iceberg.MicroBatches; +import org.apache.iceberg.MicroBatches.MicroBatch; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.Table; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.io.CloseableIterator; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.spark.SparkReadConf; +import org.apache.iceberg.util.Pair; +import org.apache.iceberg.util.SnapshotUtil; +import org.apache.spark.sql.connector.read.streaming.ReadLimit; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +class SyncSparkMicroBatchPlanner extends BaseSparkMicroBatchPlanner { + private static final Logger LOG = LoggerFactory.getLogger(SyncSparkMicroBatchPlanner.class); + + private final boolean caseSensitive; + private final long fromTimestamp; + private final StreamingOffset lastOffsetForTriggerAvailableNow; + + SyncSparkMicroBatchPlanner( + Table table, SparkReadConf readConf, StreamingOffset lastOffsetForTriggerAvailableNow) { + super(table, readConf); + this.caseSensitive = readConf().caseSensitive(); + this.fromTimestamp = readConf().streamFromTimestamp(); + this.lastOffsetForTriggerAvailableNow = lastOffsetForTriggerAvailableNow; + } + + @Override + public List planFiles(StreamingOffset startOffset, StreamingOffset endOffset) { + List fileScanTasks = Lists.newArrayList(); + StreamingOffset batchStartOffset = + StreamingOffset.START_OFFSET.equals(startOffset) + ? 
MicroBatchUtils.determineStartingOffset(table(), fromTimestamp) + : startOffset; + + StreamingOffset currentOffset = null; + + // [(startOffset : startFileIndex), (endOffset : endFileIndex) ) + do { + long endFileIndex; + if (currentOffset == null) { + currentOffset = batchStartOffset; + } else { + Snapshot snapshotAfter = SnapshotUtil.snapshotAfter(table(), currentOffset.snapshotId()); + // it may happen that we need to read this snapshot partially in case it's equal to + // endOffset. + if (currentOffset.snapshotId() != endOffset.snapshotId()) { + currentOffset = new StreamingOffset(snapshotAfter.snapshotId(), 0L, false); + } else { + currentOffset = endOffset; + } + } + + Snapshot snapshot = table().snapshot(currentOffset.snapshotId()); + + validateCurrentSnapshotExists(snapshot, currentOffset); + + if (!shouldProcess(snapshot)) { + LOG.debug("Skipping snapshot: {} of table {}", currentOffset.snapshotId(), table().name()); + continue; + } + + Snapshot currentSnapshot = table().snapshot(currentOffset.snapshotId()); + if (currentOffset.snapshotId() == endOffset.snapshotId()) { + endFileIndex = endOffset.position(); + } else { + endFileIndex = MicroBatchUtils.addedFilesCount(table(), currentSnapshot); + } + + MicroBatch latestMicroBatch = + MicroBatches.from(currentSnapshot, table().io()) + .caseSensitive(caseSensitive) + .specsById(table().specs()) + .generate( + currentOffset.position(), + endFileIndex, + Long.MAX_VALUE, + currentOffset.shouldScanAllFiles()); + + fileScanTasks.addAll(latestMicroBatch.tasks()); + } while (currentOffset.snapshotId() != endOffset.snapshotId()); + + return fileScanTasks; + } + + @Override + @SuppressWarnings("checkstyle:CyclomaticComplexity") + public StreamingOffset latestOffset(StreamingOffset startOffset, ReadLimit limit) { + table().refresh(); + if (table().currentSnapshot() == null) { + return StreamingOffset.START_OFFSET; + } + + if (table().currentSnapshot().timestampMillis() < fromTimestamp) { + return 
StreamingOffset.START_OFFSET; + } + + // end offset can expand to multiple snapshots + StreamingOffset startingOffset = startOffset; + + if (startOffset.equals(StreamingOffset.START_OFFSET)) { + startingOffset = MicroBatchUtils.determineStartingOffset(table(), fromTimestamp); + } + + Snapshot curSnapshot = table().snapshot(startingOffset.snapshotId()); + validateCurrentSnapshotExists(curSnapshot, startingOffset); + + // Use the pre-computed snapshotId when Trigger.AvailableNow is enabled. + long latestSnapshotId = + lastOffsetForTriggerAvailableNow != null + ? lastOffsetForTriggerAvailableNow.snapshotId() + : table().currentSnapshot().snapshotId(); + + int startPosOfSnapOffset = (int) startingOffset.position(); + + boolean scanAllFiles = startingOffset.shouldScanAllFiles(); + + boolean shouldContinueReading = true; + int curFilesAdded = 0; + long curRecordCount = 0; + int curPos = 0; + + // Extract limits once to avoid repeated calls in tight loop + UnpackedLimits unpackedLimits = new UnpackedLimits(limit); + long maxFiles = unpackedLimits.getMaxFiles(); + long maxRows = unpackedLimits.getMaxRows(); + + // Note : we produce nextOffset with pos as non-inclusive + while (shouldContinueReading) { + // generate manifest index for the curSnapshot + List> indexedManifests = + MicroBatches.skippedManifestIndexesFromSnapshot( + table().io(), curSnapshot, startPosOfSnapOffset, scanAllFiles); + // this is under assumption we will be able to add at-least 1 file in the new offset + for (int idx = 0; idx < indexedManifests.size() && shouldContinueReading; idx++) { + // be rest assured curPos >= startFileIndex + curPos = indexedManifests.get(idx).second(); + try (CloseableIterable taskIterable = + MicroBatches.openManifestFile( + table().io(), + table().specs(), + caseSensitive, + curSnapshot, + indexedManifests.get(idx).first(), + scanAllFiles); + CloseableIterator taskIter = taskIterable.iterator()) { + while (taskIter.hasNext()) { + FileScanTask task = taskIter.next(); + if 
(curPos >= startPosOfSnapOffset) { + if ((curFilesAdded + 1) > maxFiles) { + // On including the file it might happen that we might exceed, the configured + // soft limit on the number of records, since this is a soft limit its acceptable. + shouldContinueReading = false; + break; + } + + curFilesAdded += 1; + curRecordCount += task.file().recordCount(); + + if (curRecordCount >= maxRows) { + // we included the file, so increment the number of files + // read in the current snapshot. + if (curFilesAdded == 1 && curRecordCount > maxRows) { + LOG.warn( + "File {} contains {} records, exceeding maxRecordsPerMicroBatch limit of {}. " + + "This file will be processed entirely to guarantee forward progress. " + + "Consider increasing the limit or writing smaller files to avoid unexpected memory usage.", + task.file().location(), + task.file().recordCount(), + maxRows); + } + ++curPos; + shouldContinueReading = false; + break; + } + } + ++curPos; + } + } catch (IOException ioe) { + LOG.warn("Failed to close task iterable", ioe); + } + } + // if the currentSnapShot was also the latestSnapshot then break + if (curSnapshot.snapshotId() == latestSnapshotId) { + break; + } + + // if everything was OK and we consumed complete snapshot then move to next snapshot + if (shouldContinueReading) { + Snapshot nextValid = nextValidSnapshot(curSnapshot); + if (nextValid == null) { + // nextValid implies all the remaining snapshots should be skipped. + break; + } + // we found the next available snapshot, continue from there. + curSnapshot = nextValid; + startPosOfSnapOffset = -1; + // if anyhow we are moving to next snapshot we should only scan addedFiles + scanAllFiles = false; + } + } + + StreamingOffset latestStreamingOffset = + new StreamingOffset(curSnapshot.snapshotId(), curPos, scanAllFiles); + + // if no new data arrived, then return null. + return latestStreamingOffset.equals(startingOffset) ? 
null : latestStreamingOffset; + } + + @Override + public void stop() {} + + private void validateCurrentSnapshotExists(Snapshot snapshot, StreamingOffset currentOffset) { + if (snapshot == null) { + throw new IllegalStateException( + String.format( + Locale.ROOT, + "Cannot load current offset at snapshot %d, the snapshot was expired or removed", + currentOffset.snapshotId())); + } + } +} diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestAsyncSparkMicroBatchPlanner.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestAsyncSparkMicroBatchPlanner.java new file mode 100644 index 000000000000..b6017e2001e7 --- /dev/null +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestAsyncSparkMicroBatchPlanner.java @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.spark.source; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +import org.apache.iceberg.Snapshot; +import org.junit.jupiter.api.Test; + +class TestAsyncSparkMicroBatchPlanner { + + @Test + void reachedAvailableNowCapReturnsTrueOnlyForExactCapSnapshot() { + Snapshot capSnapshot = mockSnapshot(10L); + Snapshot laterSnapshotWithHigherId = mockSnapshot(20L); + Snapshot laterSnapshotWithLowerId = mockSnapshot(5L); + StreamingOffset capOffset = new StreamingOffset(10L, 3L, false); + + assertThat(AsyncSparkMicroBatchPlanner.reachedAvailableNowCap(capSnapshot, capOffset)).isTrue(); + assertThat( + AsyncSparkMicroBatchPlanner.reachedAvailableNowCap( + laterSnapshotWithHigherId, capOffset)) + .isFalse(); + assertThat( + AsyncSparkMicroBatchPlanner.reachedAvailableNowCap(laterSnapshotWithLowerId, capOffset)) + .isFalse(); + } + + @Test + void reachedAvailableNowCapReturnsFalseWhenCapOrSnapshotIsMissing() { + Snapshot readFrom = mockSnapshot(10L); + StreamingOffset capOffset = new StreamingOffset(10L, 1L, false); + + assertThat(AsyncSparkMicroBatchPlanner.reachedAvailableNowCap(readFrom, null)).isFalse(); + assertThat(AsyncSparkMicroBatchPlanner.reachedAvailableNowCap(null, capOffset)).isFalse(); + } + + private Snapshot mockSnapshot(long snapshotId) { + Snapshot snapshot = mock(Snapshot.class); + when(snapshot.snapshotId()).thenReturn(snapshotId); + return snapshot; + } +} diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestMicroBatchPlanningUtils.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestMicroBatchPlanningUtils.java new file mode 100644 index 000000000000..a9ce340fd4ec --- /dev/null +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestMicroBatchPlanningUtils.java @@ -0,0 +1,100 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more 
contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.spark.source; + +import static org.assertj.core.api.Assertions.assertThat; + +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.SnapshotSummary; +import org.apache.iceberg.Table; +import org.apache.iceberg.spark.CatalogTestBase; +import org.apache.spark.sql.connector.read.streaming.ReadLimit; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; + +@ExtendWith(ParameterizedTestExtension.class) +public class TestMicroBatchPlanningUtils extends CatalogTestBase { + + private Table table; + + @BeforeEach + public void setupTable() { + sql("DROP TABLE IF EXISTS %s", tableName); + sql( + "CREATE TABLE %s " + + "(id INT, data STRING) " + + "USING iceberg " + + "PARTITIONED BY (bucket(3, id))", + tableName); + this.table = validationCatalog.loadTable(tableIdent); + } + + @AfterEach + public void dropTable() { + sql("DROP TABLE IF EXISTS %s", tableName); + } + + @TestTemplate + public void testUnpackedLimitsCompositeChoosesMinimum() { + ReadLimit[] limits = + new ReadLimit[] { + ReadLimit.maxRows(10), ReadLimit.maxRows(4), ReadLimit.maxFiles(8), ReadLimit.maxFiles(2) 
+ }; + + ReadLimit composite = ReadLimit.compositeLimit(limits); + + BaseSparkMicroBatchPlanner.UnpackedLimits unpacked = + new BaseSparkMicroBatchPlanner.UnpackedLimits(composite); + + assertThat(unpacked.getMaxRows()).isEqualTo(4); + assertThat(unpacked.getMaxFiles()).isEqualTo(2); + } + + @TestTemplate + public void testDetermineStartingOffsetWithTimestampBetweenSnapshots() { + sql("INSERT INTO %s VALUES (1, 'one')", tableName); + table.refresh(); + long snapshot1Time = table.currentSnapshot().timestampMillis(); + + sql("INSERT INTO %s VALUES (2, 'two')", tableName); + table.refresh(); + long snapshot2Id = table.currentSnapshot().snapshotId(); + + StreamingOffset offset = MicroBatchUtils.determineStartingOffset(table, snapshot1Time + 1); + + assertThat(offset.snapshotId()).isEqualTo(snapshot2Id); + assertThat(offset.position()).isEqualTo(0L); + assertThat(offset.shouldScanAllFiles()).isFalse(); + } + + @TestTemplate + public void testAddedFilesCountUsesSummaryWhenPresent() { + sql("INSERT INTO %s VALUES (1, 'one')", tableName); + table.refresh(); + + long expectedAddedFiles = + Long.parseLong(table.currentSnapshot().summary().get(SnapshotSummary.ADDED_FILES_PROP)); + + long actual = MicroBatchUtils.addedFilesCount(table, table.currentSnapshot()); + + assertThat(actual).isEqualTo(expectedAddedFiles); + } +} diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreamingRead3.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreamingRead3.java index 5f54c832aa93..d97e6ec00d7f 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreamingRead3.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreamingRead3.java @@ -31,13 +31,17 @@ import java.util.concurrent.atomic.AtomicInteger; import java.util.stream.IntStream; import org.apache.iceberg.BaseTable; +import org.apache.iceberg.CatalogProperties; import 
org.apache.iceberg.DataFile; import org.apache.iceberg.DataFiles; import org.apache.iceberg.DataOperations; import org.apache.iceberg.DeleteFile; import org.apache.iceberg.FileFormat; +import org.apache.iceberg.FileScanTask; import org.apache.iceberg.Files; +import org.apache.iceberg.Parameter; import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; import org.apache.iceberg.RewriteFiles; import org.apache.iceberg.Schema; import org.apache.iceberg.Snapshot; @@ -50,15 +54,22 @@ import org.apache.iceberg.data.GenericRecord; import org.apache.iceberg.data.Record; import org.apache.iceberg.expressions.Expressions; +import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Iterables; import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.spark.CatalogTestBase; +import org.apache.iceberg.spark.SparkCatalogConfig; +import org.apache.iceberg.spark.SparkReadConf; import org.apache.iceberg.spark.SparkReadOptions; +import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.VoidFunction2; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.Row; +import org.apache.spark.sql.connector.read.InputPartition; +import org.apache.spark.sql.connector.read.streaming.Offset; import org.apache.spark.sql.internal.SQLConf; import org.apache.spark.sql.streaming.DataStreamWriter; import org.apache.spark.sql.streaming.OutputMode; @@ -73,10 +84,73 @@ @ExtendWith(ParameterizedTestExtension.class) public final class TestStructuredStreamingRead3 extends CatalogTestBase { + @Parameters(name = "catalogName = {0}, implementation = {1}, config = {2}, async = {3}") + public static Object[][] parameters() { + return new Object[][] { + { + 
SparkCatalogConfig.HIVE.catalogName(), + SparkCatalogConfig.HIVE.implementation(), + SparkCatalogConfig.HIVE.properties(), + false + }, + { + SparkCatalogConfig.HIVE.catalogName(), + SparkCatalogConfig.HIVE.implementation(), + SparkCatalogConfig.HIVE.properties(), + true + }, + { + SparkCatalogConfig.HADOOP.catalogName(), + SparkCatalogConfig.HADOOP.implementation(), + SparkCatalogConfig.HADOOP.properties(), + false + }, + { + SparkCatalogConfig.HADOOP.catalogName(), + SparkCatalogConfig.HADOOP.implementation(), + SparkCatalogConfig.HADOOP.properties(), + true + }, + { + SparkCatalogConfig.REST.catalogName(), + SparkCatalogConfig.REST.implementation(), + ImmutableMap.builder() + .putAll(SparkCatalogConfig.REST.properties()) + .put(CatalogProperties.URI, restCatalog.properties().get(CatalogProperties.URI)) + .build(), + false + }, + { + SparkCatalogConfig.REST.catalogName(), + SparkCatalogConfig.REST.implementation(), + ImmutableMap.builder() + .putAll(SparkCatalogConfig.REST.properties()) + .put(CatalogProperties.URI, restCatalog.properties().get(CatalogProperties.URI)) + .build(), + true + }, + { + SparkCatalogConfig.SPARK_SESSION.catalogName(), + SparkCatalogConfig.SPARK_SESSION.implementation(), + SparkCatalogConfig.SPARK_SESSION.properties(), + false + }, + { + SparkCatalogConfig.SPARK_SESSION.catalogName(), + SparkCatalogConfig.SPARK_SESSION.implementation(), + SparkCatalogConfig.SPARK_SESSION.properties(), + true + } + }; + } + private Table table; private final AtomicInteger microBatches = new AtomicInteger(); + @Parameter(index = 3) + private Boolean async; + /** * test data to be used by multiple writes each write creates a snapshot and writes a list of * records @@ -250,15 +324,41 @@ public void testReadStreamWithCompositeReadLimit() throws Exception { Trigger.AvailableNow()); } + @TestTemplate + public void testReadStreamWithLowAsyncQueuePreload() throws Exception { + appendDataAsMultipleSnapshots(TEST_DATA_MULTIPLE_SNAPSHOTS); + // Set low preload 
limits to test async queue behavior - background thread should load + // remaining data + + StreamingQuery query = + startStream( + ImmutableMap.of( + SparkReadOptions.ASYNC_QUEUE_PRELOAD_ROW_LIMIT, + "5", + SparkReadOptions.ASYNC_QUEUE_PRELOAD_FILE_LIMIT, + "5")); + + List actual = rowsAvailable(query); + assertThat(actual) + .containsExactlyInAnyOrderElementsOf(Iterables.concat(TEST_DATA_MULTIPLE_SNAPSHOTS)); + } + @TestTemplate public void testAvailableNowStreamReadShouldNotHangOrReprocessData() throws Exception { File writerCheckpointFolder = temp.resolve("writer-checkpoint-folder").toFile(); File writerCheckpoint = new File(writerCheckpointFolder, "writer-checkpoint"); File output = temp.resolve("junit").toFile(); + Map options = Maps.newHashMap(); + options.put(SparkReadOptions.ASYNC_MICRO_BATCH_PLANNING_ENABLED, async.toString()); + if (async) { + options.put(SparkReadOptions.STREAMING_SNAPSHOT_POLLING_INTERVAL_MS, "1"); + } + DataStreamWriter querySource = spark .readStream() + .options(options) .format("iceberg") .load(tableName) .writeStream() @@ -313,10 +413,17 @@ public void testTriggerAvailableNowDoesNotProcessNewDataWhileRunning() throws Ex long expectedSnapshotId = table.currentSnapshot().snapshotId(); String sinkTable = "availablenow_sink"; + Map options = Maps.newHashMap(); + options.put(SparkReadOptions.STREAMING_MAX_FILES_PER_MICRO_BATCH, "1"); + options.put(SparkReadOptions.ASYNC_MICRO_BATCH_PLANNING_ENABLED, async.toString()); + if (async) { + options.put(SparkReadOptions.STREAMING_SNAPSHOT_POLLING_INTERVAL_MS, "1"); + } + StreamingQuery query = spark .readStream() - .option(SparkReadOptions.STREAMING_MAX_FILES_PER_MICRO_BATCH, "1") + .options(options) .format("iceberg") .load(tableName) .writeStream() @@ -358,6 +465,142 @@ public void testTriggerAvailableNowDoesNotProcessNewDataWhileRunning() throws Ex assertThat(actualResults).containsExactlyInAnyOrderElementsOf(Iterables.concat(expectedData)); } + @TestTemplate + public void 
testTriggerAvailableNowCapsAsyncPreloadAfterPrepare() { + List> initialData = + List.of(List.of(new SimpleRecord(1, "one")), List.of(new SimpleRecord(2, "two"))); + appendDataAsMultipleSnapshots(initialData); + + table.refresh(); + long expectedCapSnapshotId = table.currentSnapshot().snapshotId(); + + SparkMicroBatchStream stream = + new SparkMicroBatchStream( + JavaSparkContext.fromSparkContext(spark.sparkContext()), + table, + table::io, + new SparkReadConf( + spark, + table, + ImmutableMap.of( + SparkReadOptions.ASYNC_MICRO_BATCH_PLANNING_ENABLED, + async.toString(), + SparkReadOptions.STREAMING_MAX_FILES_PER_MICRO_BATCH, + "1", + SparkReadOptions.STREAMING_SNAPSHOT_POLLING_INTERVAL_MS, + "1", + SparkReadOptions.ASYNC_QUEUE_PRELOAD_FILE_LIMIT, + "10", + SparkReadOptions.ASYNC_QUEUE_PRELOAD_ROW_LIMIT, + "10")), + table.schema(), + temp.resolve("available-now-cap-checkpoint").toString()); + + try { + stream.prepareForTriggerAvailableNow(); + + appendData(List.of(new SimpleRecord(3, "three"))); + + Offset startOffset = stream.initialOffset(); + Offset firstEndOffset = stream.latestOffset(startOffset, stream.getDefaultReadLimit()); + assertThat(firstEndOffset).isNotNull(); + stream.planInputPartitions(startOffset, firstEndOffset); + + Offset secondEndOffset = stream.latestOffset(firstEndOffset, stream.getDefaultReadLimit()); + assertThat(secondEndOffset).isNotNull(); + stream.planInputPartitions(firstEndOffset, secondEndOffset); + + assertThat(stream.latestOffset(secondEndOffset, stream.getDefaultReadLimit())).isNull(); + assertThat(((StreamingOffset) secondEndOffset).snapshotId()).isEqualTo(expectedCapSnapshotId); + } finally { + stream.stop(); + } + } + + @TestTemplate + public void testLatestOffsetReturnsNullAfterFinalBatchIsConsumed() throws Exception { + appendDataAsMultipleSnapshots(TEST_DATA_MULTIPLE_SNAPSHOTS); + + table.refresh(); + int expectedBatchCount; + try (CloseableIterable tasks = table.newScan().planFiles()) { + expectedBatchCount = 
Iterables.size(tasks); + } + + SparkMicroBatchStream stream = + newMicroBatchStream( + ImmutableMap.of(SparkReadOptions.STREAMING_MAX_FILES_PER_MICRO_BATCH, "1"), + "drain-to-null-checkpoint"); + + try { + int plannedBatchCount = 0; + Offset startOffset = stream.initialOffset(); + Offset endOffset = stream.latestOffset(startOffset, stream.getDefaultReadLimit()); + while (endOffset != null) { + InputPartition[] partitions = stream.planInputPartitions(startOffset, endOffset); + assertThat(partitions).isNotEmpty(); + plannedBatchCount += 1; + startOffset = endOffset; + endOffset = stream.latestOffset(startOffset, stream.getDefaultReadLimit()); + } + + assertThat(endOffset).isNull(); + assertThat(plannedBatchCount).isEqualTo(expectedBatchCount); + } finally { + stream.stop(); + } + } + + @TestTemplate + public void testPlanInputPartitionsIsIdempotentForSameOffsets() { + appendDataAsMultipleSnapshots(TEST_DATA_MULTIPLE_SNAPSHOTS); + + SparkMicroBatchStream stream = + newMicroBatchStream( + ImmutableMap.of(SparkReadOptions.STREAMING_MAX_FILES_PER_MICRO_BATCH, "1"), + "idempotent-plan-files-checkpoint"); + + try { + Offset startOffset = stream.initialOffset(); + Offset endOffset = stream.latestOffset(startOffset, stream.getDefaultReadLimit()); + + assertThat(endOffset).isNotNull(); + + InputPartition[] firstPartitions = stream.planInputPartitions(startOffset, endOffset); + InputPartition[] secondPartitions = stream.planInputPartitions(startOffset, endOffset); + + List firstFileLocations = Lists.newArrayList(); + for (InputPartition partition : firstPartitions) { + SparkInputPartition sparkInputPartition = (SparkInputPartition) partition; + for (FileScanTask task : sparkInputPartition.taskGroup().tasks()) { + firstFileLocations.add(task.file().location()); + } + } + + List secondFileLocations = Lists.newArrayList(); + for (InputPartition partition : secondPartitions) { + SparkInputPartition sparkInputPartition = (SparkInputPartition) partition; + for (FileScanTask task : 
sparkInputPartition.taskGroup().tasks()) { + secondFileLocations.add(task.file().location()); + } + } + + assertThat(firstFileLocations).containsExactlyInAnyOrderElementsOf(secondFileLocations); + + startOffset = endOffset; + endOffset = stream.latestOffset(startOffset, stream.getDefaultReadLimit()); + while (endOffset != null) { + assertThat(stream.planInputPartitions(startOffset, endOffset)).isNotEmpty(); + startOffset = endOffset; + endOffset = stream.latestOffset(startOffset, stream.getDefaultReadLimit()); + } + + assertThat(endOffset).isNull(); + } finally { + stream.stop(); + } + } + @TestTemplate public void testReadStreamOnIcebergThenAddData() throws Exception { List> expected = TEST_DATA_MULTIPLE_SNAPSHOTS; @@ -425,6 +668,8 @@ public void testReadingStreamFromFutureTimetsamp() throws Exception { // Data appended after the timestamp should appear appendData(data); + // Allow async background thread to refresh, else test sometimes fails + Thread.sleep(50); actual = rowsAvailable(query); assertThat(actual).containsExactlyInAnyOrderElementsOf(data); } @@ -872,13 +1117,18 @@ private void appendData(List data, String format) { private static final String MEMORY_TABLE = "_stream_view_mem"; private StreamingQuery startStream(Map options) throws TimeoutException { + Map allOptions = Maps.newHashMap(options); + allOptions.put(SparkReadOptions.ASYNC_MICRO_BATCH_PLANNING_ENABLED, async.toString()); + if (async) { + allOptions.putIfAbsent(SparkReadOptions.STREAMING_SNAPSHOT_POLLING_INTERVAL_MS, "1"); + } return spark .readStream() - .options(options) + .options(allOptions) .format("iceberg") .load(tableName) .writeStream() - .options(options) + .options(allOptions) .format("memory") .queryName(MEMORY_TABLE) .outputMode(OutputMode.Append()) @@ -903,11 +1153,17 @@ private void assertMicroBatchRecordSizes( private void assertMicroBatchRecordSizes( Map options, List expectedMicroBatchRecordSize, Trigger trigger) throws TimeoutException { - Dataset ds = 
spark.readStream().options(options).format("iceberg").load(tableName); + Map allOptions = Maps.newHashMap(options); + allOptions.put(SparkReadOptions.ASYNC_MICRO_BATCH_PLANNING_ENABLED, async.toString()); + if (async) { + allOptions.putIfAbsent(SparkReadOptions.STREAMING_SNAPSHOT_POLLING_INTERVAL_MS, "1"); + } + + Dataset ds = spark.readStream().options(allOptions).format("iceberg").load(tableName); List syncList = Collections.synchronizedList(Lists.newArrayList()); ds.writeStream() - .options(options) + .options(allOptions) .trigger(trigger) .foreachBatch( (VoidFunction2, Long>) @@ -929,4 +1185,21 @@ private List rowsAvailable(StreamingQuery query) { .as(Encoders.bean(SimpleRecord.class)) .collectAsList(); } + + private SparkMicroBatchStream newMicroBatchStream( + Map options, String checkpointDirName) { + Map allOptions = Maps.newHashMap(options); + allOptions.put(SparkReadOptions.ASYNC_MICRO_BATCH_PLANNING_ENABLED, async.toString()); + if (async) { + allOptions.putIfAbsent(SparkReadOptions.STREAMING_SNAPSHOT_POLLING_INTERVAL_MS, "1"); + } + + return new SparkMicroBatchStream( + JavaSparkContext.fromSparkContext(spark.sparkContext()), + table, + table::io, + new SparkReadConf(spark, table, allOptions), + table.schema(), + temp.resolve(checkpointDirName).toString()); + } } From df00c156d41deddd0dc979f2867ad6f81cb0e246 Mon Sep 17 00:00:00 2001 From: Ruijing Li Date: Wed, 29 Apr 2026 07:44:17 -0700 Subject: [PATCH 129/197] Spark 4.0: Backport Aync Micro Batch Planner Feature (#15876) --- .../apache/iceberg/spark/SparkReadConf.java | 33 ++ .../iceberg/spark/SparkReadOptions.java | 15 + .../iceberg/spark/SparkSQLProperties.java | 5 + .../source/AsyncSparkMicroBatchPlanner.java | 543 ++++++++++++++++++ .../source/BaseSparkMicroBatchPlanner.java | 151 +++++ .../iceberg/spark/source/MicroBatchUtils.java | 69 +++ .../spark/source/SparkMicroBatchPlanner.java | 47 ++ .../spark/source/SparkMicroBatchStream.java | 353 +----------- .../source/SyncSparkMicroBatchPlanner.java | 
249 ++++++++ .../TestAsyncSparkMicroBatchPlanner.java | 61 ++ .../source/TestMicroBatchPlanningUtils.java | 100 ++++ .../source/TestStructuredStreamingRead3.java | 286 ++++++++- 12 files changed, 1582 insertions(+), 330 deletions(-) create mode 100644 spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/AsyncSparkMicroBatchPlanner.java create mode 100644 spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/BaseSparkMicroBatchPlanner.java create mode 100644 spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/MicroBatchUtils.java create mode 100644 spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkMicroBatchPlanner.java create mode 100644 spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SyncSparkMicroBatchPlanner.java create mode 100644 spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestAsyncSparkMicroBatchPlanner.java create mode 100644 spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestMicroBatchPlanningUtils.java diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkReadConf.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkReadConf.java index 9a19aa7d1e62..c5fe276aaecb 100644 --- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkReadConf.java +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkReadConf.java @@ -261,6 +261,39 @@ public int maxRecordsPerMicroBatch() { .parse(); } + public boolean asyncMicroBatchPlanningEnabled() { + return confParser + .booleanConf() + .option(SparkReadOptions.ASYNC_MICRO_BATCH_PLANNING_ENABLED) + .sessionConf(SparkSQLProperties.ASYNC_MICRO_BATCH_PLANNING_ENABLED) + .defaultValue(SparkSQLProperties.ASYNC_MICRO_BATCH_PLANNING_ENABLED_DEFAULT) + .parse(); + } + + public long streamingSnapshotPollingIntervalMs() { + return confParser + .longConf() + .option(SparkReadOptions.STREAMING_SNAPSHOT_POLLING_INTERVAL_MS) + 
.defaultValue(SparkReadOptions.STREAMING_SNAPSHOT_POLLING_INTERVAL_MS_DEFAULT) + .parse(); + } + + public long asyncQueuePreloadFileLimit() { + return confParser + .longConf() + .option(SparkReadOptions.ASYNC_QUEUE_PRELOAD_FILE_LIMIT) + .defaultValue(SparkReadOptions.ASYNC_QUEUE_PRELOAD_FILE_LIMIT_DEFAULT) + .parse(); + } + + public long asyncQueuePreloadRowLimit() { + return confParser + .longConf() + .option(SparkReadOptions.ASYNC_QUEUE_PRELOAD_ROW_LIMIT) + .defaultValue(SparkReadOptions.ASYNC_QUEUE_PRELOAD_ROW_LIMIT_DEFAULT) + .parse(); + } + public boolean preserveDataGrouping() { return confParser .booleanConf() diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkReadOptions.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkReadOptions.java index 17f2bfee69b8..5262310e2c5e 100644 --- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkReadOptions.java +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkReadOptions.java @@ -87,6 +87,21 @@ private SparkReadOptions() {} public static final String STREAMING_MAX_ROWS_PER_MICRO_BATCH = "streaming-max-rows-per-micro-batch"; + // Enable async micro batch planning + public static final String ASYNC_MICRO_BATCH_PLANNING_ENABLED = + "async-micro-batch-planning-enabled"; + + // Polling interval for async planner to refresh table metadata (ms) + public static final String STREAMING_SNAPSHOT_POLLING_INTERVAL_MS = + "streaming-snapshot-polling-interval-ms"; + public static final long STREAMING_SNAPSHOT_POLLING_INTERVAL_MS_DEFAULT = 30000L; + + // Initial queue preload limits for async micro batch planner + public static final String ASYNC_QUEUE_PRELOAD_FILE_LIMIT = "async-queue-preload-file-limit"; + public static final long ASYNC_QUEUE_PRELOAD_FILE_LIMIT_DEFAULT = 100L; + public static final String ASYNC_QUEUE_PRELOAD_ROW_LIMIT = "async-queue-preload-row-limit"; + public static final long ASYNC_QUEUE_PRELOAD_ROW_LIMIT_DEFAULT = 100000L; + // Table path public 
static final String PATH = "path"; diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkSQLProperties.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkSQLProperties.java index 735ee4efbc35..b5b860214564 100644 --- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkSQLProperties.java +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkSQLProperties.java @@ -104,6 +104,11 @@ private SparkSQLProperties() {} public static final String REPORT_COLUMN_STATS = "spark.sql.iceberg.report-column-stats"; public static final boolean REPORT_COLUMN_STATS_DEFAULT = true; + // Controls whether to enable async micro batch planning for session + public static final String ASYNC_MICRO_BATCH_PLANNING_ENABLED = + "spark.sql.iceberg.async-micro-batch-planning-enabled"; + public static final boolean ASYNC_MICRO_BATCH_PLANNING_ENABLED_DEFAULT = false; + // Prefix for custom snapshot properties public static final String SNAPSHOT_PROPERTY_PREFIX = "spark.sql.iceberg.snapshot-property."; } diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/AsyncSparkMicroBatchPlanner.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/AsyncSparkMicroBatchPlanner.java new file mode 100644 index 000000000000..3e442f9917d4 --- /dev/null +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/AsyncSparkMicroBatchPlanner.java @@ -0,0 +1,543 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.spark.source; + +import com.github.benmanes.caffeine.cache.Cache; +import com.github.benmanes.caffeine.cache.Caffeine; +import java.util.LinkedList; +import java.util.List; +import java.util.concurrent.Executors; +import java.util.concurrent.LinkedBlockingDeque; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicLong; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.MicroBatches; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.Table; +import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.spark.SparkReadConf; +import org.apache.iceberg.util.Pair; +import org.apache.spark.sql.connector.read.streaming.ReadAllAvailable; +import org.apache.spark.sql.connector.read.streaming.ReadLimit; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +class AsyncSparkMicroBatchPlanner extends BaseSparkMicroBatchPlanner implements AutoCloseable { + private static final Logger LOG = LoggerFactory.getLogger(AsyncSparkMicroBatchPlanner.class); + private static final int PLAN_FILES_CACHE_MAX_SIZE = 10; + private static final long QUEUE_POLL_TIMEOUT_MS = 100L; // 100 ms + + private final long minQueuedFiles; + private final long minQueuedRows; + + // Cache for planFiles results to handle duplicate calls + private 
final Cache, List> planFilesCache; + + // Queue to buffer pre-fetched file scan tasks + private final LinkedBlockingDeque> queue; + + // Background executor for async operations + private final ScheduledExecutorService executor; + + // Error tracking + private volatile Throwable refreshFailedThrowable; + private volatile Throwable fillQueueFailedThrowable; + + // Tracking queue state + private final AtomicLong queuedFileCount = new AtomicLong(0); + private final AtomicLong queuedRowCount = new AtomicLong(0); + private Snapshot lastQueuedSnapshot; + private boolean stopped; + + // Cap for Trigger.AvailableNow - don't process beyond this offset + private final StreamingOffset lastOffsetForTriggerAvailableNow; + + /** + * This class manages a queue of FileScanTask + StreamingOffset. On creation, it starts up an + * asynchronous polling process which populates the queue when a new snapshot arrives or the + * minimum amount of queued data is too low. + * + *

    Note: this will capture the state of the table when snapshots are added to the queue. If a + * snapshot is expired after being added to the queue, the job will still process it. + */ + AsyncSparkMicroBatchPlanner( + Table table, + SparkReadConf readConf, + StreamingOffset initialOffset, + StreamingOffset maybeEndOffset, + StreamingOffset lastOffsetForTriggerAvailableNow) { + super(table, readConf); + this.minQueuedFiles = readConf().maxFilesPerMicroBatch(); + this.minQueuedRows = readConf().maxRecordsPerMicroBatch(); + this.lastOffsetForTriggerAvailableNow = lastOffsetForTriggerAvailableNow; + this.planFilesCache = Caffeine.newBuilder().maximumSize(PLAN_FILES_CACHE_MAX_SIZE).build(); + this.queue = new LinkedBlockingDeque<>(); + + table().refresh(); + + // Synchronously add data to the queue to meet our initial constraints. + // For Trigger.AvailableNow, constructor-time preload is normally initialized from + // latestOffset(...) with no explicit end offset, so bounded preload must stop at + // Trigger.AvailableNow snapshot. 
+ fillQueue(initialOffset, maybeEndOffset); + + this.executor = + Executors.newSingleThreadScheduledExecutor( + r -> { + Thread thread = new Thread(r, "iceberg-async-planner-" + table().name()); + thread.setDaemon(true); + return thread; + }); + // Schedule table refresh at configured interval + long pollingIntervalMs = readConf().streamingSnapshotPollingIntervalMs(); + this.executor.scheduleWithFixedDelay( + this::refreshAndTrapException, pollingIntervalMs, pollingIntervalMs, TimeUnit.MILLISECONDS); + // Schedule queue fill to run frequently (use polling interval for tests, cap at 100ms for + // production) + long queueFillIntervalMs = Math.min(QUEUE_POLL_TIMEOUT_MS, pollingIntervalMs); + executor.scheduleWithFixedDelay( + () -> fillQueueAndTrapException(lastQueuedSnapshot), + 0, + queueFillIntervalMs, + TimeUnit.MILLISECONDS); + + LOG.info( + "Started AsyncSparkMicroBatchPlanner for {} from initialOffset: {}", + table().name(), + initialOffset); + } + + @Override + public synchronized void stop() { + Preconditions.checkArgument( + !stopped, "AsyncSparkMicroBatchPlanner for {} was already stopped", table().name()); + stopped = true; + LOG.info("Stopping AsyncSparkMicroBatchPlanner for table: {}", table().name()); + executor.shutdownNow(); + boolean terminated = false; + try { + terminated = + executor.awaitTermination( + readConf().streamingSnapshotPollingIntervalMs() * 2, TimeUnit.MILLISECONDS); + } catch (InterruptedException ignored) { + // Restore interrupt status + Thread.currentThread().interrupt(); + } + LOG.info("AsyncSparkMicroBatchPlanner for table: {}, stopped: {}", table().name(), terminated); + } + + @Override + public void close() { + stop(); + } + + /** + * Spark can call this multiple times; it should produce the same answer every time. 
+ * + * @param startOffset the starting offset of this microbatch, position is inclusive + * @param endOffset the end offset of this microbatch, position is exclusive + * @return the list of files to scan between these offsets + */ + @Override + public synchronized List planFiles( + StreamingOffset startOffset, StreamingOffset endOffset) { + return planFilesCache.get( + Pair.of(startOffset, endOffset), + key -> { + LOG.info( + "running planFiles for {}, startOffset: {}, endOffset: {}", + table().name(), + startOffset, + endOffset); + List result = new LinkedList<>(); + Pair elem; + StreamingOffset currentOffset; + boolean shouldTerminate = false; + long filesInPlan = 0; + long rowsInPlan = 0; + + do { + try { + elem = queue.pollFirst(QUEUE_POLL_TIMEOUT_MS, TimeUnit.MILLISECONDS); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + throw new RuntimeException("Interrupted while polling queue", e); + } + + if (elem != null) { + currentOffset = elem.first(); + LOG.debug("planFiles consumed: {}", currentOffset); + FileScanTask currentTask = elem.second(); + filesInPlan += 1; + long elemRows = currentTask.file().recordCount(); + rowsInPlan += elemRows; + queuedFileCount.decrementAndGet(); + queuedRowCount.addAndGet(-elemRows); + result.add(currentTask); + + // try to peek at the next entry of the queue and see if we should stop + Pair nextElem = queue.peekFirst(); + boolean endOffsetPeek = false; + if (nextElem != null) { + endOffsetPeek = endOffset.equals(nextElem.first()); + } + // end offset may be synthetic and not exist in the queue + boolean endOffsetSynthetic = + currentOffset.snapshotId() == endOffset.snapshotId() + && (currentOffset.position() + 1) == endOffset.position(); + shouldTerminate = endOffsetPeek || endOffsetSynthetic; + } else { + LOG.trace("planFiles hasn't reached {}, waiting", endOffset); + } + } while (!shouldTerminate + && refreshFailedThrowable == null + && fillQueueFailedThrowable == null); + + if 
(refreshFailedThrowable != null) { + throw new RuntimeException("Table refresh failed", refreshFailedThrowable); + } + + if (fillQueueFailedThrowable != null) { + throw new RuntimeException("Queue filling failed", fillQueueFailedThrowable); + } + + LOG.info( + "completed planFiles for {}, startOffset: {}, endOffset: {}, files: {}, rows: {}", + table().name(), + startOffset, + endOffset, + filesInPlan, + rowsInPlan); + return result; + }); + } + + /** + * This needs to be non destructive on the queue as spark could call this multiple times. Each + * time, depending on the table state it could return something different + * + * @param startOffset the starting offset of the next microbatch + * @param limit a limit for how many files/bytes/rows the next microbatch should include + * @return The end offset to use for the next microbatch, null signals that no data is available + */ + @Override + public synchronized StreamingOffset latestOffset(StreamingOffset startOffset, ReadLimit limit) { + LOG.info( + "running latestOffset for {}, startOffset: {}, limit: {}", + table().name(), + startOffset, + limit); + + if (table().currentSnapshot() == null) { + LOG.info("latestOffset returning START_OFFSET, currentSnapshot() is null"); + return StreamingOffset.START_OFFSET; + } + + if (table().currentSnapshot().timestampMillis() < readConf().streamFromTimestamp()) { + LOG.info("latestOffset returning START_OFFSET, currentSnapshot() < fromTimestamp"); + return StreamingOffset.START_OFFSET; + } + + // if any exceptions were encountered in the background process, raise them here + if (refreshFailedThrowable != null) { + throw new RuntimeException(refreshFailedThrowable); + } + if (fillQueueFailedThrowable != null) { + throw new RuntimeException(fillQueueFailedThrowable); + } + + // if we want to read all available we don't need to scan files, just snapshots + if (limit instanceof ReadAllAvailable) { + // If Trigger.AvailableNow cap is set, return it directly + if 
(this.lastOffsetForTriggerAvailableNow != null) { + return this.lastOffsetForTriggerAvailableNow; + } + Snapshot lastValidSnapshot = table().snapshot(startOffset.snapshotId()); + Snapshot nextValidSnapshot; + do { + nextValidSnapshot = nextValidSnapshot(lastValidSnapshot); + if (nextValidSnapshot != null) { + lastValidSnapshot = nextValidSnapshot; + } + } while (nextValidSnapshot != null); + return new StreamingOffset( + lastValidSnapshot.snapshotId(), + MicroBatchUtils.addedFilesCount(table(), lastValidSnapshot), + false); + } + + return computeLimitedOffset(limit); + } + + private StreamingOffset computeLimitedOffset(ReadLimit limit) { + UnpackedLimits unpackedLimits = new UnpackedLimits(limit); + long rowsSeen = 0; + long filesSeen = 0; + LOG.debug( + "latestOffset queue status, queuedFiles: {}, queuedRows: {}", + queuedFileCount.get(), + queuedRowCount.get()); + + List> queueSnapshot = Lists.newArrayList(queue); + Pair queueTail = + queueSnapshot.isEmpty() ? null : queueSnapshot.get(queueSnapshot.size() - 1); + + for (int i = 0; i < queueSnapshot.size(); i++) { + Pair elem = queueSnapshot.get(i); + long fileRows = elem.second().file().recordCount(); + + // Hard limit on files - stop BEFORE exceeding + if (filesSeen + 1 > unpackedLimits.getMaxFiles()) { + if (filesSeen == 0) { + return null; + } + LOG.debug( + "latestOffset hit file limit at {}, rows: {}, files: {}", + elem.first(), + rowsSeen, + filesSeen); + return elem.first(); + } + + // Soft limit on rows - include file FIRST, then check + rowsSeen += fileRows; + filesSeen += 1; + + // Check if we've hit the row limit after including this file + if (rowsSeen >= unpackedLimits.getMaxRows()) { + if (filesSeen == 1 && rowsSeen > unpackedLimits.getMaxRows()) { + LOG.warn( + "File {} at offset {} contains {} records, exceeding maxRecordsPerMicroBatch limit of {}. " + + "This file will be processed entirely to guarantee forward progress. 
" + + "Consider increasing the limit or writing smaller files to avoid unexpected memory usage.", + elem.second().file().location(), + elem.first(), + fileRows, + unpackedLimits.getMaxRows()); + } + // Return the offset of the NEXT element (or synthesize tail+1) + if (i + 1 < queueSnapshot.size()) { + LOG.debug( + "latestOffset hit row limit at {}, rows: {}, files: {}", + queueSnapshot.get(i + 1).first(), + rowsSeen, + filesSeen); + return queueSnapshot.get(i + 1).first(); + } else { + // This is the last element - return tail+1 + StreamingOffset current = elem.first(); + StreamingOffset result = + new StreamingOffset( + current.snapshotId(), current.position() + 1, current.shouldScanAllFiles()); + LOG.debug( + "latestOffset hit row limit at tail {}, rows: {}, files: {}", + result, + rowsSeen, + filesSeen); + return result; + } + } + } + + // if we got here there aren't enough files to exceed our limits + if (queueTail != null) { + StreamingOffset tailOffset = queueTail.first(); + // we have to increment the position by 1 since we want to include the tail in the read and + // position is non-inclusive + StreamingOffset latestOffset = + new StreamingOffset( + tailOffset.snapshotId(), tailOffset.position() + 1, tailOffset.shouldScanAllFiles()); + LOG.debug("latestOffset returning all queued data {}", latestOffset); + return latestOffset; + } + + // if we got here the queue is empty + LOG.debug("latestOffset no data, returning null"); + return null; + } + + // Background task wrapper that traps exceptions + private void refreshAndTrapException() { + try { + table().refresh(); + } catch (Throwable t) { + LOG.error("Failed to refresh table {}", table().name(), t); + refreshFailedThrowable = t; + } + } + + // Background task wrapper that traps exceptions + private void fillQueueAndTrapException(Snapshot snapshot) { + try { + fillQueue(snapshot); + } catch (Throwable t) { + LOG.error("Failed to fill queue for table {}", table().name(), t); + fillQueueFailedThrowable = t; 
+ } + } + + /** Generate a MicroBatch based on input parameters and add to the queue */ + private void addMicroBatchToQueue( + Snapshot snapshot, long startFileIndex, long endFileIndex, boolean shouldScanAllFile) { + LOG.info("Adding MicroBatch for snapshot: {} to the queue", snapshot.snapshotId()); + MicroBatches.MicroBatch microBatch = + MicroBatches.from(snapshot, table().io()) + .caseSensitive(readConf().caseSensitive()) + .specsById(table().specs()) + .generate(startFileIndex, endFileIndex, Long.MAX_VALUE, shouldScanAllFile); + + long position = startFileIndex; + for (FileScanTask task : microBatch.tasks()) { + Pair elem = + Pair.of(new StreamingOffset(microBatch.snapshotId(), position, shouldScanAllFile), task); + queuedFileCount.incrementAndGet(); + queuedRowCount.addAndGet(task.file().recordCount()); + queue.addLast(elem); + position += 1; + } + if (LOG.isDebugEnabled()) { + StringBuilder sb = new StringBuilder("\n"); + for (Pair elem : queue) { + sb.append(elem.first()).append("\n"); + } + LOG.debug(sb.toString()); + } + lastQueuedSnapshot = snapshot; + } + + private void fillQueue(StreamingOffset fromOffset, StreamingOffset toOffset) { + LOG.debug("filling queue from {}, to: {}", fromOffset, toOffset); + Snapshot currentSnapshot = table().snapshot(fromOffset.snapshotId()); + // this could be a partial snapshot so add it outside the loop + if (currentSnapshot != null) { + addMicroBatchToQueue( + currentSnapshot, + fromOffset.position(), + MicroBatchUtils.addedFilesCount(table(), currentSnapshot), + fromOffset.shouldScanAllFiles()); + } + if (toOffset != null) { + if (currentSnapshot != null) { + while (currentSnapshot.snapshotId() != toOffset.snapshotId()) { + currentSnapshot = nextValidSnapshot(currentSnapshot); + if (currentSnapshot != null) { + addMicroBatchToQueue( + currentSnapshot, + 0, + MicroBatchUtils.addedFilesCount(table(), currentSnapshot), + false); + } else { + break; + } + } + } + // toOffset snapshot already added in loop when 
currentSnapshot == toOffset + } else { + fillQueueInitialBuffer(currentSnapshot); + } + } + + private void fillQueueInitialBuffer(Snapshot startSnapshot) { + // toOffset is null - fill initial buffer to prevent queue starvation before background + // thread starts. Use configured limits to avoid loading all snapshots + // (which could cause OOM on tables with thousands of snapshots). + long targetRows = readConf().asyncQueuePreloadRowLimit(); + long targetFiles = readConf().asyncQueuePreloadFileLimit(); + + Snapshot preloadEndSnapshot = initialPreloadEndSnapshot(); + if (preloadEndSnapshot == null) { + return; // Empty table + } + + // START_OFFSET case: initialize using nextValidSnapshot which respects timestamp filtering + Snapshot current = startSnapshot; + if (current == null) { + current = nextValidSnapshot(null); + if (current != null) { + addMicroBatchToQueue(current, 0, MicroBatchUtils.addedFilesCount(table(), current), false); + } + } + + // Continue loading more snapshots within safety limits + if (current != null) { + while ((queuedRowCount.get() < targetRows || queuedFileCount.get() < targetFiles) + && current.snapshotId() != preloadEndSnapshot.snapshotId()) { + current = nextValidSnapshot(current); + if (current != null) { + addMicroBatchToQueue( + current, 0, MicroBatchUtils.addedFilesCount(table(), current), false); + } else { + break; + } + } + } + } + + private Snapshot initialPreloadEndSnapshot() { + if (lastOffsetForTriggerAvailableNow != null) { + return table().snapshot(lastOffsetForTriggerAvailableNow.snapshotId()); + } + + return table().currentSnapshot(); + } + + @VisibleForTesting + static boolean reachedAvailableNowCap( + Snapshot readFrom, StreamingOffset lastOffsetForTriggerAvailableNow) { + return lastOffsetForTriggerAvailableNow != null + && readFrom != null + && readFrom.snapshotId() == lastOffsetForTriggerAvailableNow.snapshotId(); + } + + /** Try to populate the queue with data from unread snapshots */ + private void 
fillQueue(Snapshot readFrom) { + // Don't add beyond cap for Trigger.AvailableNow + if (reachedAvailableNowCap(readFrom, lastOffsetForTriggerAvailableNow)) { + LOG.debug( + "Reached cap snapshot {}, not adding more", + this.lastOffsetForTriggerAvailableNow.snapshotId()); + return; + } + + if ((queuedRowCount.get() > minQueuedRows) || (queuedFileCount.get() > minQueuedFiles)) { + // we have enough data buffered, check back shortly + LOG.debug( + "Buffer is full, {} > {} or {} > {}", + queuedRowCount.get(), + minQueuedRows, + queuedFileCount.get(), + minQueuedFiles); + } else { + // add an entire snapshot to the queue + Snapshot nextValidSnapshot = nextValidSnapshot(readFrom); + if (nextValidSnapshot != null) { + addMicroBatchToQueue( + nextValidSnapshot, + 0, + MicroBatchUtils.addedFilesCount(table(), nextValidSnapshot), + false); + } else { + LOG.debug("No snapshots ready to be read"); + } + } + } +} diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/BaseSparkMicroBatchPlanner.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/BaseSparkMicroBatchPlanner.java new file mode 100644 index 000000000000..9298c2bbdfcc --- /dev/null +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/BaseSparkMicroBatchPlanner.java @@ -0,0 +1,151 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.spark.source; + +import java.util.Locale; +import org.apache.iceberg.DataOperations; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.Table; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.spark.SparkReadConf; +import org.apache.iceberg.spark.SparkReadOptions; +import org.apache.iceberg.util.SnapshotUtil; +import org.apache.spark.sql.connector.read.streaming.CompositeReadLimit; +import org.apache.spark.sql.connector.read.streaming.ReadLimit; +import org.apache.spark.sql.connector.read.streaming.ReadMaxFiles; +import org.apache.spark.sql.connector.read.streaming.ReadMaxRows; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +abstract class BaseSparkMicroBatchPlanner implements SparkMicroBatchPlanner { + private static final Logger LOG = LoggerFactory.getLogger(BaseSparkMicroBatchPlanner.class); + private final Table table; + private final SparkReadConf readConf; + + BaseSparkMicroBatchPlanner(Table table, SparkReadConf readConf) { + this.table = table; + this.readConf = readConf; + } + + protected Table table() { + return table; + } + + protected SparkReadConf readConf() { + return readConf; + } + + protected boolean shouldProcess(Snapshot snapshot) { + String op = snapshot.operation(); + switch (op) { + case DataOperations.APPEND: + return true; + case DataOperations.REPLACE: + return false; + case DataOperations.DELETE: + Preconditions.checkState( + readConf.streamingSkipDeleteSnapshots(), + "Cannot process delete snapshot: %s, to 
ignore deletes, set %s=true", + snapshot.snapshotId(), + SparkReadOptions.STREAMING_SKIP_DELETE_SNAPSHOTS); + return false; + case DataOperations.OVERWRITE: + Preconditions.checkState( + readConf.streamingSkipOverwriteSnapshots(), + "Cannot process overwrite snapshot: %s, to ignore overwrites, set %s=true", + snapshot.snapshotId(), + SparkReadOptions.STREAMING_SKIP_OVERWRITE_SNAPSHOTS); + return false; + default: + throw new IllegalStateException( + String.format( + "Cannot process unknown snapshot operation: %s (snapshot id %s)", + op.toLowerCase(Locale.ROOT), snapshot.snapshotId())); + } + } + + /** + * Get the next snapshot skipping over rewrite and delete snapshots. Async must handle nulls. + * + * @param curSnapshot the current snapshot + * @return the next valid snapshot (not a rewrite or delete snapshot), returns null if all + * remaining snapshots should be skipped. + */ + protected Snapshot nextValidSnapshot(Snapshot curSnapshot) { + Snapshot nextSnapshot; + // if there were no valid snapshots, check for an initialOffset again + if (curSnapshot == null) { + StreamingOffset startingOffset = + MicroBatchUtils.determineStartingOffset(table, readConf.streamFromTimestamp()); + LOG.debug("determineStartingOffset picked startingOffset: {}", startingOffset); + if (StreamingOffset.START_OFFSET.equals(startingOffset)) { + return null; + } + nextSnapshot = table.snapshot(startingOffset.snapshotId()); + } else { + if (curSnapshot.snapshotId() == table.currentSnapshot().snapshotId()) { + return null; + } + nextSnapshot = SnapshotUtil.snapshotAfter(table, curSnapshot.snapshotId()); + } + // skip over rewrite and delete snapshots + while (!shouldProcess(nextSnapshot)) { + LOG.debug("Skipping snapshot: {}", nextSnapshot); + // if the currentSnapShot was also the mostRecentSnapshot then break + // avoids snapshotAfter throwing exception since there are no more snapshots to process + if (nextSnapshot.snapshotId() == table.currentSnapshot().snapshotId()) { + return null; + } 
+ nextSnapshot = SnapshotUtil.snapshotAfter(table, nextSnapshot.snapshotId()); + } + return nextSnapshot; + } + + static class UnpackedLimits { + private long maxRows = Integer.MAX_VALUE; + private long maxFiles = Integer.MAX_VALUE; + + UnpackedLimits(ReadLimit limit) { + if (limit instanceof CompositeReadLimit) { + ReadLimit[] compositeLimits = ((CompositeReadLimit) limit).getReadLimits(); + for (ReadLimit individualLimit : compositeLimits) { + if (individualLimit instanceof ReadMaxRows) { + ReadMaxRows readMaxRows = (ReadMaxRows) individualLimit; + this.maxRows = Math.min(this.maxRows, readMaxRows.maxRows()); + } else if (individualLimit instanceof ReadMaxFiles) { + ReadMaxFiles readMaxFiles = (ReadMaxFiles) individualLimit; + this.maxFiles = Math.min(this.maxFiles, readMaxFiles.maxFiles()); + } + } + } else if (limit instanceof ReadMaxRows) { + this.maxRows = ((ReadMaxRows) limit).maxRows(); + } else if (limit instanceof ReadMaxFiles) { + this.maxFiles = ((ReadMaxFiles) limit).maxFiles(); + } + } + + public long getMaxRows() { + return maxRows; + } + + public long getMaxFiles() { + return maxFiles; + } + } +} diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/MicroBatchUtils.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/MicroBatchUtils.java new file mode 100644 index 000000000000..7c73e3f416e3 --- /dev/null +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/MicroBatchUtils.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.spark.source; + +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.SnapshotChanges; +import org.apache.iceberg.SnapshotSummary; +import org.apache.iceberg.Table; +import org.apache.iceberg.relocated.com.google.common.collect.Iterables; +import org.apache.iceberg.util.PropertyUtil; +import org.apache.iceberg.util.SnapshotUtil; + +class MicroBatchUtils { + + private MicroBatchUtils() {} + + static StreamingOffset determineStartingOffset(Table table, long fromTimestamp) { + if (table.currentSnapshot() == null) { + return StreamingOffset.START_OFFSET; + } + + if (fromTimestamp == Long.MIN_VALUE) { + // start from the oldest snapshot, since default value is MIN_VALUE + // avoids looping to find first snapshot + return new StreamingOffset(SnapshotUtil.oldestAncestor(table).snapshotId(), 0, false); + } + + if (table.currentSnapshot().timestampMillis() < fromTimestamp) { + return StreamingOffset.START_OFFSET; + } + + try { + Snapshot snapshot = SnapshotUtil.oldestAncestorAfter(table, fromTimestamp); + if (snapshot != null) { + return new StreamingOffset(snapshot.snapshotId(), 0, false); + } else { + return StreamingOffset.START_OFFSET; + } + } catch (IllegalStateException e) { + // could not determine the first snapshot after the timestamp. 
use the oldest ancestor instead + return new StreamingOffset(SnapshotUtil.oldestAncestor(table).snapshotId(), 0, false); + } + } + + static long addedFilesCount(Table table, Snapshot snapshot) { + long addedFilesCount = + PropertyUtil.propertyAsLong(snapshot.summary(), SnapshotSummary.ADDED_FILES_PROP, -1); + return addedFilesCount == -1 + ? Iterables.size( + SnapshotChanges.builderFor(table).snapshot(snapshot).build().addedDataFiles()) + : addedFilesCount; + } +} diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkMicroBatchPlanner.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkMicroBatchPlanner.java new file mode 100644 index 000000000000..1986ddac5d8e --- /dev/null +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkMicroBatchPlanner.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.spark.source; + +import java.util.List; +import org.apache.iceberg.FileScanTask; +import org.apache.spark.sql.connector.read.streaming.ReadLimit; + +interface SparkMicroBatchPlanner { + /** + * Return the {@link FileScanTask}s for data added between the start and end offsets. 
+ * + * @param startOffset the offset to start planning from + * @param endOffset the offset to plan up to + * @return file scan tasks for data in the offset range + */ + List planFiles(StreamingOffset startOffset, StreamingOffset endOffset); + + /** + * Return the latest offset the stream can advance to from {@code startOffset}, respecting the + * given {@link ReadLimit}. + * + * @param startOffset the current offset of the stream + * @param limit the read limit bounding how far ahead to advance + * @return the latest available offset, or {@code null} if no new data is available + */ + StreamingOffset latestOffset(StreamingOffset startOffset, ReadLimit limit); + + /** Stop the planner and release any resources. */ + void stop(); +} diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkMicroBatchStream.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkMicroBatchStream.java index d54246e6d513..a1ff767fe2a0 100644 --- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkMicroBatchStream.java +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkMicroBatchStream.java @@ -26,48 +26,32 @@ import java.io.UncheckedIOException; import java.nio.charset.StandardCharsets; import java.util.List; -import java.util.Locale; import java.util.function.Supplier; import org.apache.hadoop.conf.Configuration; import org.apache.iceberg.CombinedScanTask; -import org.apache.iceberg.DataOperations; import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.ManifestFile; -import org.apache.iceberg.MicroBatches; -import org.apache.iceberg.MicroBatches.MicroBatch; import org.apache.iceberg.Schema; import org.apache.iceberg.SchemaParser; import org.apache.iceberg.Snapshot; -import org.apache.iceberg.SnapshotChanges; -import org.apache.iceberg.SnapshotSummary; import org.apache.iceberg.Table; import org.apache.iceberg.hadoop.HadoopFileIO; import org.apache.iceberg.io.CloseableIterable; -import 
org.apache.iceberg.io.CloseableIterator; import org.apache.iceberg.io.FileIO; import org.apache.iceberg.io.InputFile; import org.apache.iceberg.io.OutputFile; import org.apache.iceberg.relocated.com.google.common.base.Joiner; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Iterables; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.spark.SparkReadConf; -import org.apache.iceberg.spark.SparkReadOptions; import org.apache.iceberg.types.Types; -import org.apache.iceberg.util.Pair; -import org.apache.iceberg.util.PropertyUtil; -import org.apache.iceberg.util.SnapshotUtil; import org.apache.iceberg.util.TableScanUtil; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.broadcast.Broadcast; import org.apache.spark.sql.connector.read.InputPartition; import org.apache.spark.sql.connector.read.PartitionReaderFactory; -import org.apache.spark.sql.connector.read.streaming.CompositeReadLimit; import org.apache.spark.sql.connector.read.streaming.MicroBatchStream; import org.apache.spark.sql.connector.read.streaming.Offset; import org.apache.spark.sql.connector.read.streaming.ReadLimit; -import org.apache.spark.sql.connector.read.streaming.ReadMaxFiles; -import org.apache.spark.sql.connector.read.streaming.ReadMaxRows; import org.apache.spark.sql.connector.read.streaming.SupportsTriggerAvailableNow; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -79,6 +63,7 @@ public class SparkMicroBatchStream implements MicroBatchStream, SupportsTriggerA private final Table table; private final Supplier fileIO; + private final SparkReadConf readConf; private final String branch; private final boolean caseSensitive; private final String expectedSchema; @@ -89,12 +74,11 @@ public class SparkMicroBatchStream implements MicroBatchStream, SupportsTriggerA private final long splitOpenFileCost; private final boolean 
localityPreferred; private final StreamingOffset initialOffset; - private final boolean skipDelete; - private final boolean skipOverwrite; private final long fromTimestamp; private final int maxFilesPerMicroBatch; private final int maxRecordsPerMicroBatch; private final boolean cacheDeleteFilesOnExecutors; + private SparkMicroBatchPlanner planner; private StreamingOffset lastOffsetForTriggerAvailableNow; SparkMicroBatchStream( @@ -106,6 +90,7 @@ public class SparkMicroBatchStream implements MicroBatchStream, SupportsTriggerA String checkpointLocation) { this.table = table; this.fileIO = fileIO; + this.readConf = readConf; this.branch = readConf.branch(); this.caseSensitive = readConf.caseSensitive(); this.expectedSchema = SchemaParser.toJson(expectedSchema); @@ -124,9 +109,6 @@ public class SparkMicroBatchStream implements MicroBatchStream, SupportsTriggerA new InitialOffsetStore( table, checkpointLocation, fromTimestamp, sparkContext.hadoopConfiguration()); this.initialOffset = initialOffsetStore.initialOffset(); - - this.skipDelete = readConf.streamingSkipDeleteSnapshots(); - this.skipOverwrite = readConf.streamingSkipOverwriteSnapshots(); } @Override @@ -141,8 +123,8 @@ public Offset latestOffset() { } Snapshot latestSnapshot = table.currentSnapshot(); - - return new StreamingOffset(latestSnapshot.snapshotId(), addedFilesCount(latestSnapshot), false); + return new StreamingOffset( + latestSnapshot.snapshotId(), MicroBatchUtils.addedFilesCount(table, latestSnapshot), false); } @Override @@ -161,7 +143,11 @@ public InputPartition[] planInputPartitions(Offset start, Offset end) { StreamingOffset endOffset = (StreamingOffset) end; StreamingOffset startOffset = (StreamingOffset) start; - List fileScanTasks = planFiles(startOffset, endOffset); + if (planner == null) { + initializePlanner(startOffset, endOffset); + } + + List fileScanTasks = planner.planFiles(startOffset, endOffset); CloseableIterable splitTasks = 
TableScanUtil.splitFiles(CloseableIterable.withNoopClose(fileScanTasks), splitSize); @@ -171,7 +157,6 @@ public InputPartition[] planInputPartitions(Offset start, Offset end) { String[][] locations = computePreferredLocations(combinedScanTasks); InputPartition[] partitions = new InputPartition[combinedScanTasks.size()]; - for (int index = 0; index < combinedScanTasks.size(); index++) { partitions[index] = new SparkInputPartition( @@ -214,318 +199,35 @@ public Offset deserializeOffset(String json) { public void commit(Offset end) {} @Override - public void stop() {} - - private List planFiles(StreamingOffset startOffset, StreamingOffset endOffset) { - List fileScanTasks = Lists.newArrayList(); - StreamingOffset batchStartOffset = - StreamingOffset.START_OFFSET.equals(startOffset) - ? determineStartingOffset(table, fromTimestamp) - : startOffset; - - StreamingOffset currentOffset = null; - - // [(startOffset : startFileIndex), (endOffset : endFileIndex) ) - do { - long endFileIndex; - if (currentOffset == null) { - currentOffset = batchStartOffset; - } else { - Snapshot snapshotAfter = SnapshotUtil.snapshotAfter(table, currentOffset.snapshotId()); - // it may happen that we need to read this snapshot partially in case it's equal to - // endOffset. 
- if (currentOffset.snapshotId() != endOffset.snapshotId()) { - currentOffset = new StreamingOffset(snapshotAfter.snapshotId(), 0L, false); - } else { - currentOffset = endOffset; - } - } - - Snapshot snapshot = table.snapshot(currentOffset.snapshotId()); - - validateCurrentSnapshotExists(snapshot, currentOffset); - - if (!shouldProcess(snapshot)) { - LOG.debug("Skipping snapshot: {} of table {}", currentOffset.snapshotId(), table.name()); - continue; - } - - Snapshot currentSnapshot = table.snapshot(currentOffset.snapshotId()); - if (currentOffset.snapshotId() == endOffset.snapshotId()) { - endFileIndex = endOffset.position(); - } else { - endFileIndex = addedFilesCount(currentSnapshot); - } - - MicroBatch latestMicroBatch = - MicroBatches.from(currentSnapshot, table.io()) - .caseSensitive(caseSensitive) - .specsById(table.specs()) - .generate( - currentOffset.position(), - endFileIndex, - Long.MAX_VALUE, - currentOffset.shouldScanAllFiles()); - - fileScanTasks.addAll(latestMicroBatch.tasks()); - } while (currentOffset.snapshotId() != endOffset.snapshotId()); - - return fileScanTasks; - } - - private boolean shouldProcess(Snapshot snapshot) { - String op = snapshot.operation(); - switch (op) { - case DataOperations.APPEND: - return true; - case DataOperations.REPLACE: - return false; - case DataOperations.DELETE: - Preconditions.checkState( - skipDelete, - "Cannot process delete snapshot: %s, to ignore deletes, set %s=true", - snapshot.snapshotId(), - SparkReadOptions.STREAMING_SKIP_DELETE_SNAPSHOTS); - return false; - case DataOperations.OVERWRITE: - Preconditions.checkState( - skipOverwrite, - "Cannot process overwrite snapshot: %s, to ignore overwrites, set %s=true", - snapshot.snapshotId(), - SparkReadOptions.STREAMING_SKIP_OVERWRITE_SNAPSHOTS); - return false; - default: - throw new IllegalStateException( - String.format( - "Cannot process unknown snapshot operation: %s (snapshot id %s)", - op.toLowerCase(Locale.ROOT), snapshot.snapshotId())); - } - } - - 
private static StreamingOffset determineStartingOffset(Table table, Long fromTimestamp) { - if (table.currentSnapshot() == null) { - return StreamingOffset.START_OFFSET; - } - - if (fromTimestamp == null) { - // match existing behavior and start from the oldest snapshot - return new StreamingOffset(SnapshotUtil.oldestAncestor(table).snapshotId(), 0, false); - } - - if (table.currentSnapshot().timestampMillis() < fromTimestamp) { - return StreamingOffset.START_OFFSET; - } - - try { - Snapshot snapshot = SnapshotUtil.oldestAncestorAfter(table, fromTimestamp); - if (snapshot != null) { - return new StreamingOffset(snapshot.snapshotId(), 0, false); - } else { - return StreamingOffset.START_OFFSET; - } - } catch (IllegalStateException e) { - // could not determine the first snapshot after the timestamp. use the oldest ancestor instead - return new StreamingOffset(SnapshotUtil.oldestAncestor(table).snapshotId(), 0, false); + public void stop() { + if (planner != null) { + planner.stop(); } } - private static int getMaxFiles(ReadLimit readLimit) { - if (readLimit instanceof ReadMaxFiles) { - return ((ReadMaxFiles) readLimit).maxFiles(); - } - - if (readLimit instanceof CompositeReadLimit) { - // We do not expect a CompositeReadLimit to contain a nested CompositeReadLimit. - // In fact, it should only be a composite of two or more of ReadMinRows, ReadMaxRows and - // ReadMaxFiles, with no more than one of each. 
- ReadLimit[] limits = ((CompositeReadLimit) readLimit).getReadLimits(); - for (ReadLimit limit : limits) { - if (limit instanceof ReadMaxFiles) { - return ((ReadMaxFiles) limit).maxFiles(); - } - } - } - - // there is no ReadMaxFiles, so return the default - return Integer.MAX_VALUE; - } - - private static int getMaxRows(ReadLimit readLimit) { - if (readLimit instanceof ReadMaxRows) { - long maxRows = ((ReadMaxRows) readLimit).maxRows(); - return Math.toIntExact(maxRows); - } - - if (readLimit instanceof CompositeReadLimit) { - ReadLimit[] limits = ((CompositeReadLimit) readLimit).getReadLimits(); - for (ReadLimit limit : limits) { - if (limit instanceof ReadMaxRows) { - long maxRows = ((ReadMaxRows) limit).maxRows(); - return Math.toIntExact(maxRows); - } - } + private void initializePlanner(StreamingOffset startOffset, StreamingOffset endOffset) { + if (readConf.asyncMicroBatchPlanningEnabled()) { + this.planner = + new AsyncSparkMicroBatchPlanner( + table, readConf, startOffset, endOffset, lastOffsetForTriggerAvailableNow); + } else { + this.planner = + new SyncSparkMicroBatchPlanner(table, readConf, lastOffsetForTriggerAvailableNow); } - - // there is no ReadMaxRows, so return the default - return Integer.MAX_VALUE; } @Override - @SuppressWarnings("checkstyle:CyclomaticComplexity") public Offset latestOffset(Offset startOffset, ReadLimit limit) { - // calculate end offset get snapshotId from the startOffset Preconditions.checkArgument( startOffset instanceof StreamingOffset, "Invalid start offset: %s is not a StreamingOffset", startOffset); - table.refresh(); - if (table.currentSnapshot() == null) { - return StreamingOffset.START_OFFSET; - } - - if (table.currentSnapshot().timestampMillis() < fromTimestamp) { - return StreamingOffset.START_OFFSET; + if (planner == null) { + initializePlanner((StreamingOffset) startOffset, null); } - // end offset can expand to multiple snapshots - StreamingOffset startingOffset = (StreamingOffset) startOffset; - - if 
(startOffset.equals(StreamingOffset.START_OFFSET)) { - startingOffset = determineStartingOffset(table, fromTimestamp); - } - - Snapshot curSnapshot = table.snapshot(startingOffset.snapshotId()); - validateCurrentSnapshotExists(curSnapshot, startingOffset); - - // Use the pre-computed snapshotId when Trigger.AvailableNow is enabled. - long latestSnapshotId = - lastOffsetForTriggerAvailableNow != null - ? lastOffsetForTriggerAvailableNow.snapshotId() - : table.currentSnapshot().snapshotId(); - - int startPosOfSnapOffset = (int) startingOffset.position(); - - boolean scanAllFiles = startingOffset.shouldScanAllFiles(); - - boolean shouldContinueReading = true; - int curFilesAdded = 0; - long curRecordCount = 0; - int curPos = 0; - - // Note : we produce nextOffset with pos as non-inclusive - while (shouldContinueReading) { - // generate manifest index for the curSnapshot - List> indexedManifests = - MicroBatches.skippedManifestIndexesFromSnapshot( - table.io(), curSnapshot, startPosOfSnapOffset, scanAllFiles); - // this is under assumption we will be able to add at-least 1 file in the new offset - for (int idx = 0; idx < indexedManifests.size() && shouldContinueReading; idx++) { - // be rest assured curPos >= startFileIndex - curPos = indexedManifests.get(idx).second(); - try (CloseableIterable taskIterable = - MicroBatches.openManifestFile( - table.io(), - table.specs(), - caseSensitive, - curSnapshot, - indexedManifests.get(idx).first(), - scanAllFiles); - CloseableIterator taskIter = taskIterable.iterator()) { - while (taskIter.hasNext()) { - FileScanTask task = taskIter.next(); - if (curPos >= startPosOfSnapOffset) { - if ((curFilesAdded + 1) > getMaxFiles(limit)) { - // On including the file it might happen that we might exceed, the configured - // soft limit on the number of records, since this is a soft limit its acceptable. 
- shouldContinueReading = false; - break; - } - - curFilesAdded += 1; - curRecordCount += task.file().recordCount(); - - if (curRecordCount >= getMaxRows(limit)) { - // we included the file, so increment the number of files - // read in the current snapshot. - ++curPos; - shouldContinueReading = false; - break; - } - } - ++curPos; - } - } catch (IOException ioe) { - LOG.warn("Failed to close task iterable", ioe); - } - } - // if the currentSnapShot was also the latestSnapshot then break - if (curSnapshot.snapshotId() == latestSnapshotId) { - break; - } - - // if everything was OK and we consumed complete snapshot then move to next snapshot - if (shouldContinueReading) { - Snapshot nextValid = nextValidSnapshot(curSnapshot); - if (nextValid == null) { - // nextValid implies all the remaining snapshots should be skipped. - break; - } - // we found the next available snapshot, continue from there. - curSnapshot = nextValid; - startPosOfSnapOffset = -1; - // if anyhow we are moving to next snapshot we should only scan addedFiles - scanAllFiles = false; - } - } - - StreamingOffset latestStreamingOffset = - new StreamingOffset(curSnapshot.snapshotId(), curPos, scanAllFiles); - - // if no new data arrived, then return null. - return latestStreamingOffset.equals(startingOffset) ? null : latestStreamingOffset; - } - - /** - * Get the next snapshot skiping over rewrite and delete snapshots. - * - * @param curSnapshot the current snapshot - * @return the next valid snapshot (not a rewrite or delete snapshot), returns null if all - * remaining snapshots should be skipped. 
- */ - private Snapshot nextValidSnapshot(Snapshot curSnapshot) { - Snapshot nextSnapshot = SnapshotUtil.snapshotAfter(table, curSnapshot.snapshotId()); - // skip over rewrite and delete snapshots - while (!shouldProcess(nextSnapshot)) { - LOG.debug("Skipping snapshot: {} of table {}", nextSnapshot.snapshotId(), table.name()); - // if the currentSnapShot was also the mostRecentSnapshot then break - if (nextSnapshot.snapshotId() == table.currentSnapshot().snapshotId()) { - return null; - } - nextSnapshot = SnapshotUtil.snapshotAfter(table, nextSnapshot.snapshotId()); - } - return nextSnapshot; - } - - private long addedFilesCount(Snapshot snapshot) { - long addedFilesCount = - PropertyUtil.propertyAsLong(snapshot.summary(), SnapshotSummary.ADDED_FILES_PROP, -1); - // If snapshotSummary doesn't have SnapshotSummary.ADDED_FILES_PROP, - // iterate through addedFiles iterator to find addedFilesCount. - return addedFilesCount == -1 - ? Iterables.size( - SnapshotChanges.builderFor(table).snapshot(snapshot).build().addedDataFiles()) - : addedFilesCount; - } - - private void validateCurrentSnapshotExists(Snapshot snapshot, StreamingOffset currentOffset) { - if (snapshot == null) { - throw new IllegalStateException( - String.format( - Locale.ROOT, - "Cannot load current offset at snapshot %d, the snapshot was expired or removed", - currentOffset.snapshotId())); - } + return planner.latestOffset((StreamingOffset) startOffset, limit); } @Override @@ -553,6 +255,11 @@ public void prepareForTriggerAvailableNow() { (StreamingOffset) latestOffset(initialOffset, ReadLimit.allAvailable()); LOG.info("lastOffset for Trigger.AvailableNow is {}", lastOffsetForTriggerAvailableNow.json()); + + if (planner != null) { + planner.stop(); + planner = null; + } } private static class InitialOffsetStore { @@ -576,7 +283,7 @@ public StreamingOffset initialOffset() { } table.refresh(); - StreamingOffset offset = determineStartingOffset(table, fromTimestamp); + StreamingOffset offset = 
MicroBatchUtils.determineStartingOffset(table, fromTimestamp); OutputFile outputFile = io.newOutputFile(initialOffsetLocation); writeOffset(offset, outputFile); diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SyncSparkMicroBatchPlanner.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SyncSparkMicroBatchPlanner.java new file mode 100644 index 000000000000..f1b0029c5432 --- /dev/null +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SyncSparkMicroBatchPlanner.java @@ -0,0 +1,249 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.spark.source; + +import java.io.IOException; +import java.util.List; +import java.util.Locale; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.ManifestFile; +import org.apache.iceberg.MicroBatches; +import org.apache.iceberg.MicroBatches.MicroBatch; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.Table; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.io.CloseableIterator; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.spark.SparkReadConf; +import org.apache.iceberg.util.Pair; +import org.apache.iceberg.util.SnapshotUtil; +import org.apache.spark.sql.connector.read.streaming.ReadLimit; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +class SyncSparkMicroBatchPlanner extends BaseSparkMicroBatchPlanner { + private static final Logger LOG = LoggerFactory.getLogger(SyncSparkMicroBatchPlanner.class); + + private final boolean caseSensitive; + private final long fromTimestamp; + private final StreamingOffset lastOffsetForTriggerAvailableNow; + + SyncSparkMicroBatchPlanner( + Table table, SparkReadConf readConf, StreamingOffset lastOffsetForTriggerAvailableNow) { + super(table, readConf); + this.caseSensitive = readConf().caseSensitive(); + this.fromTimestamp = readConf().streamFromTimestamp(); + this.lastOffsetForTriggerAvailableNow = lastOffsetForTriggerAvailableNow; + } + + @Override + public List planFiles(StreamingOffset startOffset, StreamingOffset endOffset) { + List fileScanTasks = Lists.newArrayList(); + StreamingOffset batchStartOffset = + StreamingOffset.START_OFFSET.equals(startOffset) + ? 
MicroBatchUtils.determineStartingOffset(table(), fromTimestamp) + : startOffset; + + StreamingOffset currentOffset = null; + + // [(startOffset : startFileIndex), (endOffset : endFileIndex) ) + do { + long endFileIndex; + if (currentOffset == null) { + currentOffset = batchStartOffset; + } else { + Snapshot snapshotAfter = SnapshotUtil.snapshotAfter(table(), currentOffset.snapshotId()); + // it may happen that we need to read this snapshot partially in case it's equal to + // endOffset. + if (currentOffset.snapshotId() != endOffset.snapshotId()) { + currentOffset = new StreamingOffset(snapshotAfter.snapshotId(), 0L, false); + } else { + currentOffset = endOffset; + } + } + + Snapshot snapshot = table().snapshot(currentOffset.snapshotId()); + + validateCurrentSnapshotExists(snapshot, currentOffset); + + if (!shouldProcess(snapshot)) { + LOG.debug("Skipping snapshot: {} of table {}", currentOffset.snapshotId(), table().name()); + continue; + } + + Snapshot currentSnapshot = table().snapshot(currentOffset.snapshotId()); + if (currentOffset.snapshotId() == endOffset.snapshotId()) { + endFileIndex = endOffset.position(); + } else { + endFileIndex = MicroBatchUtils.addedFilesCount(table(), currentSnapshot); + } + + MicroBatch latestMicroBatch = + MicroBatches.from(currentSnapshot, table().io()) + .caseSensitive(caseSensitive) + .specsById(table().specs()) + .generate( + currentOffset.position(), + endFileIndex, + Long.MAX_VALUE, + currentOffset.shouldScanAllFiles()); + + fileScanTasks.addAll(latestMicroBatch.tasks()); + } while (currentOffset.snapshotId() != endOffset.snapshotId()); + + return fileScanTasks; + } + + @Override + @SuppressWarnings("checkstyle:CyclomaticComplexity") + public StreamingOffset latestOffset(StreamingOffset startOffset, ReadLimit limit) { + table().refresh(); + if (table().currentSnapshot() == null) { + return StreamingOffset.START_OFFSET; + } + + if (table().currentSnapshot().timestampMillis() < fromTimestamp) { + return 
StreamingOffset.START_OFFSET; + } + + // end offset can expand to multiple snapshots + StreamingOffset startingOffset = startOffset; + + if (startOffset.equals(StreamingOffset.START_OFFSET)) { + startingOffset = MicroBatchUtils.determineStartingOffset(table(), fromTimestamp); + } + + Snapshot curSnapshot = table().snapshot(startingOffset.snapshotId()); + validateCurrentSnapshotExists(curSnapshot, startingOffset); + + // Use the pre-computed snapshotId when Trigger.AvailableNow is enabled. + long latestSnapshotId = + lastOffsetForTriggerAvailableNow != null + ? lastOffsetForTriggerAvailableNow.snapshotId() + : table().currentSnapshot().snapshotId(); + + int startPosOfSnapOffset = (int) startingOffset.position(); + + boolean scanAllFiles = startingOffset.shouldScanAllFiles(); + + boolean shouldContinueReading = true; + int curFilesAdded = 0; + long curRecordCount = 0; + int curPos = 0; + + // Extract limits once to avoid repeated calls in tight loop + UnpackedLimits unpackedLimits = new UnpackedLimits(limit); + long maxFiles = unpackedLimits.getMaxFiles(); + long maxRows = unpackedLimits.getMaxRows(); + + // Note : we produce nextOffset with pos as non-inclusive + while (shouldContinueReading) { + // generate manifest index for the curSnapshot + List> indexedManifests = + MicroBatches.skippedManifestIndexesFromSnapshot( + table().io(), curSnapshot, startPosOfSnapOffset, scanAllFiles); + // this is under assumption we will be able to add at-least 1 file in the new offset + for (int idx = 0; idx < indexedManifests.size() && shouldContinueReading; idx++) { + // be rest assured curPos >= startFileIndex + curPos = indexedManifests.get(idx).second(); + try (CloseableIterable taskIterable = + MicroBatches.openManifestFile( + table().io(), + table().specs(), + caseSensitive, + curSnapshot, + indexedManifests.get(idx).first(), + scanAllFiles); + CloseableIterator taskIter = taskIterable.iterator()) { + while (taskIter.hasNext()) { + FileScanTask task = taskIter.next(); + if 
(curPos >= startPosOfSnapOffset) { + if ((curFilesAdded + 1) > maxFiles) { + // On including the file it might happen that we might exceed, the configured + // soft limit on the number of records, since this is a soft limit its acceptable. + shouldContinueReading = false; + break; + } + + curFilesAdded += 1; + curRecordCount += task.file().recordCount(); + + if (curRecordCount >= maxRows) { + // we included the file, so increment the number of files + // read in the current snapshot. + if (curFilesAdded == 1 && curRecordCount > maxRows) { + LOG.warn( + "File {} contains {} records, exceeding maxRecordsPerMicroBatch limit of {}. " + + "This file will be processed entirely to guarantee forward progress. " + + "Consider increasing the limit or writing smaller files to avoid unexpected memory usage.", + task.file().location(), + task.file().recordCount(), + maxRows); + } + ++curPos; + shouldContinueReading = false; + break; + } + } + ++curPos; + } + } catch (IOException ioe) { + LOG.warn("Failed to close task iterable", ioe); + } + } + // if the currentSnapShot was also the latestSnapshot then break + if (curSnapshot.snapshotId() == latestSnapshotId) { + break; + } + + // if everything was OK and we consumed complete snapshot then move to next snapshot + if (shouldContinueReading) { + Snapshot nextValid = nextValidSnapshot(curSnapshot); + if (nextValid == null) { + // nextValid implies all the remaining snapshots should be skipped. + break; + } + // we found the next available snapshot, continue from there. + curSnapshot = nextValid; + startPosOfSnapOffset = -1; + // if anyhow we are moving to next snapshot we should only scan addedFiles + scanAllFiles = false; + } + } + + StreamingOffset latestStreamingOffset = + new StreamingOffset(curSnapshot.snapshotId(), curPos, scanAllFiles); + + // if no new data arrived, then return null. + return latestStreamingOffset.equals(startingOffset) ? 
null : latestStreamingOffset; + } + + @Override + public void stop() {} + + private void validateCurrentSnapshotExists(Snapshot snapshot, StreamingOffset currentOffset) { + if (snapshot == null) { + throw new IllegalStateException( + String.format( + Locale.ROOT, + "Cannot load current offset at snapshot %d, the snapshot was expired or removed", + currentOffset.snapshotId())); + } + } +} diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestAsyncSparkMicroBatchPlanner.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestAsyncSparkMicroBatchPlanner.java new file mode 100644 index 000000000000..b6017e2001e7 --- /dev/null +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestAsyncSparkMicroBatchPlanner.java @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.spark.source; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +import org.apache.iceberg.Snapshot; +import org.junit.jupiter.api.Test; + +class TestAsyncSparkMicroBatchPlanner { + + @Test + void reachedAvailableNowCapReturnsTrueOnlyForExactCapSnapshot() { + Snapshot capSnapshot = mockSnapshot(10L); + Snapshot laterSnapshotWithHigherId = mockSnapshot(20L); + Snapshot laterSnapshotWithLowerId = mockSnapshot(5L); + StreamingOffset capOffset = new StreamingOffset(10L, 3L, false); + + assertThat(AsyncSparkMicroBatchPlanner.reachedAvailableNowCap(capSnapshot, capOffset)).isTrue(); + assertThat( + AsyncSparkMicroBatchPlanner.reachedAvailableNowCap( + laterSnapshotWithHigherId, capOffset)) + .isFalse(); + assertThat( + AsyncSparkMicroBatchPlanner.reachedAvailableNowCap(laterSnapshotWithLowerId, capOffset)) + .isFalse(); + } + + @Test + void reachedAvailableNowCapReturnsFalseWhenCapOrSnapshotIsMissing() { + Snapshot readFrom = mockSnapshot(10L); + StreamingOffset capOffset = new StreamingOffset(10L, 1L, false); + + assertThat(AsyncSparkMicroBatchPlanner.reachedAvailableNowCap(readFrom, null)).isFalse(); + assertThat(AsyncSparkMicroBatchPlanner.reachedAvailableNowCap(null, capOffset)).isFalse(); + } + + private Snapshot mockSnapshot(long snapshotId) { + Snapshot snapshot = mock(Snapshot.class); + when(snapshot.snapshotId()).thenReturn(snapshotId); + return snapshot; + } +} diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestMicroBatchPlanningUtils.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestMicroBatchPlanningUtils.java new file mode 100644 index 000000000000..a9ce340fd4ec --- /dev/null +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestMicroBatchPlanningUtils.java @@ -0,0 +1,100 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more 
contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.spark.source; + +import static org.assertj.core.api.Assertions.assertThat; + +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.SnapshotSummary; +import org.apache.iceberg.Table; +import org.apache.iceberg.spark.CatalogTestBase; +import org.apache.spark.sql.connector.read.streaming.ReadLimit; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; +import org.junit.jupiter.api.extension.ExtendWith; + +@ExtendWith(ParameterizedTestExtension.class) +public class TestMicroBatchPlanningUtils extends CatalogTestBase { + + private Table table; + + @BeforeEach + public void setupTable() { + sql("DROP TABLE IF EXISTS %s", tableName); + sql( + "CREATE TABLE %s " + + "(id INT, data STRING) " + + "USING iceberg " + + "PARTITIONED BY (bucket(3, id))", + tableName); + this.table = validationCatalog.loadTable(tableIdent); + } + + @AfterEach + public void dropTable() { + sql("DROP TABLE IF EXISTS %s", tableName); + } + + @TestTemplate + public void testUnpackedLimitsCompositeChoosesMinimum() { + ReadLimit[] limits = + new ReadLimit[] { + ReadLimit.maxRows(10), ReadLimit.maxRows(4), ReadLimit.maxFiles(8), ReadLimit.maxFiles(2) 
+ }; + + ReadLimit composite = ReadLimit.compositeLimit(limits); + + BaseSparkMicroBatchPlanner.UnpackedLimits unpacked = + new BaseSparkMicroBatchPlanner.UnpackedLimits(composite); + + assertThat(unpacked.getMaxRows()).isEqualTo(4); + assertThat(unpacked.getMaxFiles()).isEqualTo(2); + } + + @TestTemplate + public void testDetermineStartingOffsetWithTimestampBetweenSnapshots() { + sql("INSERT INTO %s VALUES (1, 'one')", tableName); + table.refresh(); + long snapshot1Time = table.currentSnapshot().timestampMillis(); + + sql("INSERT INTO %s VALUES (2, 'two')", tableName); + table.refresh(); + long snapshot2Id = table.currentSnapshot().snapshotId(); + + StreamingOffset offset = MicroBatchUtils.determineStartingOffset(table, snapshot1Time + 1); + + assertThat(offset.snapshotId()).isEqualTo(snapshot2Id); + assertThat(offset.position()).isEqualTo(0L); + assertThat(offset.shouldScanAllFiles()).isFalse(); + } + + @TestTemplate + public void testAddedFilesCountUsesSummaryWhenPresent() { + sql("INSERT INTO %s VALUES (1, 'one')", tableName); + table.refresh(); + + long expectedAddedFiles = + Long.parseLong(table.currentSnapshot().summary().get(SnapshotSummary.ADDED_FILES_PROP)); + + long actual = MicroBatchUtils.addedFilesCount(table, table.currentSnapshot()); + + assertThat(actual).isEqualTo(expectedAddedFiles); + } +} diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreamingRead3.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreamingRead3.java index 80f2c6864051..5f9b460f3707 100644 --- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreamingRead3.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreamingRead3.java @@ -31,13 +31,17 @@ import java.util.concurrent.atomic.AtomicInteger; import java.util.stream.IntStream; import org.apache.iceberg.BaseTable; +import org.apache.iceberg.CatalogProperties; import 
org.apache.iceberg.DataFile; import org.apache.iceberg.DataFiles; import org.apache.iceberg.DataOperations; import org.apache.iceberg.DeleteFile; import org.apache.iceberg.FileFormat; +import org.apache.iceberg.FileScanTask; import org.apache.iceberg.Files; +import org.apache.iceberg.Parameter; import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; import org.apache.iceberg.RewriteFiles; import org.apache.iceberg.Schema; import org.apache.iceberg.Snapshot; @@ -50,15 +54,22 @@ import org.apache.iceberg.data.GenericRecord; import org.apache.iceberg.data.Record; import org.apache.iceberg.expressions.Expressions; +import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Iterables; import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.spark.CatalogTestBase; +import org.apache.iceberg.spark.SparkCatalogConfig; +import org.apache.iceberg.spark.SparkReadConf; import org.apache.iceberg.spark.SparkReadOptions; +import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.VoidFunction2; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.Row; +import org.apache.spark.sql.connector.read.InputPartition; +import org.apache.spark.sql.connector.read.streaming.Offset; import org.apache.spark.sql.internal.SQLConf; import org.apache.spark.sql.streaming.DataStreamWriter; import org.apache.spark.sql.streaming.OutputMode; @@ -73,10 +84,73 @@ @ExtendWith(ParameterizedTestExtension.class) public final class TestStructuredStreamingRead3 extends CatalogTestBase { + @Parameters(name = "catalogName = {0}, implementation = {1}, config = {2}, async = {3}") + public static Object[][] parameters() { + return new Object[][] { + { + 
SparkCatalogConfig.HIVE.catalogName(), + SparkCatalogConfig.HIVE.implementation(), + SparkCatalogConfig.HIVE.properties(), + false + }, + { + SparkCatalogConfig.HIVE.catalogName(), + SparkCatalogConfig.HIVE.implementation(), + SparkCatalogConfig.HIVE.properties(), + true + }, + { + SparkCatalogConfig.HADOOP.catalogName(), + SparkCatalogConfig.HADOOP.implementation(), + SparkCatalogConfig.HADOOP.properties(), + false + }, + { + SparkCatalogConfig.HADOOP.catalogName(), + SparkCatalogConfig.HADOOP.implementation(), + SparkCatalogConfig.HADOOP.properties(), + true + }, + { + SparkCatalogConfig.REST.catalogName(), + SparkCatalogConfig.REST.implementation(), + ImmutableMap.builder() + .putAll(SparkCatalogConfig.REST.properties()) + .put(CatalogProperties.URI, restCatalog.properties().get(CatalogProperties.URI)) + .build(), + false + }, + { + SparkCatalogConfig.REST.catalogName(), + SparkCatalogConfig.REST.implementation(), + ImmutableMap.builder() + .putAll(SparkCatalogConfig.REST.properties()) + .put(CatalogProperties.URI, restCatalog.properties().get(CatalogProperties.URI)) + .build(), + true + }, + { + SparkCatalogConfig.SPARK_SESSION.catalogName(), + SparkCatalogConfig.SPARK_SESSION.implementation(), + SparkCatalogConfig.SPARK_SESSION.properties(), + false + }, + { + SparkCatalogConfig.SPARK_SESSION.catalogName(), + SparkCatalogConfig.SPARK_SESSION.implementation(), + SparkCatalogConfig.SPARK_SESSION.properties(), + true + } + }; + } + private Table table; private final AtomicInteger microBatches = new AtomicInteger(); + @Parameter(index = 3) + private Boolean async; + /** * test data to be used by multiple writes each write creates a snapshot and writes a list of * records @@ -197,8 +271,7 @@ public void testReadStreamWithMaxRows1() throws Exception { Trigger.AvailableNow()); // soft limit of 1 is being enforced, the stream is not blocked. 
- StreamingQuery query = - startStream(ImmutableMap.of(SparkReadOptions.STREAMING_MAX_ROWS_PER_MICRO_BATCH, "1")); + StreamingQuery query = startStream(SparkReadOptions.STREAMING_MAX_ROWS_PER_MICRO_BATCH, "1"); // check answer correctness only 1 record read the micro-batch will be stuck List actual = rowsAvailable(query); @@ -258,15 +331,41 @@ public void testReadStreamWithCompositeReadLimit() throws Exception { Trigger.AvailableNow()); } + @TestTemplate + public void testReadStreamWithLowAsyncQueuePreload() throws Exception { + appendDataAsMultipleSnapshots(TEST_DATA_MULTIPLE_SNAPSHOTS); + // Set low preload limits to test async queue behavior - background thread should load + // remaining data + + StreamingQuery query = + startStream( + ImmutableMap.of( + SparkReadOptions.ASYNC_QUEUE_PRELOAD_ROW_LIMIT, + "5", + SparkReadOptions.ASYNC_QUEUE_PRELOAD_FILE_LIMIT, + "5")); + + List actual = rowsAvailable(query); + assertThat(actual) + .containsExactlyInAnyOrderElementsOf(Iterables.concat(TEST_DATA_MULTIPLE_SNAPSHOTS)); + } + @TestTemplate public void testAvailableNowStreamReadShouldNotHangOrReprocessData() throws Exception { File writerCheckpointFolder = temp.resolve("writer-checkpoint-folder").toFile(); File writerCheckpoint = new File(writerCheckpointFolder, "writer-checkpoint"); File output = temp.resolve("junit").toFile(); + Map options = Maps.newHashMap(); + options.put(SparkReadOptions.ASYNC_MICRO_BATCH_PLANNING_ENABLED, async.toString()); + if (async) { + options.put(SparkReadOptions.STREAMING_SNAPSHOT_POLLING_INTERVAL_MS, "1"); + } + DataStreamWriter querySource = spark .readStream() + .options(options) .format("iceberg") .load(tableName) .writeStream() @@ -321,10 +420,17 @@ public void testTriggerAvailableNowDoesNotProcessNewDataWhileRunning() throws Ex long expectedSnapshotId = table.currentSnapshot().snapshotId(); String sinkTable = "availablenow_sink"; + Map options = Maps.newHashMap(); + options.put(SparkReadOptions.STREAMING_MAX_FILES_PER_MICRO_BATCH, 
"1"); + options.put(SparkReadOptions.ASYNC_MICRO_BATCH_PLANNING_ENABLED, async.toString()); + if (async) { + options.put(SparkReadOptions.STREAMING_SNAPSHOT_POLLING_INTERVAL_MS, "1"); + } + StreamingQuery query = spark .readStream() - .option(SparkReadOptions.STREAMING_MAX_FILES_PER_MICRO_BATCH, "1") + .options(options) .format("iceberg") .load(tableName) .writeStream() @@ -366,6 +472,142 @@ public void testTriggerAvailableNowDoesNotProcessNewDataWhileRunning() throws Ex assertThat(actualResults).containsExactlyInAnyOrderElementsOf(Iterables.concat(expectedData)); } + @TestTemplate + public void testTriggerAvailableNowCapsAsyncPreloadAfterPrepare() { + List> initialData = + List.of(List.of(new SimpleRecord(1, "one")), List.of(new SimpleRecord(2, "two"))); + appendDataAsMultipleSnapshots(initialData); + + table.refresh(); + long expectedCapSnapshotId = table.currentSnapshot().snapshotId(); + + SparkMicroBatchStream stream = + new SparkMicroBatchStream( + JavaSparkContext.fromSparkContext(spark.sparkContext()), + table, + table::io, + new SparkReadConf( + spark, + table, + ImmutableMap.of( + SparkReadOptions.ASYNC_MICRO_BATCH_PLANNING_ENABLED, + async.toString(), + SparkReadOptions.STREAMING_MAX_FILES_PER_MICRO_BATCH, + "1", + SparkReadOptions.STREAMING_SNAPSHOT_POLLING_INTERVAL_MS, + "1", + SparkReadOptions.ASYNC_QUEUE_PRELOAD_FILE_LIMIT, + "10", + SparkReadOptions.ASYNC_QUEUE_PRELOAD_ROW_LIMIT, + "10")), + table.schema(), + temp.resolve("available-now-cap-checkpoint").toString()); + + try { + stream.prepareForTriggerAvailableNow(); + + appendData(List.of(new SimpleRecord(3, "three"))); + + Offset startOffset = stream.initialOffset(); + Offset firstEndOffset = stream.latestOffset(startOffset, stream.getDefaultReadLimit()); + assertThat(firstEndOffset).isNotNull(); + stream.planInputPartitions(startOffset, firstEndOffset); + + Offset secondEndOffset = stream.latestOffset(firstEndOffset, stream.getDefaultReadLimit()); + assertThat(secondEndOffset).isNotNull(); + 
stream.planInputPartitions(firstEndOffset, secondEndOffset); + + assertThat(stream.latestOffset(secondEndOffset, stream.getDefaultReadLimit())).isNull(); + assertThat(((StreamingOffset) secondEndOffset).snapshotId()).isEqualTo(expectedCapSnapshotId); + } finally { + stream.stop(); + } + } + + @TestTemplate + public void testLatestOffsetReturnsNullAfterFinalBatchIsConsumed() throws Exception { + appendDataAsMultipleSnapshots(TEST_DATA_MULTIPLE_SNAPSHOTS); + + table.refresh(); + int expectedBatchCount; + try (CloseableIterable tasks = table.newScan().planFiles()) { + expectedBatchCount = Iterables.size(tasks); + } + + SparkMicroBatchStream stream = + newMicroBatchStream( + ImmutableMap.of(SparkReadOptions.STREAMING_MAX_FILES_PER_MICRO_BATCH, "1"), + "drain-to-null-checkpoint"); + + try { + int plannedBatchCount = 0; + Offset startOffset = stream.initialOffset(); + Offset endOffset = stream.latestOffset(startOffset, stream.getDefaultReadLimit()); + while (endOffset != null) { + InputPartition[] partitions = stream.planInputPartitions(startOffset, endOffset); + assertThat(partitions).isNotEmpty(); + plannedBatchCount += 1; + startOffset = endOffset; + endOffset = stream.latestOffset(startOffset, stream.getDefaultReadLimit()); + } + + assertThat(endOffset).isNull(); + assertThat(plannedBatchCount).isEqualTo(expectedBatchCount); + } finally { + stream.stop(); + } + } + + @TestTemplate + public void testPlanInputPartitionsIsIdempotentForSameOffsets() { + appendDataAsMultipleSnapshots(TEST_DATA_MULTIPLE_SNAPSHOTS); + + SparkMicroBatchStream stream = + newMicroBatchStream( + ImmutableMap.of(SparkReadOptions.STREAMING_MAX_FILES_PER_MICRO_BATCH, "1"), + "idempotent-plan-files-checkpoint"); + + try { + Offset startOffset = stream.initialOffset(); + Offset endOffset = stream.latestOffset(startOffset, stream.getDefaultReadLimit()); + + assertThat(endOffset).isNotNull(); + + InputPartition[] firstPartitions = stream.planInputPartitions(startOffset, endOffset); + InputPartition[] 
secondPartitions = stream.planInputPartitions(startOffset, endOffset); + + List firstFileLocations = Lists.newArrayList(); + for (InputPartition partition : firstPartitions) { + SparkInputPartition sparkInputPartition = (SparkInputPartition) partition; + for (FileScanTask task : sparkInputPartition.taskGroup().tasks()) { + firstFileLocations.add(task.file().location()); + } + } + + List secondFileLocations = Lists.newArrayList(); + for (InputPartition partition : secondPartitions) { + SparkInputPartition sparkInputPartition = (SparkInputPartition) partition; + for (FileScanTask task : sparkInputPartition.taskGroup().tasks()) { + secondFileLocations.add(task.file().location()); + } + } + + assertThat(firstFileLocations).containsExactlyInAnyOrderElementsOf(secondFileLocations); + + startOffset = endOffset; + endOffset = stream.latestOffset(startOffset, stream.getDefaultReadLimit()); + while (endOffset != null) { + assertThat(stream.planInputPartitions(startOffset, endOffset)).isNotEmpty(); + startOffset = endOffset; + endOffset = stream.latestOffset(startOffset, stream.getDefaultReadLimit()); + } + + assertThat(endOffset).isNull(); + } finally { + stream.stop(); + } + } + @TestTemplate public void testReadStreamOnIcebergThenAddData() throws Exception { List> expected = TEST_DATA_MULTIPLE_SNAPSHOTS; @@ -433,6 +675,8 @@ public void testReadingStreamFromFutureTimetsamp() throws Exception { // Data appended after the timestamp should appear appendData(data); + // Allow async background thread to refresh, else test sometimes fails + Thread.sleep(50); actual = rowsAvailable(query); assertThat(actual).containsExactlyInAnyOrderElementsOf(data); } @@ -885,13 +1129,18 @@ private void appendData(List data, String format) { private static final String MEMORY_TABLE = "_stream_view_mem"; private StreamingQuery startStream(Map options) throws TimeoutException { + Map allOptions = Maps.newHashMap(options); + allOptions.put(SparkReadOptions.ASYNC_MICRO_BATCH_PLANNING_ENABLED, 
async.toString()); + if (async) { + allOptions.putIfAbsent(SparkReadOptions.STREAMING_SNAPSHOT_POLLING_INTERVAL_MS, "1"); + } return spark .readStream() - .options(options) + .options(allOptions) .format("iceberg") .load(tableName) .writeStream() - .options(options) + .options(allOptions) .format("memory") .queryName(MEMORY_TABLE) .outputMode(OutputMode.Append()) @@ -916,11 +1165,17 @@ private void assertMicroBatchRecordSizes( private void assertMicroBatchRecordSizes( Map options, List expectedMicroBatchRecordSize, Trigger trigger) throws TimeoutException { - Dataset ds = spark.readStream().options(options).format("iceberg").load(tableName); + Map allOptions = Maps.newHashMap(options); + allOptions.put(SparkReadOptions.ASYNC_MICRO_BATCH_PLANNING_ENABLED, async.toString()); + if (async) { + allOptions.putIfAbsent(SparkReadOptions.STREAMING_SNAPSHOT_POLLING_INTERVAL_MS, "1"); + } + + Dataset ds = spark.readStream().options(allOptions).format("iceberg").load(tableName); List syncList = Collections.synchronizedList(Lists.newArrayList()); ds.writeStream() - .options(options) + .options(allOptions) .trigger(trigger) .foreachBatch( (VoidFunction2, Long>) @@ -941,4 +1196,21 @@ private List rowsAvailable(StreamingQuery query) { .as(Encoders.bean(SimpleRecord.class)) .collectAsList(); } + + private SparkMicroBatchStream newMicroBatchStream( + Map options, String checkpointDirName) { + Map allOptions = Maps.newHashMap(options); + allOptions.put(SparkReadOptions.ASYNC_MICRO_BATCH_PLANNING_ENABLED, async.toString()); + if (async) { + allOptions.putIfAbsent(SparkReadOptions.STREAMING_SNAPSHOT_POLLING_INTERVAL_MS, "1"); + } + + return new SparkMicroBatchStream( + JavaSparkContext.fromSparkContext(spark.sparkContext()), + table, + table::io, + new SparkReadConf(spark, table, allOptions), + table.schema(), + temp.resolve(checkpointDirName).toString()); + } } From b07435ba4a5b93312a71a8541c60cbdd7aa324fa Mon Sep 17 00:00:00 2001 From: Talat UYARER Date: Wed, 29 Apr 2026 18:45:28 
-0700 Subject: [PATCH 130/197] Site: Remove Iceberg Summit 2026 section as the event has passed (#16166) --- site/docs/assets/stylesheets/home.css | 50 --------------------------- site/overrides/home.html | 49 -------------------------- 2 files changed, 99 deletions(-) diff --git a/site/docs/assets/stylesheets/home.css b/site/docs/assets/stylesheets/home.css index 1c45c0e72025..98166fdfc0d2 100644 --- a/site/docs/assets/stylesheets/home.css +++ b/site/docs/assets/stylesheets/home.css @@ -30,42 +30,6 @@ text-align: center; /* Center text horizontally */ } -/* Summit Box Styles */ -.summit-box { - margin: 20px auto 15px auto; - padding: 10px 15px 15px 15px; - max-width: 600px; - background: rgba(255, 255, 255, 0.05); - border-radius: 12px; - border: 2px solid rgba(255, 255, 255, 0.2); - box-shadow: 0 4px 15px rgba(0, 0, 0, 0.1); - backdrop-filter: blur(10px); -} - -.summit-box h4 { - text-align: center; - font-weight: 600; - font-size: 20px; - margin-top: 0; - margin-bottom: 10px; -} - -.summit-link-item { - transition: transform 0.3s ease, box-shadow 0.3s ease; - box-shadow: 0 3px 8px rgba(0, 0, 0, 0.2); -} - -.summit-link-item:hover { - transform: translateY(-3px); - box-shadow: 0 6px 15px rgba(0, 0, 0, 0.3); -} - -.summit-link-item a { - display: block; - width: 100%; - height: 100%; -} - /* Media query for smaller screens */ @media (max-width: 767px) { .col-6 { @@ -78,18 +42,4 @@ div#termynal-expressive-sql { left: 0; } - - .summit-box { - max-width: 100%; - margin: 15px 10px; - } - - .summit-links { - flex-direction: column !important; - } - - .summit-link-item { - min-width: 100% !important; - max-width: 100% !important; - } } diff --git a/site/overrides/home.html b/site/overrides/home.html index 8d6e49176963..65d971e0a134 100644 --- a/site/overrides/home.html +++ b/site/overrides/home.html @@ -37,27 +37,6 @@

    Apache Iceberg™

    The open table format for analytic datasets.


    - - -
      {% for social in config.extra.social %}
    • @@ -331,34 +310,6 @@

      Data Compaction

      src="assets/javascript/termynal.js" data-termynal-container="#termynal|#termynal-data-compaction|#termynal-expressive-sql|#termynal-time-travel"> - - - {% endblock %} {% block content %} From 6869adba2b869150778ce42d288c85a614d18f86 Mon Sep 17 00:00:00 2001 From: Anoop Johnson Date: Wed, 29 Apr 2026 21:31:20 -0700 Subject: [PATCH 131/197] Core: Add builders for v4 structs (#16092) Co-authored-by: Eduard Tudenhoefner --- .../apache/iceberg/DeletionVectorStruct.java | 50 +++ .../apache/iceberg/ManifestInfoStruct.java | 157 ++++++++++ .../org/apache/iceberg/TrackingStruct.java | 102 +++++- .../iceberg/TestDeletionVectorStruct.java | 89 ++++-- .../iceberg/TestManifestInfoStruct.java | 296 +++++++++++++++--- .../apache/iceberg/TestTrackedFileStruct.java | 67 ++-- .../apache/iceberg/TestTrackingStruct.java | 87 ++--- 7 files changed, 705 insertions(+), 143 deletions(-) diff --git a/core/src/main/java/org/apache/iceberg/DeletionVectorStruct.java b/core/src/main/java/org/apache/iceberg/DeletionVectorStruct.java index 389036ce237b..0eb7c2fe1eb6 100644 --- a/core/src/main/java/org/apache/iceberg/DeletionVectorStruct.java +++ b/core/src/main/java/org/apache/iceberg/DeletionVectorStruct.java @@ -21,6 +21,7 @@ import java.io.Serializable; import org.apache.iceberg.avro.SupportsIndexProjection; import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.types.Types; /** Mutable {@link StructLike} implementation of {@link DeletionVector}. 
*/ @@ -49,6 +50,14 @@ private DeletionVectorStruct(DeletionVectorStruct toCopy) { this.cardinality = toCopy.cardinality; } + private DeletionVectorStruct(String location, long offset, long sizeInBytes, long cardinality) { + super(BASE_TYPE, BASE_TYPE); + this.location = location; + this.offset = offset; + this.sizeInBytes = sizeInBytes; + this.cardinality = cardinality; + } + @Override public String location() { return location; @@ -115,6 +124,10 @@ protected void internalSet(int pos, T value) { } } + static Builder builder() { + return new Builder(); + } + @Override public String toString() { return MoreObjects.toStringHelper(this) @@ -124,4 +137,41 @@ public String toString() { .add("cardinality", cardinality) .toString(); } + + static class Builder { + private String location = null; + private long offset = -1L; + private long sizeInBytes = -1L; + private long cardinality = -1L; + + Builder location(String dvLocation) { + this.location = dvLocation; + return this; + } + + Builder offset(long dvOffset) { + this.offset = dvOffset; + return this; + } + + Builder sizeInBytes(long dvSizeInBytes) { + this.sizeInBytes = dvSizeInBytes; + return this; + } + + Builder cardinality(long dvCardinality) { + this.cardinality = dvCardinality; + return this; + } + + DeletionVectorStruct build() { + Preconditions.checkArgument(location != null, "Invalid location: null"); + Preconditions.checkArgument(offset >= 0, "Invalid offset: %s (must be >= 0)", offset); + Preconditions.checkArgument( + sizeInBytes >= 0, "Invalid size in bytes: %s (must be >= 0)", sizeInBytes); + Preconditions.checkArgument( + cardinality >= 0, "Invalid cardinality: %s (must be >= 0)", cardinality); + return new DeletionVectorStruct(location, offset, sizeInBytes, cardinality); + } + } } diff --git a/core/src/main/java/org/apache/iceberg/ManifestInfoStruct.java b/core/src/main/java/org/apache/iceberg/ManifestInfoStruct.java index 8f51df749e33..922047bffedd 100644 --- 
a/core/src/main/java/org/apache/iceberg/ManifestInfoStruct.java +++ b/core/src/main/java/org/apache/iceberg/ManifestInfoStruct.java @@ -23,6 +23,7 @@ import java.util.Arrays; import org.apache.iceberg.avro.SupportsIndexProjection; import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.types.Types; import org.apache.iceberg.util.ByteBuffers; @@ -73,6 +74,32 @@ private ManifestInfoStruct(ManifestInfoStruct toCopy) { this.dvCardinality = toCopy.dvCardinality; } + private ManifestInfoStruct( + int addedFilesCount, + int existingFilesCount, + int deletedFilesCount, + int replacedFilesCount, + long addedRowsCount, + long existingRowsCount, + long deletedRowsCount, + long replacedRowsCount, + long minSequenceNumber, + byte[] dv, + Long dvCardinality) { + super(BASE_TYPE, BASE_TYPE); + this.addedFilesCount = addedFilesCount; + this.existingFilesCount = existingFilesCount; + this.deletedFilesCount = deletedFilesCount; + this.replacedFilesCount = replacedFilesCount; + this.addedRowsCount = addedRowsCount; + this.existingRowsCount = existingRowsCount; + this.deletedRowsCount = deletedRowsCount; + this.replacedRowsCount = replacedRowsCount; + this.minSequenceNumber = minSequenceNumber; + this.dv = dv; + this.dvCardinality = dvCardinality; + } + @Override public int addedFilesCount() { return addedFilesCount; @@ -208,6 +235,10 @@ protected void internalSet(int pos, T value) { } } + static Builder builder() { + return new Builder(); + } + @Override public String toString() { return MoreObjects.toStringHelper(this) @@ -224,4 +255,130 @@ public String toString() { .add("dv_cardinality", dvCardinality == null ? 
"null" : dvCardinality) .toString(); } + + static class Builder { + private int addedFilesCount = -1; + private int existingFilesCount = -1; + private int deletedFilesCount = -1; + private int replacedFilesCount = -1; + private long addedRowsCount = -1L; + private long existingRowsCount = -1L; + private long deletedRowsCount = -1L; + private long replacedRowsCount = -1L; + private long minSequenceNumber = -1L; + private byte[] dv = null; + private Long dvCardinality = null; + + Builder addedFilesCount(int count) { + this.addedFilesCount = count; + return this; + } + + Builder existingFilesCount(int count) { + this.existingFilesCount = count; + return this; + } + + Builder deletedFilesCount(int count) { + this.deletedFilesCount = count; + return this; + } + + Builder replacedFilesCount(int count) { + this.replacedFilesCount = count; + return this; + } + + Builder addedRowsCount(long count) { + this.addedRowsCount = count; + return this; + } + + Builder existingRowsCount(long count) { + this.existingRowsCount = count; + return this; + } + + Builder deletedRowsCount(long count) { + this.deletedRowsCount = count; + return this; + } + + Builder replacedRowsCount(long count) { + this.replacedRowsCount = count; + return this; + } + + Builder minSequenceNumber(long sequenceNumber) { + this.minSequenceNumber = sequenceNumber; + return this; + } + + Builder dv(ByteBuffer buffer) { + this.dv = buffer != null ? 
ByteBuffers.toByteArray(buffer) : null; + return this; + } + + Builder dv(byte[] buffer) { + this.dv = buffer; + return this; + } + + Builder dvCardinality(Long cardinality) { + this.dvCardinality = cardinality; + return this; + } + + ManifestInfoStruct build() { + Preconditions.checkArgument( + addedFilesCount >= 0, "Invalid added files count: %s (must be >= 0)", addedFilesCount); + Preconditions.checkArgument( + existingFilesCount >= 0, + "Invalid existing files count: %s (must be >= 0)", + existingFilesCount); + Preconditions.checkArgument( + deletedFilesCount >= 0, + "Invalid deleted files count: %s (must be >= 0)", + deletedFilesCount); + Preconditions.checkArgument( + replacedFilesCount >= 0, + "Invalid replaced files count: %s (must be >= 0)", + replacedFilesCount); + Preconditions.checkArgument( + addedRowsCount >= 0, "Invalid added rows count: %s (must be >= 0)", addedRowsCount); + Preconditions.checkArgument( + existingRowsCount >= 0, + "Invalid existing rows count: %s (must be >= 0)", + existingRowsCount); + Preconditions.checkArgument( + deletedRowsCount >= 0, "Invalid deleted rows count: %s (must be >= 0)", deletedRowsCount); + Preconditions.checkArgument( + replacedRowsCount >= 0, + "Invalid replaced rows count: %s (must be >= 0)", + replacedRowsCount); + Preconditions.checkArgument( + minSequenceNumber >= 0, + "Invalid min sequence number: %s (must be >= 0)", + minSequenceNumber); + Preconditions.checkArgument( + (dv == null) == (dvCardinality == null), + "Invalid DV and cardinality: must both be null or non-null"); + Preconditions.checkArgument( + dvCardinality == null || dvCardinality > 0, + "Invalid DV cardinality: %s (must be positive)", + dvCardinality); + return new ManifestInfoStruct( + addedFilesCount, + existingFilesCount, + deletedFilesCount, + replacedFilesCount, + addedRowsCount, + existingRowsCount, + deletedRowsCount, + replacedRowsCount, + minSequenceNumber, + dv, + dvCardinality); + } + } } diff --git 
a/core/src/main/java/org/apache/iceberg/TrackingStruct.java b/core/src/main/java/org/apache/iceberg/TrackingStruct.java index a8624aad15c1..65513c8d4a7c 100644 --- a/core/src/main/java/org/apache/iceberg/TrackingStruct.java +++ b/core/src/main/java/org/apache/iceberg/TrackingStruct.java @@ -59,10 +59,6 @@ class TrackingStruct extends SupportsIndexProjection implements Tracking, Serial super(BASE_TYPE, type); } - TrackingStruct() { - super(BASE_TYPE.fields().size()); - } - private TrackingStruct(TrackingStruct toCopy) { super(toCopy); this.status = toCopy.status; @@ -83,6 +79,26 @@ private TrackingStruct(TrackingStruct toCopy) { this.manifestPos = toCopy.manifestPos; } + private TrackingStruct( + EntryStatus status, + Long snapshotId, + Long dataSequenceNumber, + Long fileSequenceNumber, + Long dvSnapshotId, + Long firstRowId, + byte[] deletedPositions, + byte[] replacedPositions) { + super(BASE_TYPE, BASE_TYPE); + this.status = status; + this.snapshotId = snapshotId; + this.dataSequenceNumber = dataSequenceNumber; + this.fileSequenceNumber = fileSequenceNumber; + this.dvSnapshotId = dvSnapshotId; + this.firstRowId = firstRowId; + this.deletedPositions = deletedPositions; + this.replacedPositions = replacedPositions; + } + void inheritFrom(Tracking manifestTracking) { if (manifestTracking != null) { if (snapshotId == null) { @@ -233,6 +249,10 @@ protected void internalSet(int pos, T value) { } } + static Builder builder() { + return new Builder(); + } + @Override public String toString() { return MoreObjects.toStringHelper(this) @@ -246,4 +266,78 @@ public String toString() { .add("replaced_positions", replacedPositions == null ? 
"null" : "(binary)") .toString(); } + + static class Builder { + private EntryStatus status = null; + private Long snapshotId = null; + private Long dataSequenceNumber = null; + private Long fileSequenceNumber = null; + private Long dvSnapshotId = null; + private Long firstRowId = null; + private byte[] deletedPositions = null; + private byte[] replacedPositions = null; + + Builder status(EntryStatus entryStatus) { + this.status = entryStatus; + return this; + } + + Builder snapshotId(Long id) { + this.snapshotId = id; + return this; + } + + Builder dataSequenceNumber(Long sequenceNumber) { + this.dataSequenceNumber = sequenceNumber; + return this; + } + + Builder fileSequenceNumber(Long sequenceNumber) { + this.fileSequenceNumber = sequenceNumber; + return this; + } + + Builder dvSnapshotId(Long id) { + this.dvSnapshotId = id; + return this; + } + + Builder firstRowId(Long rowId) { + this.firstRowId = rowId; + return this; + } + + Builder deletedPositions(ByteBuffer positions) { + this.deletedPositions = positions != null ? ByteBuffers.toByteArray(positions) : null; + return this; + } + + Builder deletedPositions(byte[] positions) { + this.deletedPositions = positions; + return this; + } + + Builder replacedPositions(ByteBuffer positions) { + this.replacedPositions = positions != null ? 
ByteBuffers.toByteArray(positions) : null; + return this; + } + + Builder replacedPositions(byte[] positions) { + this.replacedPositions = positions; + return this; + } + + TrackingStruct build() { + Preconditions.checkArgument(status != null, "Invalid status: null"); + return new TrackingStruct( + status, + snapshotId, + dataSequenceNumber, + fileSequenceNumber, + dvSnapshotId, + firstRowId, + deletedPositions, + replacedPositions); + } + } } diff --git a/core/src/test/java/org/apache/iceberg/TestDeletionVectorStruct.java b/core/src/test/java/org/apache/iceberg/TestDeletionVectorStruct.java index 5ab6b1f3586c..325f9afd9ca9 100644 --- a/core/src/test/java/org/apache/iceberg/TestDeletionVectorStruct.java +++ b/core/src/test/java/org/apache/iceberg/TestDeletionVectorStruct.java @@ -19,6 +19,7 @@ package org.apache.iceberg; import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; import java.io.IOException; import org.apache.iceberg.types.Types; @@ -28,12 +29,13 @@ class TestDeletionVectorStruct { @Test void testFieldAccess() { - DeletionVectorStruct dv = new DeletionVectorStruct(DeletionVector.schema()); - - dv.set(0, "s3://bucket/data/dv.puffin"); - dv.set(1, 256L); - dv.set(2, 128L); - dv.set(3, 42L); + DeletionVectorStruct dv = + DeletionVectorStruct.builder() + .location("s3://bucket/data/dv.puffin") + .offset(256L) + .sizeInBytes(128L) + .cardinality(42L) + .build(); assertThat(dv.location()).isEqualTo("s3://bucket/data/dv.puffin"); assertThat(dv.offset()).isEqualTo(256L); @@ -43,12 +45,13 @@ void testFieldAccess() { @Test void testCopy() { - DeletionVectorStruct dv = new DeletionVectorStruct(DeletionVector.schema()); - - dv.set(0, "s3://bucket/data/dv.puffin"); - dv.set(1, 256L); - dv.set(2, 128L); - dv.set(3, 42L); + DeletionVectorStruct dv = + DeletionVectorStruct.builder() + .location("s3://bucket/data/dv.puffin") + .offset(256L) + .sizeInBytes(128L) + .cardinality(42L) + .build(); 
DeletionVectorStruct copy = dv.copy(); @@ -86,11 +89,13 @@ void testProjectedStructLike() { @Test void testJavaSerializationRoundTrip() throws IOException, ClassNotFoundException { - DeletionVectorStruct dv = new DeletionVectorStruct(DeletionVector.schema()); - dv.set(0, "s3://bucket/data/dv.puffin"); - dv.set(1, 256L); - dv.set(2, 128L); - dv.set(3, 42L); + DeletionVectorStruct dv = + DeletionVectorStruct.builder() + .location("s3://bucket/data/dv.puffin") + .offset(256L) + .sizeInBytes(128L) + .cardinality(42L) + .build(); DeletionVectorStruct deserialized = TestHelpers.roundTripSerialize(dv); @@ -100,13 +105,53 @@ void testJavaSerializationRoundTrip() throws IOException, ClassNotFoundException assertThat(deserialized.cardinality()).isEqualTo(42L); } + @Test + void testBuilderValidation() { + assertThatThrownBy( + () -> DeletionVectorStruct.builder().offset(0).sizeInBytes(1).cardinality(1).build()) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Invalid location: null"); + + assertThatThrownBy( + () -> + DeletionVectorStruct.builder() + .location("s3://bucket/dv.puffin") + .sizeInBytes(1) + .cardinality(1) + .build()) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Invalid offset: -1 (must be >= 0)"); + + assertThatThrownBy( + () -> + DeletionVectorStruct.builder() + .location("s3://bucket/dv.puffin") + .offset(0) + .cardinality(1) + .build()) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Invalid size in bytes: -1 (must be >= 0)"); + + assertThatThrownBy( + () -> + DeletionVectorStruct.builder() + .location("s3://bucket/dv.puffin") + .offset(0) + .sizeInBytes(1) + .build()) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Invalid cardinality: -1 (must be >= 0)"); + } + @Test void testKryoSerializationRoundTrip() throws IOException { - DeletionVectorStruct dv = new DeletionVectorStruct(DeletionVector.schema()); - dv.set(0, "s3://bucket/data/dv.puffin"); - dv.set(1, 256L); - dv.set(2, 128L); - dv.set(3, 
42L); + DeletionVectorStruct dv = + DeletionVectorStruct.builder() + .location("s3://bucket/data/dv.puffin") + .offset(256L) + .sizeInBytes(128L) + .cardinality(42L) + .build(); DeletionVectorStruct deserialized = TestHelpers.KryoHelpers.roundTripSerialize(dv); diff --git a/core/src/test/java/org/apache/iceberg/TestManifestInfoStruct.java b/core/src/test/java/org/apache/iceberg/TestManifestInfoStruct.java index 23917de9cd40..3a694f1a38f2 100644 --- a/core/src/test/java/org/apache/iceberg/TestManifestInfoStruct.java +++ b/core/src/test/java/org/apache/iceberg/TestManifestInfoStruct.java @@ -19,6 +19,7 @@ package org.apache.iceberg; import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; import java.io.IOException; import java.nio.ByteBuffer; @@ -58,19 +59,20 @@ void testFieldAccess() { @Test void testCopy() { - ManifestInfoStruct info = new ManifestInfoStruct(ManifestInfo.schema()); - - info.set(0, 10); - info.set(1, 20); - info.set(2, 3); - info.set(3, 2); - info.set(4, 1000L); - info.set(5, 2000L); - info.set(6, 300L); - info.set(7, 200L); - info.set(8, 5L); - info.set(9, ByteBuffer.wrap(new byte[] {0xF})); - info.set(10, 1L); + ManifestInfoStruct info = + ManifestInfoStruct.builder() + .addedFilesCount(10) + .existingFilesCount(20) + .deletedFilesCount(3) + .replacedFilesCount(2) + .addedRowsCount(1000L) + .existingRowsCount(2000L) + .deletedRowsCount(300L) + .replacedRowsCount(200L) + .minSequenceNumber(5L) + .dv(new byte[] {0xF}) + .dvCardinality(1L) + .build(); ManifestInfoStruct copy = info.copy(); @@ -91,17 +93,18 @@ void testCopy() { @Test void testNullableFields() { - ManifestInfoStruct info = new ManifestInfoStruct(ManifestInfo.schema()); - - info.set(0, 0); - info.set(1, 0); - info.set(2, 0); - info.set(3, 0); - info.set(4, 0L); - info.set(5, 0L); - info.set(6, 0L); - info.set(7, 0L); - info.set(8, 0L); + ManifestInfoStruct info = + ManifestInfoStruct.builder() + .addedFilesCount(0) + 
.existingFilesCount(0) + .deletedFilesCount(0) + .replacedFilesCount(0) + .addedRowsCount(0L) + .existingRowsCount(0L) + .deletedRowsCount(0L) + .replacedRowsCount(0L) + .minSequenceNumber(0L) + .build(); assertThat(info.dv()).isNull(); assertThat(info.dvCardinality()).isNull(); @@ -129,18 +132,20 @@ void testProjectedStructLike() { @Test void testJavaSerializationRoundTrip() throws IOException, ClassNotFoundException { - ManifestInfoStruct info = new ManifestInfoStruct(ManifestInfo.schema()); - info.set(0, 10); - info.set(1, 20); - info.set(2, 3); - info.set(3, 2); - info.set(4, 1000L); - info.set(5, 2000L); - info.set(6, 300L); - info.set(7, 200L); - info.set(8, 5L); - info.set(9, ByteBuffer.wrap(new byte[] {0xF})); - info.set(10, 1L); + ManifestInfoStruct info = + ManifestInfoStruct.builder() + .addedFilesCount(10) + .existingFilesCount(20) + .deletedFilesCount(3) + .replacedFilesCount(2) + .addedRowsCount(1000L) + .existingRowsCount(2000L) + .deletedRowsCount(300L) + .replacedRowsCount(200L) + .minSequenceNumber(5L) + .dv(new byte[] {0xF}) + .dvCardinality(1L) + .build(); ManifestInfoStruct deserialized = TestHelpers.roundTripSerialize(info); @@ -157,20 +162,215 @@ void testJavaSerializationRoundTrip() throws IOException, ClassNotFoundException assertThat(deserialized.dvCardinality()).isEqualTo(1L); } + @Test + void testBuilderValidation() { + assertThatThrownBy( + () -> + ManifestInfoStruct.builder() + .existingFilesCount(0) + .deletedFilesCount(0) + .replacedFilesCount(0) + .addedRowsCount(0L) + .existingRowsCount(0L) + .deletedRowsCount(0L) + .replacedRowsCount(0L) + .minSequenceNumber(0L) + .build()) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Invalid added files count: -1 (must be >= 0)"); + + assertThatThrownBy( + () -> + ManifestInfoStruct.builder() + .addedFilesCount(0) + .deletedFilesCount(0) + .replacedFilesCount(0) + .addedRowsCount(0L) + .existingRowsCount(0L) + .deletedRowsCount(0L) + .replacedRowsCount(0L) + 
.minSequenceNumber(0L) + .build()) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Invalid existing files count: -1 (must be >= 0)"); + + assertThatThrownBy( + () -> + ManifestInfoStruct.builder() + .addedFilesCount(0) + .existingFilesCount(0) + .replacedFilesCount(0) + .addedRowsCount(0L) + .existingRowsCount(0L) + .deletedRowsCount(0L) + .replacedRowsCount(0L) + .minSequenceNumber(0L) + .build()) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Invalid deleted files count: -1 (must be >= 0)"); + + assertThatThrownBy( + () -> + ManifestInfoStruct.builder() + .addedFilesCount(0) + .existingFilesCount(0) + .deletedFilesCount(0) + .addedRowsCount(0L) + .existingRowsCount(0L) + .deletedRowsCount(0L) + .replacedRowsCount(0L) + .minSequenceNumber(0L) + .build()) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Invalid replaced files count: -1 (must be >= 0)"); + + assertThatThrownBy( + () -> + ManifestInfoStruct.builder() + .addedFilesCount(0) + .existingFilesCount(0) + .deletedFilesCount(0) + .replacedFilesCount(0) + .existingRowsCount(0L) + .deletedRowsCount(0L) + .replacedRowsCount(0L) + .minSequenceNumber(0L) + .build()) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Invalid added rows count: -1 (must be >= 0)"); + + assertThatThrownBy( + () -> + ManifestInfoStruct.builder() + .addedFilesCount(0) + .existingFilesCount(0) + .deletedFilesCount(0) + .replacedFilesCount(0) + .addedRowsCount(0L) + .deletedRowsCount(0L) + .replacedRowsCount(0L) + .minSequenceNumber(0L) + .build()) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Invalid existing rows count: -1 (must be >= 0)"); + + assertThatThrownBy( + () -> + ManifestInfoStruct.builder() + .addedFilesCount(0) + .existingFilesCount(0) + .deletedFilesCount(0) + .replacedFilesCount(0) + .addedRowsCount(0L) + .existingRowsCount(0L) + .replacedRowsCount(0L) + .minSequenceNumber(0L) + .build()) + .isInstanceOf(IllegalArgumentException.class) + 
.hasMessage("Invalid deleted rows count: -1 (must be >= 0)"); + + assertThatThrownBy( + () -> + ManifestInfoStruct.builder() + .addedFilesCount(0) + .existingFilesCount(0) + .deletedFilesCount(0) + .replacedFilesCount(0) + .addedRowsCount(0L) + .existingRowsCount(0L) + .deletedRowsCount(0L) + .minSequenceNumber(0L) + .build()) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Invalid replaced rows count: -1 (must be >= 0)"); + + assertThatThrownBy( + () -> + ManifestInfoStruct.builder() + .addedFilesCount(0) + .existingFilesCount(0) + .deletedFilesCount(0) + .replacedFilesCount(0) + .addedRowsCount(0L) + .existingRowsCount(0L) + .deletedRowsCount(0L) + .replacedRowsCount(0L) + .build()) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Invalid min sequence number: -1 (must be >= 0)"); + } + + @Test + void testBuilderDvPairingValidation() { + assertThatThrownBy( + () -> + ManifestInfoStruct.builder() + .addedFilesCount(0) + .existingFilesCount(0) + .deletedFilesCount(0) + .replacedFilesCount(0) + .addedRowsCount(0L) + .existingRowsCount(0L) + .deletedRowsCount(0L) + .replacedRowsCount(0L) + .minSequenceNumber(0L) + .dv(new byte[] {0xF}) + .build()) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Invalid DV and cardinality: must both be null or non-null"); + + assertThatThrownBy( + () -> + ManifestInfoStruct.builder() + .addedFilesCount(0) + .existingFilesCount(0) + .deletedFilesCount(0) + .replacedFilesCount(0) + .addedRowsCount(0L) + .existingRowsCount(0L) + .deletedRowsCount(0L) + .replacedRowsCount(0L) + .minSequenceNumber(0L) + .dvCardinality(1L) + .build()) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Invalid DV and cardinality: must both be null or non-null"); + + assertThatThrownBy( + () -> + ManifestInfoStruct.builder() + .addedFilesCount(0) + .existingFilesCount(0) + .deletedFilesCount(0) + .replacedFilesCount(0) + .addedRowsCount(0L) + .existingRowsCount(0L) + .deletedRowsCount(0L) + 
.replacedRowsCount(0L) + .minSequenceNumber(0L) + .dv(new byte[] {0xF}) + .dvCardinality(0L) + .build()) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Invalid DV cardinality: 0 (must be positive)"); + } + @Test void testKryoSerializationRoundTrip() throws IOException { - ManifestInfoStruct info = new ManifestInfoStruct(ManifestInfo.schema()); - info.set(0, 10); - info.set(1, 20); - info.set(2, 3); - info.set(3, 2); - info.set(4, 1000L); - info.set(5, 2000L); - info.set(6, 300L); - info.set(7, 200L); - info.set(8, 5L); - info.set(9, ByteBuffer.wrap(new byte[] {0xF})); - info.set(10, 1L); + ManifestInfoStruct info = + ManifestInfoStruct.builder() + .addedFilesCount(10) + .existingFilesCount(20) + .deletedFilesCount(3) + .replacedFilesCount(2) + .addedRowsCount(1000L) + .existingRowsCount(2000L) + .deletedRowsCount(300L) + .replacedRowsCount(200L) + .minSequenceNumber(5L) + .dv(new byte[] {0xF}) + .dvCardinality(1L) + .build(); ManifestInfoStruct deserialized = TestHelpers.KryoHelpers.roundTripSerialize(info); diff --git a/core/src/test/java/org/apache/iceberg/TestTrackedFileStruct.java b/core/src/test/java/org/apache/iceberg/TestTrackedFileStruct.java index 05013ae54e79..62324e5607ef 100644 --- a/core/src/test/java/org/apache/iceberg/TestTrackedFileStruct.java +++ b/core/src/test/java/org/apache/iceberg/TestTrackedFileStruct.java @@ -33,27 +33,27 @@ class TestTrackedFileStruct { @Test void testFieldAccess() { TrackedFileStruct file = new TrackedFileStruct(); - TrackingStruct tracking = new TrackingStruct(); - DeletionVectorStruct dv = new DeletionVectorStruct(DeletionVector.schema()); - ManifestInfoStruct info = new ManifestInfoStruct(ManifestInfo.schema()); - - tracking.set(0, EntryStatus.ADDED.id()); - tracking.set(1, 42L); - - dv.set(0, "s3://bucket/dv.puffin"); - dv.set(1, 100L); - dv.set(2, 50L); - dv.set(3, 5L); - - info.set(0, 10); - info.set(1, 20); - info.set(2, 3); - info.set(3, 2); - info.set(4, 1000L); - info.set(5, 2000L); - info.set(6, 
300L); - info.set(7, 200L); - info.set(8, 5L); + TrackingStruct tracking = + TrackingStruct.builder().status(EntryStatus.ADDED).snapshotId(42L).build(); + DeletionVectorStruct dv = + DeletionVectorStruct.builder() + .location("s3://bucket/dv.puffin") + .offset(100L) + .sizeInBytes(50L) + .cardinality(5L) + .build(); + ManifestInfoStruct info = + ManifestInfoStruct.builder() + .addedFilesCount(10) + .existingFilesCount(20) + .deletedFilesCount(3) + .replacedFilesCount(2) + .addedRowsCount(1000L) + .existingRowsCount(2000L) + .deletedRowsCount(300L) + .replacedRowsCount(200L) + .minSequenceNumber(5L) + .build(); file.set(0, tracking); file.set(1, FileContent.EQUALITY_DELETES.id()); @@ -90,8 +90,7 @@ void testFieldAccess() { void testReaderSideFields() { TrackedFileStruct file = new TrackedFileStruct(); - TrackingStruct tracking = new TrackingStruct(); - tracking.set(0, EntryStatus.ADDED.id()); + TrackingStruct tracking = TrackingStruct.builder().status(EntryStatus.ADDED).build(); tracking.setManifestLocation("s3://bucket/metadata/manifest.avro"); tracking.set(8, 7L); @@ -279,18 +278,22 @@ void testKryoSerializationRoundTrip() throws IOException { } static TrackedFileStruct createFullTrackedFile() { - TrackingStruct tracking = new TrackingStruct(); - tracking.set(0, EntryStatus.ADDED.id()); - tracking.set(1, 42L); - tracking.set(2, 10L); + TrackingStruct tracking = + TrackingStruct.builder() + .status(EntryStatus.ADDED) + .snapshotId(42L) + .dataSequenceNumber(10L) + .build(); tracking.setManifestLocation("s3://bucket/manifest.avro"); tracking.set(8, 3L); - DeletionVectorStruct dv = new DeletionVectorStruct(DeletionVector.schema()); - dv.set(0, "s3://bucket/dv.puffin"); - dv.set(1, 100L); - dv.set(2, 50L); - dv.set(3, 5L); + DeletionVectorStruct dv = + DeletionVectorStruct.builder() + .location("s3://bucket/dv.puffin") + .offset(100L) + .sizeInBytes(50L) + .cardinality(5L) + .build(); TrackedFileStruct file = new TrackedFileStruct( diff --git 
a/core/src/test/java/org/apache/iceberg/TestTrackingStruct.java b/core/src/test/java/org/apache/iceberg/TestTrackingStruct.java index 5af41d0dcf02..98a7eff2af45 100644 --- a/core/src/test/java/org/apache/iceberg/TestTrackingStruct.java +++ b/core/src/test/java/org/apache/iceberg/TestTrackingStruct.java @@ -53,12 +53,13 @@ void testFieldAccess() { @Test void testCopy() { - TrackingStruct tracking = new TrackingStruct(Tracking.schema()); - - tracking.set(0, EntryStatus.ADDED.id()); - tracking.set(1, 42L); - tracking.set(2, 10L); - tracking.set(6, ByteBuffer.wrap(new byte[] {1, 2})); + TrackingStruct tracking = + TrackingStruct.builder() + .status(EntryStatus.ADDED) + .snapshotId(42L) + .dataSequenceNumber(10L) + .deletedPositions(new byte[] {1, 2}) + .build(); TrackingStruct copy = tracking.copy(); @@ -98,8 +99,7 @@ void testIsLive() { @Test void testInheritSnapshotId() { - TrackingStruct tracking = new TrackingStruct(Tracking.schema()); - tracking.set(0, EntryStatus.ADDED.id()); + TrackingStruct tracking = TrackingStruct.builder().status(EntryStatus.ADDED).build(); tracking.inheritFrom(createManifestTracking(100L, 60L)); // snapshotId is null, should inherit from manifest @@ -108,8 +108,7 @@ void testInheritSnapshotId() { @Test void testInheritSequenceNumberForAddedEntries() { - TrackingStruct tracking = new TrackingStruct(Tracking.schema()); - tracking.set(0, EntryStatus.ADDED.id()); + TrackingStruct tracking = TrackingStruct.builder().status(EntryStatus.ADDED).build(); tracking.inheritFrom(createManifestTracking(100L, 60L)); // sequence numbers are null and status is ADDED, should inherit @@ -119,10 +118,12 @@ void testInheritSequenceNumberForAddedEntries() { @Test void testDoNotInheritSequenceNumberForExistingEntries() { - TrackingStruct tracking = new TrackingStruct(Tracking.schema()); - tracking.set(0, EntryStatus.EXISTING.id()); - tracking.set(2, 5L); - tracking.set(3, 6L); + TrackingStruct tracking = + TrackingStruct.builder() + .status(EntryStatus.EXISTING) 
+ .dataSequenceNumber(5L) + .fileSequenceNumber(6L) + .build(); tracking.inheritFrom(createManifestTracking(100L, 60L)); // sequence numbers are not inherited for EXISTING entries @@ -132,11 +133,13 @@ void testDoNotInheritSequenceNumberForExistingEntries() { @Test void testExplicitValuesOverrideInheritance() { - TrackingStruct tracking = new TrackingStruct(Tracking.schema()); - tracking.set(0, EntryStatus.ADDED.id()); - tracking.set(1, 200L); - tracking.set(2, 75L); - tracking.set(3, 76L); + TrackingStruct tracking = + TrackingStruct.builder() + .status(EntryStatus.ADDED) + .snapshotId(200L) + .dataSequenceNumber(75L) + .fileSequenceNumber(76L) + .build(); tracking.inheritFrom(createManifestTracking(100L, 60L)); // explicit values should take precedence @@ -163,8 +166,7 @@ void testInheritFromRejectsUnequalSequenceNumbers() { @Test void testNoDefaultingWithoutInheritance() { - TrackingStruct tracking = new TrackingStruct(Tracking.schema()); - tracking.set(0, EntryStatus.ADDED.id()); + TrackingStruct tracking = TrackingStruct.builder().status(EntryStatus.ADDED).build(); // no inheritance, nulls stay null assertThat(tracking.snapshotId()).isNull(); @@ -173,12 +175,19 @@ void testNoDefaultingWithoutInheritance() { } private static Tracking createManifestTracking(long snapshotId, long sequenceNumber) { - TrackingStruct tracking = new TrackingStruct(Tracking.schema()); - tracking.set(0, EntryStatus.ADDED.id()); - tracking.set(1, snapshotId); - tracking.set(2, sequenceNumber); - tracking.set(3, sequenceNumber); - return tracking; + return TrackingStruct.builder() + .status(EntryStatus.ADDED) + .snapshotId(snapshotId) + .dataSequenceNumber(sequenceNumber) + .fileSequenceNumber(sequenceNumber) + .build(); + } + + @Test + void testBuilderValidation() { + assertThatThrownBy(() -> TrackingStruct.builder().build()) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Invalid status: null"); } @Test @@ -202,11 +211,13 @@ void testProjectedStructLike() { @Test void 
testJavaSerializationRoundTrip() throws IOException, ClassNotFoundException { - TrackingStruct tracking = new TrackingStruct(Tracking.schema()); - tracking.set(0, EntryStatus.ADDED.id()); - tracking.set(1, 42L); - tracking.set(2, 10L); - tracking.set(6, ByteBuffer.wrap(new byte[] {1, 2})); + TrackingStruct tracking = + TrackingStruct.builder() + .status(EntryStatus.ADDED) + .snapshotId(42L) + .dataSequenceNumber(10L) + .deletedPositions(new byte[] {1, 2}) + .build(); TrackingStruct deserialized = TestHelpers.roundTripSerialize(tracking); @@ -218,11 +229,13 @@ void testJavaSerializationRoundTrip() throws IOException, ClassNotFoundException @Test void testKryoSerializationRoundTrip() throws IOException { - TrackingStruct tracking = new TrackingStruct(Tracking.schema()); - tracking.set(0, EntryStatus.ADDED.id()); - tracking.set(1, 42L); - tracking.set(2, 10L); - tracking.set(6, ByteBuffer.wrap(new byte[] {1, 2})); + TrackingStruct tracking = + TrackingStruct.builder() + .status(EntryStatus.ADDED) + .snapshotId(42L) + .dataSequenceNumber(10L) + .deletedPositions(new byte[] {1, 2}) + .build(); TrackingStruct deserialized = TestHelpers.KryoHelpers.roundTripSerialize(tracking); From 54c6433cbd08457cb43f8968efa5cb26917a3bef Mon Sep 17 00:00:00 2001 From: Anupam Yadav Date: Thu, 30 Apr 2026 04:55:51 -0700 Subject: [PATCH 132/197] Flink: Fix JdbcLockFactory to allow ClientPoolImpl connection retry (#16049) --- .../maintenance/api/JdbcLockFactory.java | 7 -- .../maintenance/api/TestJdbcLockFactory.java | 65 +++++++++++++++++++ .../maintenance/api/JdbcLockFactory.java | 7 -- .../maintenance/api/TestJdbcLockFactory.java | 65 +++++++++++++++++++ .../maintenance/api/JdbcLockFactory.java | 7 -- .../maintenance/api/TestJdbcLockFactory.java | 65 +++++++++++++++++++ 6 files changed, 195 insertions(+), 21 deletions(-) diff --git a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/JdbcLockFactory.java 
b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/JdbcLockFactory.java index f68605accc57..30e95b1edba0 100644 --- a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/JdbcLockFactory.java +++ b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/JdbcLockFactory.java @@ -260,10 +260,6 @@ public void unlock() { this, instanceId, count); - } catch (SQLException e) { - // SQL exception happened when deleting lock information - throw new UncheckedSQLException( - e, "Failed to delete %s lock with instanceId %s", this, instanceId); } return null; @@ -298,9 +294,6 @@ private String instanceId() { return null; } } - } catch (SQLException e) { - // SQL exception happened when getting lock information - throw new UncheckedSQLException(e, "Failed to get lock information for %s", type); } }); } catch (InterruptedException e) { diff --git a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestJdbcLockFactory.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestJdbcLockFactory.java index 3cb18ffbb77e..4d35792e440e 100644 --- a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestJdbcLockFactory.java +++ b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestJdbcLockFactory.java @@ -19,11 +19,18 @@ package org.apache.iceberg.flink.maintenance.api; import static org.apache.iceberg.flink.maintenance.api.JdbcLockFactory.INIT_LOCK_TABLES_PROPERTY; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; +import java.sql.SQLTransientConnectionException; import java.util.Map; import java.util.UUID; +import java.util.concurrent.atomic.AtomicInteger; import org.apache.iceberg.jdbc.JdbcCatalog; +import org.apache.iceberg.jdbc.JdbcClientPool; +import org.apache.iceberg.jdbc.UncheckedSQLException; import 
org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.junit.jupiter.api.Test; class TestJdbcLockFactory extends TestLockFactoryBase { @Override @@ -38,4 +45,62 @@ TriggerLockFactory lockFactory(String tableName) { tableName, properties); } + + @Test + void testSQLExceptionEnablesRetryInClientPool() throws Exception { + // Regression test for #15759: verify that removing the inner try-catch allows + // ClientPoolImpl to retry on transient connection failures. + // + // Before the fix: inner catch converted SQLException -> UncheckedSQLException + // (RuntimeException) inside the lambda. ClientPoolImpl only catches the declared + // exception type (SQLException), so RuntimeException bypasses retry entirely. + // After the fix: SQLException propagates naturally, ClientPoolImpl catches it, + // and retries on transient connection exceptions. + Map props = Maps.newHashMap(); + props.put("username", "user"); + props.put("password", "password"); + String uri = "jdbc:sqlite:file::memory:?ic" + UUID.randomUUID().toString().replace("-", ""); + + try (JdbcClientPool pool = new JdbcClientPool(1, uri, props)) { + AtomicInteger attempts = new AtomicInteger(0); + + String result = + pool.run( + conn -> { + if (attempts.incrementAndGet() == 1) { + throw new SQLTransientConnectionException("transient failure"); + } + + return "success"; + }); + + assertThat(result).isEqualTo("success"); + assertThat(attempts.get()).isGreaterThan(1); + } + } + + @Test + void testUncheckedSQLExceptionBypassesRetry() throws Exception { + // Companion test: demonstrates that wrapping SQLException as UncheckedSQLException + // (the OLD behavior before the fix) prevents ClientPoolImpl from retrying. 
+ Map props = Maps.newHashMap(); + props.put("username", "user"); + props.put("password", "password"); + String uri = "jdbc:sqlite:file::memory:?ic" + UUID.randomUUID().toString().replace("-", ""); + + try (JdbcClientPool pool = new JdbcClientPool(1, uri, props)) { + assertThatThrownBy( + () -> + pool.run( + conn -> { + try { + throw new SQLTransientConnectionException("transient failure"); + } catch (java.sql.SQLException e) { + throw new UncheckedSQLException(e, "wrapped"); + } + })) + .isInstanceOf(UncheckedSQLException.class) + .hasMessageContaining("wrapped"); + } + } } diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/JdbcLockFactory.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/JdbcLockFactory.java index f68605accc57..30e95b1edba0 100644 --- a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/JdbcLockFactory.java +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/JdbcLockFactory.java @@ -260,10 +260,6 @@ public void unlock() { this, instanceId, count); - } catch (SQLException e) { - // SQL exception happened when deleting lock information - throw new UncheckedSQLException( - e, "Failed to delete %s lock with instanceId %s", this, instanceId); } return null; @@ -298,9 +294,6 @@ private String instanceId() { return null; } } - } catch (SQLException e) { - // SQL exception happened when getting lock information - throw new UncheckedSQLException(e, "Failed to get lock information for %s", type); } }); } catch (InterruptedException e) { diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestJdbcLockFactory.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestJdbcLockFactory.java index 3cb18ffbb77e..4d35792e440e 100644 --- a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestJdbcLockFactory.java +++ 
b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestJdbcLockFactory.java @@ -19,11 +19,18 @@ package org.apache.iceberg.flink.maintenance.api; import static org.apache.iceberg.flink.maintenance.api.JdbcLockFactory.INIT_LOCK_TABLES_PROPERTY; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; +import java.sql.SQLTransientConnectionException; import java.util.Map; import java.util.UUID; +import java.util.concurrent.atomic.AtomicInteger; import org.apache.iceberg.jdbc.JdbcCatalog; +import org.apache.iceberg.jdbc.JdbcClientPool; +import org.apache.iceberg.jdbc.UncheckedSQLException; import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.junit.jupiter.api.Test; class TestJdbcLockFactory extends TestLockFactoryBase { @Override @@ -38,4 +45,62 @@ TriggerLockFactory lockFactory(String tableName) { tableName, properties); } + + @Test + void testSQLExceptionEnablesRetryInClientPool() throws Exception { + // Regression test for #15759: verify that removing the inner try-catch allows + // ClientPoolImpl to retry on transient connection failures. + // + // Before the fix: inner catch converted SQLException -> UncheckedSQLException + // (RuntimeException) inside the lambda. ClientPoolImpl only catches the declared + // exception type (SQLException), so RuntimeException bypasses retry entirely. + // After the fix: SQLException propagates naturally, ClientPoolImpl catches it, + // and retries on transient connection exceptions. 
+ Map props = Maps.newHashMap(); + props.put("username", "user"); + props.put("password", "password"); + String uri = "jdbc:sqlite:file::memory:?ic" + UUID.randomUUID().toString().replace("-", ""); + + try (JdbcClientPool pool = new JdbcClientPool(1, uri, props)) { + AtomicInteger attempts = new AtomicInteger(0); + + String result = + pool.run( + conn -> { + if (attempts.incrementAndGet() == 1) { + throw new SQLTransientConnectionException("transient failure"); + } + + return "success"; + }); + + assertThat(result).isEqualTo("success"); + assertThat(attempts.get()).isGreaterThan(1); + } + } + + @Test + void testUncheckedSQLExceptionBypassesRetry() throws Exception { + // Companion test: demonstrates that wrapping SQLException as UncheckedSQLException + // (the OLD behavior before the fix) prevents ClientPoolImpl from retrying. + Map props = Maps.newHashMap(); + props.put("username", "user"); + props.put("password", "password"); + String uri = "jdbc:sqlite:file::memory:?ic" + UUID.randomUUID().toString().replace("-", ""); + + try (JdbcClientPool pool = new JdbcClientPool(1, uri, props)) { + assertThatThrownBy( + () -> + pool.run( + conn -> { + try { + throw new SQLTransientConnectionException("transient failure"); + } catch (java.sql.SQLException e) { + throw new UncheckedSQLException(e, "wrapped"); + } + })) + .isInstanceOf(UncheckedSQLException.class) + .hasMessageContaining("wrapped"); + } + } } diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/JdbcLockFactory.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/JdbcLockFactory.java index f68605accc57..30e95b1edba0 100644 --- a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/JdbcLockFactory.java +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/maintenance/api/JdbcLockFactory.java @@ -260,10 +260,6 @@ public void unlock() { this, instanceId, count); - } catch (SQLException e) { - // SQL exception happened when 
deleting lock information - throw new UncheckedSQLException( - e, "Failed to delete %s lock with instanceId %s", this, instanceId); } return null; @@ -298,9 +294,6 @@ private String instanceId() { return null; } } - } catch (SQLException e) { - // SQL exception happened when getting lock information - throw new UncheckedSQLException(e, "Failed to get lock information for %s", type); } }); } catch (InterruptedException e) { diff --git a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestJdbcLockFactory.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestJdbcLockFactory.java index 3cb18ffbb77e..4d35792e440e 100644 --- a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestJdbcLockFactory.java +++ b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestJdbcLockFactory.java @@ -19,11 +19,18 @@ package org.apache.iceberg.flink.maintenance.api; import static org.apache.iceberg.flink.maintenance.api.JdbcLockFactory.INIT_LOCK_TABLES_PROPERTY; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; +import java.sql.SQLTransientConnectionException; import java.util.Map; import java.util.UUID; +import java.util.concurrent.atomic.AtomicInteger; import org.apache.iceberg.jdbc.JdbcCatalog; +import org.apache.iceberg.jdbc.JdbcClientPool; +import org.apache.iceberg.jdbc.UncheckedSQLException; import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.junit.jupiter.api.Test; class TestJdbcLockFactory extends TestLockFactoryBase { @Override @@ -38,4 +45,62 @@ TriggerLockFactory lockFactory(String tableName) { tableName, properties); } + + @Test + void testSQLExceptionEnablesRetryInClientPool() throws Exception { + // Regression test for #15759: verify that removing the inner try-catch allows + // ClientPoolImpl to retry on transient connection failures. 
+ // + // Before the fix: inner catch converted SQLException -> UncheckedSQLException + // (RuntimeException) inside the lambda. ClientPoolImpl only catches the declared + // exception type (SQLException), so RuntimeException bypasses retry entirely. + // After the fix: SQLException propagates naturally, ClientPoolImpl catches it, + // and retries on transient connection exceptions. + Map props = Maps.newHashMap(); + props.put("username", "user"); + props.put("password", "password"); + String uri = "jdbc:sqlite:file::memory:?ic" + UUID.randomUUID().toString().replace("-", ""); + + try (JdbcClientPool pool = new JdbcClientPool(1, uri, props)) { + AtomicInteger attempts = new AtomicInteger(0); + + String result = + pool.run( + conn -> { + if (attempts.incrementAndGet() == 1) { + throw new SQLTransientConnectionException("transient failure"); + } + + return "success"; + }); + + assertThat(result).isEqualTo("success"); + assertThat(attempts.get()).isGreaterThan(1); + } + } + + @Test + void testUncheckedSQLExceptionBypassesRetry() throws Exception { + // Companion test: demonstrates that wrapping SQLException as UncheckedSQLException + // (the OLD behavior before the fix) prevents ClientPoolImpl from retrying. 
+ Map props = Maps.newHashMap(); + props.put("username", "user"); + props.put("password", "password"); + String uri = "jdbc:sqlite:file::memory:?ic" + UUID.randomUUID().toString().replace("-", ""); + + try (JdbcClientPool pool = new JdbcClientPool(1, uri, props)) { + assertThatThrownBy( + () -> + pool.run( + conn -> { + try { + throw new SQLTransientConnectionException("transient failure"); + } catch (java.sql.SQLException e) { + throw new UncheckedSQLException(e, "wrapped"); + } + })) + .isInstanceOf(UncheckedSQLException.class) + .hasMessageContaining("wrapped"); + } + } } From 1bdbed7a5ddd3981dd958b8a6e8124ca56c5a0e4 Mon Sep 17 00:00:00 2001 From: Swapna Marru Date: Thu, 30 Apr 2026 05:03:59 -0700 Subject: [PATCH 133/197] Flink: SQL: Make Dynamic sink options to be configurable in SQL (#15780) --- .../apache/iceberg/flink/FlinkConfParser.java | 2 +- .../sink/dynamic/DynamicIcebergSink.java | 71 ++++++----- .../flink/sink/dynamic/DynamicRecord.java | 4 + .../sink/dynamic/DynamicRecordProcessor.java | 36 ++++-- .../sink/dynamic/DynamicRecordWithConfig.java | 94 ++++++++++++++ .../dynamic/DynamicTableUpdateOperator.java | 19 +-- .../sink/dynamic/FlinkDynamicSinkConf.java | 102 +++++++++++++++ .../sink/dynamic/FlinkDynamicSinkOptions.java | 71 +++++++++++ .../flink/sink/dynamic/HashKeyGenerator.java | 5 +- .../sink/dynamic/TestDynamicIcebergSink.java | 105 +++++++++++++-- .../dynamic/TestDynamicRecordWithConfig.java | 120 ++++++++++++++++++ .../TestDynamicTableUpdateOperator.java | 56 +++----- .../sink/dynamic/TestHashKeyGenerator.java | 63 ++++++++- 13 files changed, 631 insertions(+), 117 deletions(-) create mode 100644 flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordWithConfig.java create mode 100644 flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/FlinkDynamicSinkConf.java create mode 100644 flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/FlinkDynamicSinkOptions.java create mode 100644 
flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicRecordWithConfig.java diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/FlinkConfParser.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/FlinkConfParser.java index e0672811cf5f..7661372c88e8 100644 --- a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/FlinkConfParser.java +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/FlinkConfParser.java @@ -44,7 +44,7 @@ public FlinkConfParser(Table table, Map options, ReadableConfig this.readableConfig = readableConfig; } - FlinkConfParser(Map options, ReadableConfig readableConfig) { + public FlinkConfParser(Map options, ReadableConfig readableConfig) { this.tableProperties = ImmutableMap.of(); this.options = options; this.readableConfig = readableConfig; diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java index 7b0de6fbe9e3..ad430cbf13f8 100644 --- a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java @@ -235,12 +235,6 @@ public static class Builder { private final Map snapshotSummary = Maps.newHashMap(); private ReadableConfig readableConfig = new Configuration(); private TableCreator tableCreator = TableCreator.DEFAULT; - private boolean immediateUpdate = false; - private boolean dropUnusedColumns = false; - private int cacheMaximumSize = 100; - private long cacheRefreshMs = 1_000; - private int inputSchemasPerTableCacheMaximumSize = 10; - private boolean caseSensitive = true; Builder() {} @@ -361,7 +355,9 @@ public Builder toBranch(String branch) { } public Builder immediateTableUpdate(boolean newImmediateUpdate) { - this.immediateUpdate = newImmediateUpdate; + writeOptions.put( + 
FlinkDynamicSinkOptions.IMMEDIATE_TABLE_UPDATE.key(), + Boolean.toString(newImmediateUpdate)); return this; } @@ -377,19 +373,21 @@ public Builder immediateTableUpdate(boolean newImmediateUpdate) { * will never return data of the old column. */ public Builder dropUnusedColumns(boolean newDropUnusedColumns) { - this.dropUnusedColumns = newDropUnusedColumns; + writeOptions.put( + FlinkDynamicSinkOptions.DROP_UNUSED_COLUMNS.key(), + Boolean.toString(newDropUnusedColumns)); return this; } /** Maximum size of the caches used in Dynamic Sink for table data and serializers. */ public Builder cacheMaxSize(int maxSize) { - this.cacheMaximumSize = maxSize; + writeOptions.put(FlinkDynamicSinkOptions.CACHE_MAX_SIZE.key(), Integer.toString(maxSize)); return this; } /** Maximum interval for cache items renewals. */ public Builder cacheRefreshMs(long refreshMs) { - this.cacheRefreshMs = refreshMs; + writeOptions.put(FlinkDynamicSinkOptions.CACHE_REFRESH_MS.key(), Long.toString(refreshMs)); return this; } @@ -399,7 +397,9 @@ public Builder cacheRefreshMs(long refreshMs) { * comparison results. */ public Builder inputSchemasPerTableCacheMaxSize(int inputSchemasPerTableCacheMaxSize) { - this.inputSchemasPerTableCacheMaximumSize = inputSchemasPerTableCacheMaxSize; + writeOptions.put( + FlinkDynamicSinkOptions.INPUT_SCHEMAS_PER_TABLE_CACHE_MAX_SIZE.key(), + Integer.toString(inputSchemasPerTableCacheMaxSize)); return this; } @@ -408,7 +408,8 @@ public Builder inputSchemasPerTableCacheMaxSize(int inputSchemasPerTableCache * field names case-sensitive. 
*/ public Builder caseSensitive(boolean newCaseSensitive) { - this.caseSensitive = newCaseSensitive; + writeOptions.put( + FlinkDynamicSinkOptions.CASE_SENSITIVE.key(), Boolean.toString(newCaseSensitive)); return this; } @@ -424,14 +425,14 @@ private DynamicIcebergSink build( generator != null, "Please use withGenerator() to convert the input DataStream."); Preconditions.checkNotNull(catalogLoader, "Catalog loader shouldn't be null"); - Configuration flinkConfig = - readableConfig instanceof Configuration - ? (Configuration) readableConfig - : Configuration.fromMap(readableConfig.toMap()); + Configuration flinkConfig = fromReadableConfig(); + FlinkDynamicSinkConf flinkDynamicSinkConf = + new FlinkDynamicSinkConf(writeOptions, flinkConfig); // Forward writer: chained with generator via forward edge, no data shuffle ForwardWriterSink forwardWriterSink = - new ForwardWriterSink(catalogLoader, writeOptions, flinkConfig, cacheMaximumSize); + new ForwardWriterSink( + catalogLoader, writeOptions, flinkConfig, flinkDynamicSinkConf.cacheMaxSize()); TypeInformation> writeResultTypeInfo = CommittableMessageTypeInfo.of(DynamicWriteResultSerializer::new); @@ -455,13 +456,15 @@ DynamicIcebergSink instantiateSink( Map writeProperties, Configuration flinkWriteConf, DataStream> forwardWriteResults) { + FlinkDynamicSinkConf flinkDynamicSinkConf = + new FlinkDynamicSinkConf(writeProperties, flinkWriteConf); return new DynamicIcebergSink( catalogLoader, snapshotSummary, uidPrefix, writeProperties, flinkWriteConf, - cacheMaximumSize, + flinkDynamicSinkConf.cacheMaxSize(), forwardWriteResults); } @@ -485,10 +488,14 @@ DynamicIcebergSink instantiateSink( public DataStreamSink append() { uidPrefix = Optional.ofNullable(uidPrefix).orElse(""); + FlinkDynamicSinkConf flinkDynamicSinkConf = + new FlinkDynamicSinkConf(writeOptions, readableConfig); + Configuration flinkConfig = fromReadableConfig(); + DynamicRecordInternalType type = - new DynamicRecordInternalType(catalogLoader, false, 
cacheMaximumSize); + new DynamicRecordInternalType(catalogLoader, false, flinkDynamicSinkConf.cacheMaxSize()); DynamicRecordInternalType sideOutputType = - new DynamicRecordInternalType(catalogLoader, true, cacheMaximumSize); + new DynamicRecordInternalType(catalogLoader, true, flinkDynamicSinkConf.cacheMaxSize()); SingleOutputStreamOperator converted = input @@ -496,13 +503,10 @@ public DataStreamSink append() { new DynamicRecordProcessor<>( generator, catalogLoader, - immediateUpdate, - cacheMaximumSize, - cacheRefreshMs, - inputSchemasPerTableCacheMaximumSize, tableCreator, - caseSensitive, - dropUnusedColumns)) + flinkDynamicSinkConf, + writeOptions, + flinkConfig)) .setParallelism(input.getParallelism()) .uid(prefixIfNotNull(uidPrefix, "-generator")) .name(operatorName("generator")) @@ -518,14 +522,7 @@ public DataStreamSink append() { DynamicRecordProcessor.DYNAMIC_TABLE_UPDATE_STREAM, sideOutputType)) .keyBy((KeySelector) DynamicRecordInternal::tableName) .map( - new DynamicTableUpdateOperator( - catalogLoader, - cacheMaximumSize, - cacheRefreshMs, - inputSchemasPerTableCacheMaximumSize, - tableCreator, - caseSensitive, - dropUnusedColumns)) + new DynamicTableUpdateOperator(catalogLoader, tableCreator, flinkDynamicSinkConf)) .uid(prefixIfNotNull(uidPrefix, "-updater")) .name(operatorName("Updater")) .returns(type) @@ -543,6 +540,12 @@ public DataStreamSink append() { return result; } + + private Configuration fromReadableConfig() { + return readableConfig instanceof Configuration + ? 
(Configuration) readableConfig + : Configuration.fromMap(readableConfig.toMap()); + } } DataStream distributeDataStream(DataStream input) { diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecord.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecord.java index 15b83a589382..6507a575c2af 100644 --- a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecord.java +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecord.java @@ -20,6 +20,7 @@ import java.util.Set; import javax.annotation.Nullable; +import org.apache.flink.annotation.Internal; import org.apache.flink.table.data.RowData; import org.apache.iceberg.DistributionMode; import org.apache.iceberg.PartitionSpec; @@ -39,6 +40,9 @@ public class DynamicRecord { private boolean upsertMode; @Nullable private Set equalityFields; + @Internal + DynamicRecord() {} + /** * Constructs a new DynamicRecord with forward (no shuffle) writes. 
* diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordProcessor.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordProcessor.java index fc6892b2cd9e..c752b8e9b8d9 100644 --- a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordProcessor.java +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordProcessor.java @@ -18,10 +18,12 @@ */ package org.apache.iceberg.flink.sink.dynamic; +import java.util.Map; import org.apache.flink.annotation.Internal; import org.apache.flink.annotation.VisibleForTesting; import org.apache.flink.api.common.functions.OpenContext; import org.apache.flink.api.java.tuple.Tuple2; +import org.apache.flink.configuration.Configuration; import org.apache.flink.streaming.api.functions.ProcessFunction; import org.apache.flink.table.data.RowData; import org.apache.flink.util.Collector; @@ -30,6 +32,7 @@ import org.apache.iceberg.Schema; import org.apache.iceberg.catalog.Catalog; import org.apache.iceberg.flink.CatalogLoader; +import org.apache.iceberg.flink.FlinkWriteConf; @Internal class DynamicRecordProcessor extends ProcessFunction @@ -41,6 +44,8 @@ class DynamicRecordProcessor extends ProcessFunction generator; private final CatalogLoader catalogLoader; + private final Map writeProperties; + private final Configuration flinkConfig; private final boolean immediateUpdate; private final boolean dropUnusedColumns; private final int cacheMaximumSize; @@ -55,27 +60,27 @@ class DynamicRecordProcessor extends ProcessFunction updateStream; private transient OutputTag forwardStream; private transient Collector collector; + private transient DynamicRecordWithConfig dynamicRecordWithConfig; private transient Context context; DynamicRecordProcessor( DynamicRecordGenerator generator, CatalogLoader catalogLoader, - boolean immediateUpdate, - int cacheMaximumSize, - long cacheRefreshMs, - int 
inputSchemasPerTableCacheMaximumSize, TableCreator tableCreator, - boolean caseSensitive, - boolean dropUnusedColumns) { + FlinkDynamicSinkConf sinkConfig, + Map writeProperties, + Configuration flinkConfig) { this.generator = generator; this.catalogLoader = catalogLoader; - this.immediateUpdate = immediateUpdate; - this.cacheMaximumSize = cacheMaximumSize; - this.cacheRefreshMs = cacheRefreshMs; - this.inputSchemasPerTableCacheMaximumSize = inputSchemasPerTableCacheMaximumSize; + this.flinkConfig = flinkConfig; + this.writeProperties = writeProperties; + this.immediateUpdate = sinkConfig.immediateTableUpdate(); + this.cacheMaximumSize = sinkConfig.cacheMaxSize(); + this.cacheRefreshMs = sinkConfig.cacheRefreshMs(); + this.inputSchemasPerTableCacheMaximumSize = sinkConfig.inputSchemasPerTableCacheMaxSize(); this.tableCreator = tableCreator; - this.caseSensitive = caseSensitive; - this.dropUnusedColumns = dropUnusedColumns; + this.caseSensitive = sinkConfig.caseSensitive(); + this.dropUnusedColumns = sinkConfig.dropUnusedColumns(); } @Override @@ -107,6 +112,8 @@ public void open(OpenContext openContext) throws Exception { new DynamicRecordInternalType(catalogLoader, true, cacheMaximumSize)) {}; } + this.dynamicRecordWithConfig = + new DynamicRecordWithConfig(new FlinkWriteConf(writeProperties, flinkConfig)); generator.open(openContext); } @@ -119,9 +126,10 @@ public void processElement(T element, Context ctx, Collector 0 || defaultWriteParallelism == null) { + return originalParallelism; + } + + return defaultWriteParallelism; + } + + @Override + public TableIdentifier tableIdentifier() { + return wrapped.tableIdentifier(); + } + + @Override + public Schema schema() { + return wrapped.schema(); + } + + @Override + public PartitionSpec spec() { + return wrapped.spec(); + } + + @Override + public RowData rowData() { + return wrapped.rowData(); + } + + @Override + public boolean upsertMode() { + return wrapped.upsertMode(); + } + + @Override + public Set 
equalityFields() { + return wrapped.equalityFields(); + } +} diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicTableUpdateOperator.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicTableUpdateOperator.java index 456f20adf59f..93c268ff86ad 100644 --- a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicTableUpdateOperator.java +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicTableUpdateOperator.java @@ -48,20 +48,15 @@ class DynamicTableUpdateOperator private transient TableUpdater updater; DynamicTableUpdateOperator( - CatalogLoader catalogLoader, - int cacheMaximumSize, - long cacheRefreshMs, - int inputSchemasPerTableCacheMaximumSize, - TableCreator tableCreator, - boolean caseSensitive, - boolean dropUnusedColumns) { + CatalogLoader catalogLoader, TableCreator tableCreator, FlinkDynamicSinkConf configuration) { this.catalogLoader = catalogLoader; - this.cacheMaximumSize = cacheMaximumSize; - this.cacheRefreshMs = cacheRefreshMs; - this.inputSchemasPerTableCacheMaximumSize = inputSchemasPerTableCacheMaximumSize; this.tableCreator = tableCreator; - this.caseSensitive = caseSensitive; - this.dropUnusedColumns = dropUnusedColumns; + + this.cacheMaximumSize = configuration.cacheMaxSize(); + this.cacheRefreshMs = configuration.cacheRefreshMs(); + this.inputSchemasPerTableCacheMaximumSize = configuration.inputSchemasPerTableCacheMaxSize(); + this.caseSensitive = configuration.caseSensitive(); + this.dropUnusedColumns = configuration.dropUnusedColumns(); } @Override diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/FlinkDynamicSinkConf.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/FlinkDynamicSinkConf.java new file mode 100644 index 000000000000..75b169c4b533 --- /dev/null +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/FlinkDynamicSinkConf.java @@ -0,0 
+1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.dynamic; + +import java.util.Map; +import org.apache.flink.configuration.ReadableConfig; +import org.apache.iceberg.flink.FlinkConfParser; + +/** + * A class for common Dynamic Iceberg sink configs for Flink writes. + * + *

      If a config is set at multiple levels, the following order of precedence is used (top to + * bottom): + * + *

        + *
      1. Write options + *
      2. Flink ReadableConfig + *
      3. Default values + *
      + * + * The most specific value is set in write options and takes precedence over all other configs. If + * no write option is provided, this class checks the Flink configuration for any overrides. If + * neither write options nor Flink configuration provide a value, the default values are used. + */ +class FlinkDynamicSinkConf { + + private final FlinkConfParser confParser; + + FlinkDynamicSinkConf(Map writeOptions, ReadableConfig readableConfig) { + this.confParser = new FlinkConfParser(writeOptions, readableConfig); + } + + int cacheMaxSize() { + return confParser + .intConf() + .option(FlinkDynamicSinkOptions.CACHE_MAX_SIZE.key()) + .flinkConfig(FlinkDynamicSinkOptions.CACHE_MAX_SIZE) + .defaultValue(FlinkDynamicSinkOptions.CACHE_MAX_SIZE.defaultValue()) + .parse(); + } + + boolean immediateTableUpdate() { + return confParser + .booleanConf() + .option(FlinkDynamicSinkOptions.IMMEDIATE_TABLE_UPDATE.key()) + .flinkConfig(FlinkDynamicSinkOptions.IMMEDIATE_TABLE_UPDATE) + .defaultValue(FlinkDynamicSinkOptions.IMMEDIATE_TABLE_UPDATE.defaultValue()) + .parse(); + } + + boolean dropUnusedColumns() { + return confParser + .booleanConf() + .option(FlinkDynamicSinkOptions.DROP_UNUSED_COLUMNS.key()) + .flinkConfig(FlinkDynamicSinkOptions.DROP_UNUSED_COLUMNS) + .defaultValue(FlinkDynamicSinkOptions.DROP_UNUSED_COLUMNS.defaultValue()) + .parse(); + } + + long cacheRefreshMs() { + return confParser + .longConf() + .option(FlinkDynamicSinkOptions.CACHE_REFRESH_MS.key()) + .flinkConfig(FlinkDynamicSinkOptions.CACHE_REFRESH_MS) + .defaultValue(FlinkDynamicSinkOptions.CACHE_REFRESH_MS.defaultValue()) + .parse(); + } + + int inputSchemasPerTableCacheMaxSize() { + return confParser + .intConf() + .option(FlinkDynamicSinkOptions.INPUT_SCHEMAS_PER_TABLE_CACHE_MAX_SIZE.key()) + .flinkConfig(FlinkDynamicSinkOptions.INPUT_SCHEMAS_PER_TABLE_CACHE_MAX_SIZE) + .defaultValue(FlinkDynamicSinkOptions.INPUT_SCHEMAS_PER_TABLE_CACHE_MAX_SIZE.defaultValue()) + .parse(); + } + + boolean
caseSensitive() { + return confParser + .booleanConf() + .option(FlinkDynamicSinkOptions.CASE_SENSITIVE.key()) + .flinkConfig(FlinkDynamicSinkOptions.CASE_SENSITIVE) + .defaultValue(FlinkDynamicSinkOptions.CASE_SENSITIVE.defaultValue()) + .parse(); + } +} diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/FlinkDynamicSinkOptions.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/FlinkDynamicSinkOptions.java new file mode 100644 index 000000000000..7a4f038219d9 --- /dev/null +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/FlinkDynamicSinkOptions.java @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.dynamic; + +import org.apache.flink.annotation.Experimental; +import org.apache.flink.configuration.ConfigOption; +import org.apache.flink.configuration.ConfigOptions; + +@Experimental +public class FlinkDynamicSinkOptions { + + private FlinkDynamicSinkOptions() {} + + public static final ConfigOption CACHE_MAX_SIZE = + ConfigOptions.key("dynamic-sink.cache-max-size") + .intType() + .defaultValue(100) + .withDescription( + "Maximum size of the caches used in Dynamic Sink for table data and serializers."); + + public static final ConfigOption IMMEDIATE_TABLE_UPDATE = + ConfigOptions.key("dynamic-sink.immediate-table-update") + .booleanType() + .defaultValue(false) + .withDescription( + "Controls whether table schema and partition updates should be applied immediately in Dynamic Sink."); + + public static final ConfigOption DROP_UNUSED_COLUMNS = + ConfigOptions.key("dynamic-sink.drop-unused-columns") + .booleanType() + .defaultValue(false) + .withDescription( + "Allows dropping unused columns during schema evolution in Dynamic Sink."); + + public static final ConfigOption CACHE_REFRESH_MS = + ConfigOptions.key("dynamic-sink.cache-refresh-ms") + .longType() + .defaultValue(1_000L) + .withDescription( + "Cache refresh interval for dynamic table metadata in Dynamic Sink in milliseconds."); + + public static final ConfigOption INPUT_SCHEMAS_PER_TABLE_CACHE_MAX_SIZE = + ConfigOptions.key("dynamic-sink.input-schemas-per-table-cache-max-size") + .intType() + .defaultValue(10) + .withDescription( + "Maximum input schema objects to cache per each table in Dynamic Sink for performance."); + + public static final ConfigOption CASE_SENSITIVE = + ConfigOptions.key("dynamic-sink.case-sensitive") + .booleanType() + .defaultValue(true) + .withDescription( + "Controls whether schema field name matching should be case-sensitive in Dynamic Sink."); +} diff --git 
a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/HashKeyGenerator.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/HashKeyGenerator.java index fca45bf882e0..61a850212bf4 100644 --- a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/HashKeyGenerator.java +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/HashKeyGenerator.java @@ -88,7 +88,7 @@ int generateKey( dynamicRecord.schema(), dynamicRecord.spec(), dynamicRecord.equalityFields(), - MoreObjects.firstNonNull(dynamicRecord.distributionMode(), DistributionMode.NONE), + dynamicRecord.distributionMode(), Math.min(dynamicRecord.writeParallelism(), maxWriteParallelism)); KeySelector keySelector = keySelectorCache.computeIfAbsent( @@ -98,8 +98,7 @@ int generateKey( tableIdent, MoreObjects.firstNonNull(tableSchema, dynamicRecord.schema()), MoreObjects.firstNonNull(tableSpec, dynamicRecord.spec()), - MoreObjects.firstNonNull( - dynamicRecord.distributionMode(), DistributionMode.NONE), + dynamicRecord.distributionMode(), MoreObjects.firstNonNull( dynamicRecord.equalityFields(), Collections.emptySet()), Math.min(dynamicRecord.writeParallelism(), maxWriteParallelism))); diff --git a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSink.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSink.java index 4e7511501014..89befb9e8ea2 100644 --- a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSink.java +++ b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSink.java @@ -78,6 +78,7 @@ import org.apache.iceberg.flink.CatalogLoader; import org.apache.iceberg.flink.FlinkSchemaUtil; import org.apache.iceberg.flink.FlinkWriteConf; +import org.apache.iceberg.flink.FlinkWriteOptions; import org.apache.iceberg.flink.MiniFlinkClusterExtension; import org.apache.iceberg.flink.SimpleDataUtil; 
import org.apache.iceberg.flink.TestHelpers; @@ -86,6 +87,7 @@ import org.apache.iceberg.flink.sink.dynamic.TestDynamicCommitter.FailBeforeAndAfterCommit; import org.apache.iceberg.inmemory.InMemoryInputFile; import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Iterables; @@ -122,6 +124,7 @@ private static class DynamicIcebergDataImpl implements Serializable { PartitionSpec partitionSpec; boolean upsertMode; Set equalityFields; + int writeParallelism; private DynamicIcebergDataImpl( Schema schemaProvided, String tableName, String branch, PartitionSpec partitionSpec) { @@ -133,7 +136,8 @@ private DynamicIcebergDataImpl( partitionSpec, false, Collections.emptySet(), - false); + false, + 10); } private DynamicIcebergDataImpl( @@ -150,7 +154,8 @@ private DynamicIcebergDataImpl( partitionSpec, false, Collections.emptySet(), - false); + false, + 10); } private DynamicIcebergDataImpl( @@ -169,7 +174,8 @@ private DynamicIcebergDataImpl( partitionSpec, upsertMode, equalityFields, - isDuplicate); + isDuplicate, + 10); } private DynamicIcebergDataImpl( @@ -180,7 +186,8 @@ private DynamicIcebergDataImpl( PartitionSpec partitionSpec, boolean upsertMode, Set equalityFields, - boolean isDuplicate) { + boolean isDuplicate, + int writeParallelism) { this.rowProvided = randomRow(schemaProvided, isDuplicate ? seed : ++seed); this.rowExpected = isDuplicate ? 
null : rowProvided; this.schemaProvided = schemaProvided; @@ -190,6 +197,7 @@ private DynamicIcebergDataImpl( this.partitionSpec = partitionSpec; this.upsertMode = upsertMode; this.equalityFields = equalityFields; + this.writeParallelism = writeParallelism; } } @@ -209,7 +217,7 @@ public void generate(DynamicIcebergDataImpl row, Collector out) { converter(schema).toInternal(row.rowProvided), spec, spec.isPartitioned() ? DistributionMode.HASH : DistributionMode.NONE, - 10); + row.writeParallelism); dynamicRecord.setUpsertMode(row.upsertMode); dynamicRecord.setEqualityFields(row.equalityFields); out.collect(dynamicRecord); @@ -381,6 +389,19 @@ private void runForwardWriteTest(DynamicRecordGenerator verifyResults(rows); } + @Test + void testWriteWithNullBranch() throws Exception { + List rows = + Lists.newArrayList( + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, "t1", null, PartitionSpec.unpartitioned()), + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, "t1", null, PartitionSpec.unpartitioned())); + + runTest( + rows, this.env, false, 1, ImmutableMap.of(FlinkWriteOptions.BRANCH.key(), "test-branch")); + } + @Test void testWritePartitioned() throws Exception { PartitionSpec spec = PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).bucket("id", 10).build(); @@ -1369,6 +1390,35 @@ void testGeneratorDefaultParallelism() { assertThat(generatorParallelism).isEqualTo(source.getParallelism()); } + @Test + void testFallBackParallelismFromConfig() throws Exception { + List rows = + Lists.newArrayList( + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, + SimpleDataUtil.SCHEMA, + "t1", + SnapshotRef.MAIN_BRANCH, + PartitionSpec.unpartitioned(), + false, + Collections.emptySet(), + false, + -1), + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, + SimpleDataUtil.SCHEMA, + "t1", + SnapshotRef.MAIN_BRANCH, + PartitionSpec.unpartitioned(), + false, + Collections.emptySet(), + false, + 0)); + + runTest( + rows, this.env, true, 2, 
ImmutableMap.of(FlinkWriteOptions.WRITE_PARALLELISM.key(), "1")); + } + private Set createSinkAndReturnUIds(String uidPrefix) { StreamExecutionEnvironment streamEnv = StreamExecutionEnvironment.getExecutionEnvironment(); @@ -1477,6 +1527,18 @@ private void runTest( verifyResults(dynamicData); } + private void runTest( + List dynamicData, + StreamExecutionEnvironment env, + boolean immediateUpdate, + int parallelism, + Map writeProperties) + throws Exception { + executeDynamicSink( + dynamicData, env, immediateUpdate, parallelism, null, false, writeProperties); + verifyResults(dynamicData, writeProperties); + } + private void executeDynamicSink( List dynamicData, StreamExecutionEnvironment env, @@ -1484,7 +1546,8 @@ private void executeDynamicSink( int parallelism, @Nullable CommitHook commitHook) throws Exception { - executeDynamicSink(dynamicData, env, immediateUpdate, parallelism, commitHook, false); + executeDynamicSink( + dynamicData, env, immediateUpdate, parallelism, commitHook, false, Maps.newHashMap()); } private void executeDynamicSink( @@ -1495,6 +1558,19 @@ private void executeDynamicSink( @Nullable CommitHook commitHook, boolean overwrite) throws Exception { + executeDynamicSink( + dynamicData, env, immediateUpdate, parallelism, commitHook, overwrite, Maps.newHashMap()); + } + + private void executeDynamicSink( + List dynamicData, + StreamExecutionEnvironment env, + boolean immediateUpdate, + int parallelism, + @Nullable CommitHook commitHook, + boolean overwrite, + Map writeProperties) + throws Exception { DataStream dataStream = env.fromData(dynamicData, TypeInformation.of(new TypeHint<>() {})); env.setParallelism(parallelism); @@ -1508,6 +1584,7 @@ private void executeDynamicSink( .immediateTableUpdate(immediateUpdate) .setSnapshotProperty("commit.retry.num-retries", "0") .overwrite(overwrite) + .setAll(writeProperties) .append(); } else { DynamicIcebergSink.forInput(dataStream) @@ -1516,6 +1593,7 @@ private void executeDynamicSink( 
.writeParallelism(parallelism) .immediateTableUpdate(immediateUpdate) .overwrite(overwrite) + .setAll(writeProperties) .append(); } @@ -1542,7 +1620,6 @@ DynamicIcebergSink instantiateSink( "uidPrefix", writeProperties, flinkConfig, - 100, forwardWriteResults); } } @@ -1559,7 +1636,6 @@ static class CommitHookDynamicIcebergSink extends DynamicIcebergSink { String uidPrefix, Map writeProperties, Configuration flinkConfig, - int cacheMaximumSize, DataStream> forwardWritten) { super( catalogLoader, @@ -1567,7 +1643,7 @@ static class CommitHookDynamicIcebergSink extends DynamicIcebergSink { uidPrefix, writeProperties, flinkConfig, - cacheMaximumSize, + 100, forwardWritten); this.commitHook = commitHook; this.overwriteMode = new FlinkWriteConf(writeProperties, flinkConfig).overwriteMode(); @@ -1587,6 +1663,12 @@ public Committer createCommitter(CommitterInitContext contex } private void verifyResults(List dynamicData) throws IOException { + verifyResults(dynamicData, Maps.newHashMap()); + } + + private void verifyResults( + List dynamicData, Map writeProperties) + throws IOException { // Calculate the expected result Map, List> expectedData = Maps.newHashMap(); Map expectedSchema = Maps.newHashMap(); @@ -1600,9 +1682,12 @@ private void verifyResults(List dynamicData) throws IOEx dynamicData.forEach( r -> { + String branch = + MoreObjects.firstNonNull( + r.branch, writeProperties.get(FlinkWriteOptions.BRANCH.key())); List data = expectedData.computeIfAbsent( - Tuple2.of(r.tableName, r.branch), unused -> Lists.newArrayList()); + Tuple2.of(r.tableName, branch), unused -> Lists.newArrayList()); data.addAll( convertToRowData(expectedSchema.get(r.tableName), ImmutableList.of(r.rowExpected))); }); diff --git a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicRecordWithConfig.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicRecordWithConfig.java new file mode 100644 index 000000000000..de55621475ed --- 
/dev/null +++ b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicRecordWithConfig.java @@ -0,0 +1,120 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.dynamic; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.Collections; +import java.util.Set; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.iceberg.DistributionMode; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SnapshotRef; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.flink.FlinkWriteConf; +import org.apache.iceberg.flink.FlinkWriteOptions; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.Test; + +class TestDynamicRecordWithConfig { + + private static final Schema SCHEMA = + new Schema( + Types.NestedField.required(1, "id", 
Types.IntegerType.get()), + Types.NestedField.required(2, "data", Types.StringType.get())); + + private static final TableIdentifier TABLE_IDENTIFIER = TableIdentifier.of("db", "table"); + private static final PartitionSpec UNPARTITIONED = PartitionSpec.unpartitioned(); + private static final RowData ROW_DATA = GenericRowData.of(1, StringData.fromString("test")); + + @Test + void testBranchFallBack() { + String defaultBranch = "default-branch"; + FlinkWriteConf conf = + new FlinkWriteConf( + ImmutableMap.of(FlinkWriteOptions.BRANCH.key(), defaultBranch), new Configuration()); + DynamicRecordWithConfig dynamicRecordWithConfig = new DynamicRecordWithConfig(conf); + + DynamicRecord dynamicRecord = + new DynamicRecord(TABLE_IDENTIFIER, null, SCHEMA, ROW_DATA, UNPARTITIONED); + assertThat(dynamicRecordWithConfig.wrap(dynamicRecord).branch()).isEqualTo(defaultBranch); + + String customBranch = "custom-branch"; + dynamicRecord.setBranch(customBranch); + assertThat(dynamicRecordWithConfig.wrap(dynamicRecord).branch()).isEqualTo(customBranch); + } + + @Test + void testWriteParallelismFallBack() { + int defaultParallelism = 4; + FlinkWriteConf conf = + new FlinkWriteConf( + ImmutableMap.of( + FlinkWriteOptions.WRITE_PARALLELISM.key(), String.valueOf(defaultParallelism)), + new Configuration()); + DynamicRecordWithConfig dynamicRecordWithConfig = new DynamicRecordWithConfig(conf); + + DynamicRecord dynamicRecord = + new DynamicRecord(TABLE_IDENTIFIER, null, SCHEMA, ROW_DATA, UNPARTITIONED, null, -1); + assertThat(dynamicRecordWithConfig.wrap(dynamicRecord).writeParallelism()) + .isEqualTo(defaultParallelism); + + dynamicRecord.writeParallelism(0); + assertThat(dynamicRecordWithConfig.wrap(dynamicRecord).writeParallelism()) + .isEqualTo(defaultParallelism); + + dynamicRecord.writeParallelism(8); + assertThat(dynamicRecordWithConfig.wrap(dynamicRecord).writeParallelism()).isEqualTo(8); + } + + @Test + void testDelegatesToWrappedRecord() { + FlinkWriteConf conf = new 
FlinkWriteConf(Collections.emptyMap(), new Configuration()); + PartitionSpec partitioned = PartitionSpec.builderFor(SCHEMA).identity("id").build(); + Set equalityFields = ImmutableSet.of("id", "data"); + + DynamicRecord dynamicRecord = + new DynamicRecord( + TABLE_IDENTIFIER, + SnapshotRef.MAIN_BRANCH, + SCHEMA, + ROW_DATA, + partitioned, + DistributionMode.HASH, + 2); + dynamicRecord.setUpsertMode(true); + dynamicRecord.setEqualityFields(equalityFields); + + DynamicRecordWithConfig record = new DynamicRecordWithConfig(conf).wrap(dynamicRecord); + + assertThat(record.tableIdentifier()).isEqualTo(TABLE_IDENTIFIER); + assertThat(record.schema()).isEqualTo(SCHEMA); + assertThat(record.spec()).isEqualTo(partitioned); + assertThat(record.rowData()).isSameAs(ROW_DATA); + assertThat(record.distributionMode()).isEqualTo(DistributionMode.HASH); + assertThat(record.upsertMode()).isTrue(); + assertThat(record.equalityFields()).isEqualTo(equalityFields); + } +} diff --git a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicTableUpdateOperator.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicTableUpdateOperator.java index 1c8e6df8591d..f6b2b368c2be 100644 --- a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicTableUpdateOperator.java +++ b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicTableUpdateOperator.java @@ -23,12 +23,14 @@ import static org.assertj.core.api.Assertions.assertThat; import java.util.Collections; +import org.apache.flink.configuration.Configuration; import org.apache.flink.table.data.GenericRowData; import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Schema; import org.apache.iceberg.catalog.Catalog; import org.apache.iceberg.catalog.TableIdentifier; import org.apache.iceberg.flink.HadoopCatalogExtension; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import 
org.apache.iceberg.types.Types; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.extension.RegisterExtension; @@ -57,9 +59,6 @@ class TestDynamicTableUpdateOperator { @Test void testDynamicTableUpdateOperatorNewTable() throws Exception { - int cacheMaximumSize = 10; - int cacheRefreshMs = 1000; - int inputSchemaCacheMaximumSize = 10; Catalog catalog = CATALOG_EXTENSION.catalog(); TableIdentifier table = TableIdentifier.of(TABLE); @@ -67,12 +66,8 @@ void testDynamicTableUpdateOperatorNewTable() throws Exception { DynamicTableUpdateOperator operator = new DynamicTableUpdateOperator( CATALOG_EXTENSION.catalogLoader(), - cacheMaximumSize, - cacheRefreshMs, - inputSchemaCacheMaximumSize, TableCreator.DEFAULT, - CASE_SENSITIVE, - PRESERVE_COLUMNS); + flinkDynamicSinkConfiguration(CASE_SENSITIVE, PRESERVE_COLUMNS)); operator.open(null); DynamicRecordInternal input = @@ -93,21 +88,14 @@ void testDynamicTableUpdateOperatorNewTable() throws Exception { @Test void testDynamicTableUpdateOperatorSchemaChange() throws Exception { - int cacheMaximumSize = 10; - int cacheRefreshMs = 1000; - int inputSchemaCacheMaximumSize = 10; Catalog catalog = CATALOG_EXTENSION.catalog(); TableIdentifier table = TableIdentifier.of(TABLE); DynamicTableUpdateOperator operator = new DynamicTableUpdateOperator( CATALOG_EXTENSION.catalogLoader(), - cacheMaximumSize, - cacheRefreshMs, - inputSchemaCacheMaximumSize, TableCreator.DEFAULT, - CASE_SENSITIVE, - PRESERVE_COLUMNS); + flinkDynamicSinkConfiguration(CASE_SENSITIVE, PRESERVE_COLUMNS)); operator.open(null); catalog.createTable(table, SCHEMA1); @@ -135,9 +123,6 @@ void testDynamicTableUpdateOperatorSchemaChange() throws Exception { @ParameterizedTest @ValueSource(booleans = {true, false}) void testCaseInSensitivity(boolean caseSensitive) throws Exception { - int cacheMaximumSize = 10; - int cacheRefreshMs = 1000; - int inputSchemaCacheMaximumSize = 10; Catalog catalog = CATALOG_EXTENSION.catalog(); TableIdentifier table = 
TableIdentifier.of(TABLE); @@ -148,12 +133,8 @@ void testCaseInSensitivity(boolean caseSensitive) throws Exception { DynamicTableUpdateOperator operator = new DynamicTableUpdateOperator( CATALOG_EXTENSION.catalogLoader(), - cacheMaximumSize, - cacheRefreshMs, - inputSchemaCacheMaximumSize, TableCreator.DEFAULT, - caseSensitive, - PRESERVE_COLUMNS); + flinkDynamicSinkConfiguration(caseSensitive, PRESERVE_COLUMNS)); operator.open(null); catalog.createTable(table, initialSchema); @@ -187,21 +168,14 @@ void testCaseInSensitivity(boolean caseSensitive) throws Exception { @Test void testDynamicTableUpdateOperatorPreserveUnusedColumns() throws Exception { - int cacheMaximumSize = 10; - int cacheRefreshMs = 1000; - int inputSchemaCacheMaximumSize = 10; Catalog catalog = CATALOG_EXTENSION.catalog(); TableIdentifier table = TableIdentifier.of(TABLE); DynamicTableUpdateOperator operator = new DynamicTableUpdateOperator( CATALOG_EXTENSION.catalogLoader(), - cacheMaximumSize, - cacheRefreshMs, - inputSchemaCacheMaximumSize, TableCreator.DEFAULT, - CASE_SENSITIVE, - PRESERVE_COLUMNS); + flinkDynamicSinkConfiguration(CASE_SENSITIVE, PRESERVE_COLUMNS)); operator.open(null); catalog.createTable(table, SCHEMA2); @@ -228,21 +202,14 @@ void testDynamicTableUpdateOperatorPreserveUnusedColumns() throws Exception { @Test void testDynamicTableUpdateOperatorDropUnusedColumns() throws Exception { - int cacheMaximumSize = 10; - int cacheRefreshMs = 1000; - int inputSchemaCacheMaximumSize = 10; Catalog catalog = CATALOG_EXTENSION.catalog(); TableIdentifier table = TableIdentifier.of(TABLE); DynamicTableUpdateOperator operator = new DynamicTableUpdateOperator( CATALOG_EXTENSION.catalogLoader(), - cacheMaximumSize, - cacheRefreshMs, - inputSchemaCacheMaximumSize, TableCreator.DEFAULT, - CASE_INSENSITIVE, - DROP_COLUMNS); + flinkDynamicSinkConfiguration(CASE_INSENSITIVE, DROP_COLUMNS)); operator.open(null); catalog.createTable(table, SCHEMA2); @@ -265,4 +232,13 @@ void 
testDynamicTableUpdateOperatorDropUnusedColumns() throws Exception { assertThat(tableSchema.findField("data")).isNull(); assertThat(input).isEqualTo(output); } + + private static FlinkDynamicSinkConf flinkDynamicSinkConfiguration( + boolean caseSensitive, boolean dropUnusedColumns) { + return new FlinkDynamicSinkConf( + ImmutableMap.of( + FlinkDynamicSinkOptions.CASE_SENSITIVE.key(), String.valueOf(caseSensitive), + FlinkDynamicSinkOptions.DROP_UNUSED_COLUMNS.key(), String.valueOf(dropUnusedColumns)), + new Configuration()); + } } diff --git a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestHashKeyGenerator.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestHashKeyGenerator.java index c65f96b12cbb..9a485fafaf47 100644 --- a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestHashKeyGenerator.java +++ b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestHashKeyGenerator.java @@ -25,6 +25,7 @@ import java.util.Map; import java.util.Set; import org.apache.flink.api.java.functions.KeySelector; +import org.apache.flink.configuration.Configuration; import org.apache.flink.runtime.state.KeyGroupRangeAssignment; import org.apache.flink.table.data.GenericRowData; import org.apache.flink.table.data.RowData; @@ -34,6 +35,9 @@ import org.apache.iceberg.Schema; import org.apache.iceberg.SnapshotRef; import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.flink.FlinkWriteConf; +import org.apache.iceberg.flink.FlinkWriteOptions; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Sets; import org.apache.iceberg.types.Types; import org.junit.jupiter.api.Test; @@ -229,6 +233,38 @@ void testFailOnNonPositiveWriteParallelism() { }); } + @Test + void testNonPositiveWriteParallelismConfigFallback() throws Exception { + int maxWriteParallelism = 5; + HashKeyGenerator generator = new 
HashKeyGenerator(16, maxWriteParallelism); + PartitionSpec unpartitioned = PartitionSpec.unpartitioned(); + FlinkWriteConf flinkWriteConf = + new FlinkWriteConf( + ImmutableMap.of(FlinkWriteOptions.WRITE_PARALLELISM.key(), "2"), new Configuration()); + + Set writeKeys = Sets.newHashSet(); + for (int i = 0; i < 20; i++) { + GenericRowData row = GenericRowData.of(i, StringData.fromString("z")); + writeKeys.add( + getWriteKey( + generator, + unpartitioned, + DistributionMode.NONE, + i % 2 == 0 ? 0 : -1, + Collections.emptySet(), + row, + flinkWriteConf)); + } + + assertThat(writeKeys).hasSize(2); + assertThat( + writeKeys.stream() + .map(key -> getSubTaskId(key, 2, maxWriteParallelism)) + .distinct() + .count()) + .isEqualTo(2); + } + @Test void testCapAtMaxWriteParallelism() throws Exception { int writeParallelism = 10; @@ -477,10 +513,31 @@ private static int getWriteKey( Set equalityFields, GenericRowData row) throws Exception { - DynamicRecord record = + return getWriteKey( + generator, + spec, + mode, + writeParallelism, + equalityFields, + row, + new FlinkWriteConf(Collections.emptyMap(), new Configuration())); + } + + private static int getWriteKey( + HashKeyGenerator generator, + PartitionSpec spec, + DistributionMode mode, + int writeParallelism, + Set equalityFields, + GenericRowData row, + FlinkWriteConf flinkWriteConf) + throws Exception { + DynamicRecord inputRecord = new DynamicRecord(TABLE_IDENTIFIER, BRANCH, SCHEMA, row, spec, mode, writeParallelism); - record.setEqualityFields(equalityFields); - return generator.generateKey(record); + inputRecord.setEqualityFields(equalityFields); + + DynamicRecordWithConfig dynamicRecordWithConfig = new DynamicRecordWithConfig(flinkWriteConf); + return generator.generateKey(dynamicRecordWithConfig.wrap(inputRecord)); } private static int getSubTaskId(int writeKey1, int writeParallelism, int maxWriteParallelism) { From 0dab08cd9339a323f051b31ff7925825df3c70d3 Mon Sep 17 00:00:00 2001 From: Ryan Blue Date: Thu, 30 Apr 
2026 07:18:14 -0700 Subject: [PATCH 134/197] Flink: Apply LICENSE changes to older Flink versions. (#16159) --- flink/v1.20/flink-runtime/LICENSE | 406 +++--------------------------- flink/v1.20/flink-runtime/NOTICE | 201 --------------- flink/v2.0/flink-runtime/LICENSE | 406 +++--------------------------- flink/v2.0/flink-runtime/NOTICE | 122 --------- 4 files changed, 78 insertions(+), 1057 deletions(-) diff --git a/flink/v1.20/flink-runtime/LICENSE b/flink/v1.20/flink-runtime/LICENSE index e8c4c4a0bdf7..11460c3307c8 100644 --- a/flink/v1.20/flink-runtime/LICENSE +++ b/flink/v1.20/flink-runtime/LICENSE @@ -227,7 +227,7 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Apache Thrift. +This product bundles Apache Thrift (bundled by Parquet). Copyright: 2006-2010 The Apache Software Foundation. Project URL: https://thrift.apache.org/ @@ -235,55 +235,57 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Fastutil. +This product includes code from Daniel Lemire's JavaFastPFOR project (bundled by Parquet). -Copyright: 2002-2014 Sebastiano Vigna -Project URL: http://fastutil.di.unimi.it/ +Copyright: 2013 Daniel Lemire +Project URL: https://github.com/lemire/JavaFastPFOR License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 -------------------------------------------------------------------------------- -This product bundles Apache ORC. +This product bundles Fastutil (bundled by Parquet). -Copyright: 2013-2020 The Apache Software Foundation. 
-Project URL: https://orc.apache.org/ +Copyright: 2002-2014 Sebastiano Vigna +Project URL: http://fastutil.di.unimi.it/ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 -------------------------------------------------------------------------------- -This product bundles Apache Datasketches. +This product bundles Zero-Allocation Hashing (bundled by Parquet). -Project URL: https://datasketches.apache.org/ +Project URL: https://github.com/OpenHFT/Zero-Allocation-Hashing License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 -------------------------------------------------------------------------------- -This product bundles Apache Hive. +This product bundles Apache ORC. Copyright: 2013-2020 The Apache Software Foundation. -Project URL: https://hive.apache.org/ +Project URL: https://orc.apache.org/ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 -------------------------------------------------------------------------------- -This product bundles Airlift Aircompressor. +This product bundles Apache Hive's Storage API (bundled by ORC). -Copyright: 2011-2020 Aircompressor authors. -Project URL: https://github.com/airlift/aircompressor +Copyright: 2008-2020 The Apache Software Foundation. +Project URL: https://hive.apache.org/ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 -------------------------------------------------------------------------------- -This product bundles Google GAX. +This product bundles Google protobuf (bundled by ORC). -Project URL: https://github.com/googleapis/gax-java +Copyright: 2008 Google Inc. +Project URL: https://developers.google.com/protocol-buffers License: BSD 3-Clause -| Copyright 2016, Google Inc. All rights reserved. -| + +| Copyright 2008 Google Inc. All rights reserved. 
+| | Redistribution and use in source and binary forms, with or without | modification, are permitted provided that the following conditions are | met: -| +| | * Redistributions of source code must retain the above copyright | notice, this list of conditions and the following disclaimer. | * Redistributions in binary form must reproduce the above @@ -293,7 +295,7 @@ License: BSD 3-Clause | * Neither the name of Google Inc. nor the names of its | contributors may be used to endorse or promote products derived from | this software without specific prior written permission. -| +| | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -305,40 +307,26 @@ License: BSD 3-Clause | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +| +| Code generated by the Protocol Buffer compiler is owned by the owner +| of the input file used when generating it. This code is not +| standalone and requires a support library to be linked with it. This +| support library is itself covered by the above license. -------------------------------------------------------------------------------- -This product bundles Google Auth Library. +This product bundles Apache Datasketches. -License: BSD 3-Clause -| Copyright 2014, Google Inc. All rights reserved. -| -| Redistribution and use in source and binary forms, with or without -| modification, are permitted provided that the following conditions are -| met: -| -| * Redistributions of source code must retain the above copyright -| notice, this list of conditions and the following disclaimer. 
-| * Redistributions in binary form must reproduce the above -| copyright notice, this list of conditions and the following disclaimer -| in the documentation and/or other materials provided with the -| distribution. -| -| * Neither the name of Google Inc. nor the names of its -| contributors may be used to endorse or promote products derived from -| this software without specific prior written permission. -| -| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -| OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -| SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -| LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -| DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -| THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -| (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +Project URL: https://datasketches.apache.org/ +License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +This product bundles Airlift Aircompressor. + +Copyright: 2011-2020 Aircompressor authors. +Project URL: https://github.com/airlift/aircompressor +License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 -------------------------------------------------------------------------------- @@ -401,82 +389,6 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Google protobuf. - -Copyright: 2008 Google Inc. 
-Project URL: https://developers.google.com/protocol-buffers -License: BSD 3-Clause -| Copyright 2008 Google Inc. All rights reserved. -| -| Redistribution and use in source and binary forms, with or without -| modification, are permitted provided that the following conditions are -| met: -| -| * Redistributions of source code must retain the above copyright -| notice, this list of conditions and the following disclaimer. -| * Redistributions in binary form must reproduce the above -| copyright notice, this list of conditions and the following disclaimer -| in the documentation and/or other materials provided with the -| distribution. -| * Neither the name of Google Inc. nor the names of its -| contributors may be used to endorse or promote products derived from -| this software without specific prior written permission. -| -| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -| OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -| SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -| LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -| DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -| THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -| (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -| -| Code generated by the Protocol Buffer compiler is owned by the owner -| of the input file used when generating it. This code is not -| standalone and requires a support library to be linked with it. This -| support library is itself covered by the above license. 
- --------------------------------------------------------------------------------- - -This product bundles ThreeTen BP. - -Project URL: https://www.threeten.org/threetenbp -License: BSD 3-Clause -| Copyright (c) 2007-present, Stephen Colebourne & Michael Nascimento Santos. -| -| All rights reserved. -| -| Redistribution and use in source and binary forms, with or without -| modification, are permitted provided that the following conditions are met: -| -| * Redistributions of source code must retain the above copyright notice, -| this list of conditions and the following disclaimer. -| -| * Redistributions in binary form must reproduce the above copyright notice, -| this list of conditions and the following disclaimer in the documentation -| and/or other materials provided with the distribution. -| -| * Neither the name of JSR-310 nor the names of its contributors -| may be used to endorse or promote products derived from this software -| without specific prior written permission. -| -| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -| CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -| EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -| PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -| PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -| LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -| NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -| SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - --------------------------------------------------------------------------------- - This product bundles ThreeTen Extra. Copyright: 2007-present, Stephen Colebourne & Michael Nascimento Santos. 
@@ -530,15 +442,7 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Apache HttpComponents (core and client). - -Copyright: 1999-2022 The Apache Software Foundation. -Project URL: https://hc.apache.org/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product includes code from Apache HttpComponents Client. +This product bundles and includes code from Apache HttpComponents (core/client). * retry and error handling logic in ExponentialHttpRequestRetryStrategy.java @@ -622,143 +526,6 @@ License: BSD 2-Clause -------------------------------------------------------------------------------- -This product bundles Google Cloud BigQuery Client for Java. - -Project URL: https://github.com/googleapis/java-bigquery -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Alibaba Cloud Credentials for Java. - -Project URL: https://github.com/aliyun/credentials-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Alibaba Cloud Tea for Java. - -Project URL: https://github.com/aliyun/tea-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Google Gson. - -Project URL: https://github.com/google/gson/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Google API Common. 
- -License: BSD 3-Clause -| Copyright 2016, Google Inc. -| Redistribution and use in source and binary forms, with or without -| modification, are permitted provided that the following conditions are -| met: -| -| * Redistributions of source code must retain the above copyright -| notice, this list of conditions and the following disclaimer. -| * Redistributions in binary form must reproduce the above -| copyright notice, this list of conditions and the following disclaimer -| in the documentation and/or other materials provided with the -| distribution. -| * Neither the name of Google Inc. nor the names of its -| contributors may be used to endorse or promote products derived from -| this software without specific prior written permission. -| -| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -| OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -| SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -| LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -| DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -| THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -| (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - --------------------------------------------------------------------------------- - -This product bundles Google Http Client. - -Project URL: https://www.google.com/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles OpenCensus. 
- -Project URL: https://github.com/census-instrumentation/opencensus-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles JaCoCo. - -Project URL: https://github.com/jacoco/jacoco/ -License: Eclipse Public License v. 2.0 - https://www.eclipse.org/org/documents/epl-2.0/EPL-2.0.txt - --------------------------------------------------------------------------------- - -This product bundles JAXB. - -Project URL: http://jaxb.java.net/ -License: CDDL 1.1 - https://glassfish.java.net/public/CDDL+GPL_1_1.html - --------------------------------------------------------------------------------- - -This product bundles Okio. - -Project URL: https://github.com/square/okio -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles OkHttp. - -Project URL: https://github.com/square/okhttp -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Google Auto Value. - -Project URL: https://github.com/google/auto/tree/master/value -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles gRPC. - -Project URL: https://github.com/grpc/grpc-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Netty. 
- -Project URL: https://netty.io/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Google APIs. - -Project URL: https://github.com/googleapis/googleapis -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Google Cloud APIs for Java. - -Project URL: https://github.com/googleapis/google-cloud-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - This product bundles JTS Topology Suite. Project URL: https://github.com/locationtech/jts @@ -766,98 +533,3 @@ License: Eclipse Public License v. 2.0 - https://www.eclipse.org/org/documents/e -------------------------------------------------------------------------------- -This product bundles javax.annotation-api. - -Project URL: https://javaee.github.io/glassfish -Project URL: http://jcp.org/en/jsr/detail?id=250 -License: CDDL - https://github.com/javaee/javax.annotation/blob/master/LICENSE - --------------------------------------------------------------------------------- - -This product bundles JSpecify. - -Project URL: https://github.com/jspecify/jspecify -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Animal Sniffer Annotations. - -License: MIT -| The MIT License -| -| Copyright (c) 2009 codehaus.org. 
-| -| Permission is hereby granted, free of charge, to any person obtaining a copy -| of this software and associated documentation files (the "Software"), to deal -| in the Software without restriction, including without limitation the rights -| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -| copies of the Software, and to permit persons to whom the Software is -| furnished to do so, subject to the following conditions: -| -| The above copyright notice and this permission notice shall be included in -| all copies or substantial portions of the Software. -| -| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -| THE SOFTWARE. - --------------------------------------------------------------------------------- - -This product bundles Android Annotations. - -Project URL: http://source.android.com/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Conscrypt (openjdk-uber). - -Project URL: https://conscrypt.org/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Perfmark. - -Project URL: https://github.com/perfmark/perfmark -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles org.json. 
- -Project URL: https://github.com/douglascrockford/JSON-java -License: Public Domain - https://github.com/stleary/JSON-java/blob/master/LICENSE - --------------------------------------------------------------------------------- - -This product bundles Apache Arrow. - -Project URL: https://github.com/apache/arrow -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Google flatbuffers. - -Project URL: https://github.com/google/flatbuffers -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles OpenTelemetry. - -Project URL: https://opentelemetry.io/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Kotlin. - -Project URL: https://github.com/JetBrains/kotlin -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 diff --git a/flink/v1.20/flink-runtime/NOTICE b/flink/v1.20/flink-runtime/NOTICE index 7603ddaedd9f..61b02129d0e1 100644 --- a/flink/v1.20/flink-runtime/NOTICE +++ b/flink/v1.20/flink-runtime/NOTICE @@ -356,128 +356,6 @@ This product bundles Eclipse Microprofile OpenAPI with the following in its NOTI -------------------------------------------------------------------------------- -This product bundles gRPC with the following in its NOTICE file: -| Copyright 2014 The gRPC Authors -| -| Licensed under the Apache License, Version 2.0 (the "License"); -| you may not use this file except in compliance with the License. 
-| You may obtain a copy of the License at -| -| http://www.apache.org/licenses/LICENSE-2.0 -| -| Unless required by applicable law or agreed to in writing, software -| distributed under the License is distributed on an "AS IS" BASIS, -| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -| See the License for the specific language governing permissions and -| limitations under the License. -| -| ----------------------------------------------------------------------- -| -| This product contains a modified portion of 'OkHttp', an open source -| HTTP & SPDY client for Android and Java applications, which can be obtained -| at: -| -| * LICENSE: -| * okhttp/third_party/okhttp/LICENSE (Apache License 2.0) -| * HOMEPAGE: -| * https://github.com/square/okhttp -| * LOCATION_IN_GRPC: -| * okhttp/third_party/okhttp -| -| This product contains a modified portion of 'Envoy', an open source -| cloud-native high-performance edge/middle/service proxy, which can be -| obtained at: -| -| * LICENSE: -| * xds/third_party/envoy/LICENSE (Apache License 2.0) -| * NOTICE: -| * xds/third_party/envoy/NOTICE -| * HOMEPAGE: -| * https://www.envoyproxy.io -| * LOCATION_IN_GRPC: -| * xds/third_party/envoy -| -| This product contains a modified portion of 'protoc-gen-validate (PGV)', -| an open source protoc plugin to generate polyglot message validators, -| which can be obtained at: -| -| * LICENSE: -| * xds/third_party/protoc-gen-validate/LICENSE (Apache License 2.0) -| * NOTICE: -| * xds/third_party/protoc-gen-validate/NOTICE -| * HOMEPAGE: -| * https://github.com/envoyproxy/protoc-gen-validate -| * LOCATION_IN_GRPC: -| * xds/third_party/protoc-gen-validate -| -| This product contains a modified portion of 'udpa', -| an open source universal data plane API, which can be obtained at: -| -| * LICENSE: -| * xds/third_party/udpa/LICENSE (Apache License 2.0) -| * HOMEPAGE: -| * https://github.com/cncf/udpa -| * LOCATION_IN_GRPC: -| * xds/third_party/udpa - 
--------------------------------------------------------------------------------- - -This product bundles Netty with the following in its NOTICE file: -| -| The Netty Project -| ================= -| -| Please visit the Netty web site for more information: -| -| * http://netty.io/ -| -| Copyright 2016 The Netty Project -| -| The Netty Project licenses this file to you under the Apache License, -| version 2.0 (the "License"); you may not use this file except in compliance -| with the License. You may obtain a copy of the License at: -| -| http://www.apache.org/licenses/LICENSE-2.0 -| -| Unless required by applicable law or agreed to in writing, software -| distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -| WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -| License for the specific language governing permissions and limitations -| under the License. -| -| ------------------------------------------------------------------------------- -| This product contains a forked and modified version of Tomcat Native -| -| * LICENSE: -| * license/LICENSE.tomcat-native.txt (Apache License 2.0) -| * HOMEPAGE: -| * http://tomcat.apache.org/native-doc/ -| * https://svn.apache.org/repos/asf/tomcat/native/ -| -| This product contains the Maven wrapper scripts from 'Maven Wrapper', that provides an easy way to ensure a user has everything necessary to run the Maven build. -| -| * LICENSE: -| * license/LICENSE.mvn-wrapper.txt (Apache License 2.0) -| * HOMEPAGE: -| * https://github.com/takari/maven-wrapper -| -| This product contains small piece of code to support AIX, taken from netbsd. -| -| * LICENSE: -| * license/LICENSE.aix-netbsd.txt (OpenSSL License) -| * HOMEPAGE: -| * https://ftp.netbsd.org/pub/NetBSD/NetBSD-current/src/crypto/external/bsd/openssl/dist -| -| -| This product contains code from boringssl. 
-| -| * LICENSE (Combination ISC and OpenSSL license) -| * license/LICENSE.boringssl.txt (Combination ISC and OpenSSL license) -| * HOMEPAGE: -| * https://boringssl.googlesource.com/boringssl/ - --------------------------------------------------------------------------------- - This product bundles Jackson JSON Processor with the following in its NOTICE file: | # Jackson JSON processor | @@ -511,82 +389,3 @@ This product bundles Jackson JSON Processor with the following in its NOTICE fil | | See FastDoubleParser-NOTICE for details of other source code included in FastDoubleParser | and the licenses and copyrights that apply to that code. - --------------------------------------------------------------------------------- - -This product bundles Perfmark with the following in its NOTICE file: -| -| Copyright 2019 Google LLC -| -| Licensed under the Apache License, Version 2.0 (the "License"); -| you may not use this file except in compliance with the License. -| You may obtain a copy of the License at -| -| http://www.apache.org/licenses/LICENSE-2.0 -| -| Unless required by applicable law or agreed to in writing, software -| distributed under the License is distributed on an "AS IS" BASIS, -| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -| See the License for the specific language governing permissions and -| limitations under the License. 
-| -| ----------------------------------------------------------------------- -| -| This product contains a modified portion of 'Catapult', an open source -| Trace Event viewer for Chome, Linux, and Android applications, which can -| be obtained at: -| -| * LICENSE: -| * traceviewer/src/main/resources/io/perfmark/traceviewer/third_party/catapult/LICENSE (New BSD License) -| * HOMEPAGE: -| * https://github.com/catapult-project/catapult -| -| This product contains a modified portion of 'Polymer', a library for Web -| Components, which can be obtained at: -| * LICENSE: -| * traceviewer/src/main/resources/io/perfmark/traceviewer/third_party/polymer/LICENSE (New BSD License) -| * HOMEPAGE: -| * https://github.com/Polymer/polymer -| -| -| This product contains a modified portion of 'ASM', an open source -| Java Bytecode library, which can be obtained at: -| -| * LICENSE: -| * agent/src/main/resources/io/perfmark/agent/third_party/asm/LICENSE (BSD style License) -| * HOMEPAGE: -| * https://asm.ow2.io/ - --------------------------------------------------------------------------------- - -This product bundles Conscrypt (uber-jar) with the following in its NOTICE: -| Copyright 2016 The Android Open Source Project -| -| Licensed under the Apache License, Version 2.0 (the "License"); -| you may not use this file except in compliance with the License. -| You may obtain a copy of the License at -| -| http://www.apache.org/licenses/LICENSE-2.0 -| -| Unless required by applicable law or agreed to in writing, software -| distributed under the License is distributed on an "AS IS" BASIS, -| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -| See the License for the specific language governing permissions and -| limitations under the License. 
-| -| ----------------------------------------------------------------------- -| This product contains a modified portion of `Netty`, a configurable network -| stack in Java, which can be obtained at: -| -| * LICENSE: -| * licenses/LICENSE.netty.txt (Apache License 2.0) -| * HOMEPAGE: -| * http://netty.io/ -| -| This product contains a modified portion of `Apache Harmony`, modular Java runtime, -| which can be obtained at: -| -| * LICENSE: -| * licenses/LICENSE.harmony.txt (Apache License 2.0) -| * HOMEPAGE: -| * https://harmony.apache.org/ diff --git a/flink/v2.0/flink-runtime/LICENSE b/flink/v2.0/flink-runtime/LICENSE index e8c4c4a0bdf7..11460c3307c8 100644 --- a/flink/v2.0/flink-runtime/LICENSE +++ b/flink/v2.0/flink-runtime/LICENSE @@ -227,7 +227,7 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Apache Thrift. +This product bundles Apache Thrift (bundled by Parquet). Copyright: 2006-2010 The Apache Software Foundation. Project URL: https://thrift.apache.org/ @@ -235,55 +235,57 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Fastutil. +This product includes code from Daniel Lemire's JavaFastPFOR project (bundled by Parquet). -Copyright: 2002-2014 Sebastiano Vigna -Project URL: http://fastutil.di.unimi.it/ +Copyright: 2013 Daniel Lemire +Project URL: https://github.com/lemire/JavaFastPFOR License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 -------------------------------------------------------------------------------- -This product bundles Apache ORC. +This product bundles Fastutil (bundled by Parquet). -Copyright: 2013-2020 The Apache Software Foundation. 
-Project URL: https://orc.apache.org/ +Copyright: 2002-2014 Sebastiano Vigna +Project URL: http://fastutil.di.unimi.it/ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 -------------------------------------------------------------------------------- -This product bundles Apache Datasketches. +This product bundles Zero-Allocation Hashing (bundled by Parquet). -Project URL: https://datasketches.apache.org/ +Project URL: https://github.com/OpenHFT/Zero-Allocation-Hashing License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 -------------------------------------------------------------------------------- -This product bundles Apache Hive. +This product bundles Apache ORC. Copyright: 2013-2020 The Apache Software Foundation. -Project URL: https://hive.apache.org/ +Project URL: https://orc.apache.org/ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 -------------------------------------------------------------------------------- -This product bundles Airlift Aircompressor. +This product bundles Apache Hive's Storage API (bundled by ORC). -Copyright: 2011-2020 Aircompressor authors. -Project URL: https://github.com/airlift/aircompressor +Copyright: 2008-2020 The Apache Software Foundation. +Project URL: https://hive.apache.org/ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 -------------------------------------------------------------------------------- -This product bundles Google GAX. +This product bundles Google protobuf (bundled by ORC). -Project URL: https://github.com/googleapis/gax-java +Copyright: 2008 Google Inc. +Project URL: https://developers.google.com/protocol-buffers License: BSD 3-Clause -| Copyright 2016, Google Inc. All rights reserved. -| + +| Copyright 2008 Google Inc. All rights reserved. 
+| | Redistribution and use in source and binary forms, with or without | modification, are permitted provided that the following conditions are | met: -| +| | * Redistributions of source code must retain the above copyright | notice, this list of conditions and the following disclaimer. | * Redistributions in binary form must reproduce the above @@ -293,7 +295,7 @@ License: BSD 3-Clause | * Neither the name of Google Inc. nor the names of its | contributors may be used to endorse or promote products derived from | this software without specific prior written permission. -| +| | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -305,40 +307,26 @@ License: BSD 3-Clause | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +| +| Code generated by the Protocol Buffer compiler is owned by the owner +| of the input file used when generating it. This code is not +| standalone and requires a support library to be linked with it. This +| support library is itself covered by the above license. -------------------------------------------------------------------------------- -This product bundles Google Auth Library. +This product bundles Apache Datasketches. -License: BSD 3-Clause -| Copyright 2014, Google Inc. All rights reserved. -| -| Redistribution and use in source and binary forms, with or without -| modification, are permitted provided that the following conditions are -| met: -| -| * Redistributions of source code must retain the above copyright -| notice, this list of conditions and the following disclaimer. 
-| * Redistributions in binary form must reproduce the above -| copyright notice, this list of conditions and the following disclaimer -| in the documentation and/or other materials provided with the -| distribution. -| -| * Neither the name of Google Inc. nor the names of its -| contributors may be used to endorse or promote products derived from -| this software without specific prior written permission. -| -| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -| OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -| SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -| LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -| DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -| THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -| (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +Project URL: https://datasketches.apache.org/ +License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +This product bundles Airlift Aircompressor. + +Copyright: 2011-2020 Aircompressor authors. +Project URL: https://github.com/airlift/aircompressor +License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 -------------------------------------------------------------------------------- @@ -401,82 +389,6 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Google protobuf. - -Copyright: 2008 Google Inc. 
-Project URL: https://developers.google.com/protocol-buffers -License: BSD 3-Clause -| Copyright 2008 Google Inc. All rights reserved. -| -| Redistribution and use in source and binary forms, with or without -| modification, are permitted provided that the following conditions are -| met: -| -| * Redistributions of source code must retain the above copyright -| notice, this list of conditions and the following disclaimer. -| * Redistributions in binary form must reproduce the above -| copyright notice, this list of conditions and the following disclaimer -| in the documentation and/or other materials provided with the -| distribution. -| * Neither the name of Google Inc. nor the names of its -| contributors may be used to endorse or promote products derived from -| this software without specific prior written permission. -| -| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -| OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -| SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -| LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -| DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -| THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -| (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -| -| Code generated by the Protocol Buffer compiler is owned by the owner -| of the input file used when generating it. This code is not -| standalone and requires a support library to be linked with it. This -| support library is itself covered by the above license. 
- --------------------------------------------------------------------------------- - -This product bundles ThreeTen BP. - -Project URL: https://www.threeten.org/threetenbp -License: BSD 3-Clause -| Copyright (c) 2007-present, Stephen Colebourne & Michael Nascimento Santos. -| -| All rights reserved. -| -| Redistribution and use in source and binary forms, with or without -| modification, are permitted provided that the following conditions are met: -| -| * Redistributions of source code must retain the above copyright notice, -| this list of conditions and the following disclaimer. -| -| * Redistributions in binary form must reproduce the above copyright notice, -| this list of conditions and the following disclaimer in the documentation -| and/or other materials provided with the distribution. -| -| * Neither the name of JSR-310 nor the names of its contributors -| may be used to endorse or promote products derived from this software -| without specific prior written permission. -| -| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -| CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -| EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -| PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -| PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -| LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -| NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -| SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - --------------------------------------------------------------------------------- - This product bundles ThreeTen Extra. Copyright: 2007-present, Stephen Colebourne & Michael Nascimento Santos. 
@@ -530,15 +442,7 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Apache HttpComponents (core and client). - -Copyright: 1999-2022 The Apache Software Foundation. -Project URL: https://hc.apache.org/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product includes code from Apache HttpComponents Client. +This product bundles and includes code from Apache HttpComponents (core/client). * retry and error handling logic in ExponentialHttpRequestRetryStrategy.java @@ -622,143 +526,6 @@ License: BSD 2-Clause -------------------------------------------------------------------------------- -This product bundles Google Cloud BigQuery Client for Java. - -Project URL: https://github.com/googleapis/java-bigquery -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Alibaba Cloud Credentials for Java. - -Project URL: https://github.com/aliyun/credentials-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Alibaba Cloud Tea for Java. - -Project URL: https://github.com/aliyun/tea-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Google Gson. - -Project URL: https://github.com/google/gson/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Google API Common. 
- -License: BSD 3-Clause -| Copyright 2016, Google Inc. -| Redistribution and use in source and binary forms, with or without -| modification, are permitted provided that the following conditions are -| met: -| -| * Redistributions of source code must retain the above copyright -| notice, this list of conditions and the following disclaimer. -| * Redistributions in binary form must reproduce the above -| copyright notice, this list of conditions and the following disclaimer -| in the documentation and/or other materials provided with the -| distribution. -| * Neither the name of Google Inc. nor the names of its -| contributors may be used to endorse or promote products derived from -| this software without specific prior written permission. -| -| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -| OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -| SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -| LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -| DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -| THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -| (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - --------------------------------------------------------------------------------- - -This product bundles Google Http Client. - -Project URL: https://www.google.com/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles OpenCensus. 
- -Project URL: https://github.com/census-instrumentation/opencensus-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles JaCoCo. - -Project URL: https://github.com/jacoco/jacoco/ -License: Eclipse Public License v. 2.0 - https://www.eclipse.org/org/documents/epl-2.0/EPL-2.0.txt - --------------------------------------------------------------------------------- - -This product bundles JAXB. - -Project URL: http://jaxb.java.net/ -License: CDDL 1.1 - https://glassfish.java.net/public/CDDL+GPL_1_1.html - --------------------------------------------------------------------------------- - -This product bundles Okio. - -Project URL: https://github.com/square/okio -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles OkHttp. - -Project URL: https://github.com/square/okhttp -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Google Auto Value. - -Project URL: https://github.com/google/auto/tree/master/value -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles gRPC. - -Project URL: https://github.com/grpc/grpc-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Netty. 
- -Project URL: https://netty.io/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Google APIs. - -Project URL: https://github.com/googleapis/googleapis -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Google Cloud APIs for Java. - -Project URL: https://github.com/googleapis/google-cloud-java -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - This product bundles JTS Topology Suite. Project URL: https://github.com/locationtech/jts @@ -766,98 +533,3 @@ License: Eclipse Public License v. 2.0 - https://www.eclipse.org/org/documents/e -------------------------------------------------------------------------------- -This product bundles javax.annotation-api. - -Project URL: https://javaee.github.io/glassfish -Project URL: http://jcp.org/en/jsr/detail?id=250 -License: CDDL - https://github.com/javaee/javax.annotation/blob/master/LICENSE - --------------------------------------------------------------------------------- - -This product bundles JSpecify. - -Project URL: https://github.com/jspecify/jspecify -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Animal Sniffer Annotations. - -License: MIT -| The MIT License -| -| Copyright (c) 2009 codehaus.org. 
-| -| Permission is hereby granted, free of charge, to any person obtaining a copy -| of this software and associated documentation files (the "Software"), to deal -| in the Software without restriction, including without limitation the rights -| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -| copies of the Software, and to permit persons to whom the Software is -| furnished to do so, subject to the following conditions: -| -| The above copyright notice and this permission notice shall be included in -| all copies or substantial portions of the Software. -| -| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -| THE SOFTWARE. - --------------------------------------------------------------------------------- - -This product bundles Android Annotations. - -Project URL: http://source.android.com/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Conscrypt (openjdk-uber). - -Project URL: https://conscrypt.org/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Perfmark. - -Project URL: https://github.com/perfmark/perfmark -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles org.json. 
- -Project URL: https://github.com/douglascrockford/JSON-java -License: Public Domain - https://github.com/stleary/JSON-java/blob/master/LICENSE - --------------------------------------------------------------------------------- - -This product bundles Apache Arrow. - -Project URL: https://github.com/apache/arrow -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Google flatbuffers. - -Project URL: https://github.com/google/flatbuffers -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles OpenTelemetry. - -Project URL: https://opentelemetry.io/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Kotlin. - -Project URL: https://github.com/JetBrains/kotlin -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 diff --git a/flink/v2.0/flink-runtime/NOTICE b/flink/v2.0/flink-runtime/NOTICE index 72916788b5e4..61b02129d0e1 100644 --- a/flink/v2.0/flink-runtime/NOTICE +++ b/flink/v2.0/flink-runtime/NOTICE @@ -356,128 +356,6 @@ This product bundles Eclipse Microprofile OpenAPI with the following in its NOTI -------------------------------------------------------------------------------- -This product bundles gRPC with the following in its NOTICE file: -| Copyright 2014 The gRPC Authors -| -| Licensed under the Apache License, Version 2.0 (the "License"); -| you may not use this file except in compliance with the License. 
-| You may obtain a copy of the License at -| -| http://www.apache.org/licenses/LICENSE-2.0 -| -| Unless required by applicable law or agreed to in writing, software -| distributed under the License is distributed on an "AS IS" BASIS, -| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -| See the License for the specific language governing permissions and -| limitations under the License. -| -| ----------------------------------------------------------------------- -| -| This product contains a modified portion of 'OkHttp', an open source -| HTTP & SPDY client for Android and Java applications, which can be obtained -| at: -| -| * LICENSE: -| * okhttp/third_party/okhttp/LICENSE (Apache License 2.0) -| * HOMEPAGE: -| * https://github.com/square/okhttp -| * LOCATION_IN_GRPC: -| * okhttp/third_party/okhttp -| -| This product contains a modified portion of 'Envoy', an open source -| cloud-native high-performance edge/middle/service proxy, which can be -| obtained at: -| -| * LICENSE: -| * xds/third_party/envoy/LICENSE (Apache License 2.0) -| * NOTICE: -| * xds/third_party/envoy/NOTICE -| * HOMEPAGE: -| * https://www.envoyproxy.io -| * LOCATION_IN_GRPC: -| * xds/third_party/envoy -| -| This product contains a modified portion of 'protoc-gen-validate (PGV)', -| an open source protoc plugin to generate polyglot message validators, -| which can be obtained at: -| -| * LICENSE: -| * xds/third_party/protoc-gen-validate/LICENSE (Apache License 2.0) -| * NOTICE: -| * xds/third_party/protoc-gen-validate/NOTICE -| * HOMEPAGE: -| * https://github.com/envoyproxy/protoc-gen-validate -| * LOCATION_IN_GRPC: -| * xds/third_party/protoc-gen-validate -| -| This product contains a modified portion of 'udpa', -| an open source universal data plane API, which can be obtained at: -| -| * LICENSE: -| * xds/third_party/udpa/LICENSE (Apache License 2.0) -| * HOMEPAGE: -| * https://github.com/cncf/udpa -| * LOCATION_IN_GRPC: -| * xds/third_party/udpa - 
--------------------------------------------------------------------------------- - -This product bundles Netty with the following in its NOTICE file: -| -| The Netty Project -| ================= -| -| Please visit the Netty web site for more information: -| -| * http://netty.io/ -| -| Copyright 2016 The Netty Project -| -| The Netty Project licenses this file to you under the Apache License, -| version 2.0 (the "License"); you may not use this file except in compliance -| with the License. You may obtain a copy of the License at: -| -| http://www.apache.org/licenses/LICENSE-2.0 -| -| Unless required by applicable law or agreed to in writing, software -| distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -| WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -| License for the specific language governing permissions and limitations -| under the License. -| -| ------------------------------------------------------------------------------- -| This product contains a forked and modified version of Tomcat Native -| -| * LICENSE: -| * license/LICENSE.tomcat-native.txt (Apache License 2.0) -| * HOMEPAGE: -| * http://tomcat.apache.org/native-doc/ -| * https://svn.apache.org/repos/asf/tomcat/native/ -| -| This product contains the Maven wrapper scripts from 'Maven Wrapper', that provides an easy way to ensure a user has everything necessary to run the Maven build. -| -| * LICENSE: -| * license/LICENSE.mvn-wrapper.txt (Apache License 2.0) -| * HOMEPAGE: -| * https://github.com/takari/maven-wrapper -| -| This product contains small piece of code to support AIX, taken from netbsd. -| -| * LICENSE: -| * license/LICENSE.aix-netbsd.txt (OpenSSL License) -| * HOMEPAGE: -| * https://ftp.netbsd.org/pub/NetBSD/NetBSD-current/src/crypto/external/bsd/openssl/dist -| -| -| This product contains code from boringssl. 
-| -| * LICENSE (Combination ISC and OpenSSL license) -| * license/LICENSE.boringssl.txt (Combination ISC and OpenSSL license) -| * HOMEPAGE: -| * https://boringssl.googlesource.com/boringssl/ - --------------------------------------------------------------------------------- - This product bundles Jackson JSON Processor with the following in its NOTICE file: | # Jackson JSON processor | From 3dd5c146748e0f0eaee627457ed51d97759ee68c Mon Sep 17 00:00:00 2001 From: Talat UYARER Date: Thu, 30 Apr 2026 09:01:28 -0700 Subject: [PATCH 135/197] Flink: Add Nanosecond Precision Support for Flink-Iceberg Integration (#15475) --- LICENSE | 4 + flink/v2.1/build.gradle | 1 + .../apache/iceberg/flink/FlinkTypeToType.java | 6 + .../apache/iceberg/flink/RowDataWrapper.java | 36 +- .../iceberg/flink/data/FlinkOrcReader.java | 7 + .../iceberg/flink/data/FlinkOrcWriter.java | 7 + .../iceberg/flink/data/FlinkOrcWriters.java | 37 ++ .../iceberg/flink/data/RowDataUtil.java | 2 + .../iceberg/flink/data/StructRowData.java | 52 +- .../formats/avro/AvroToRowDataConverters.java | 303 +++++++++ .../flink/formats/avro/JodaConverter.java | 69 ++ .../formats/avro/RowDataToAvroConverters.java | 394 +++++++++++ .../avro/typeutils/AvroSchemaConverter.java | 625 ++++++++++++++++++ .../AvroGenericRecordToRowDataMapper.java | 4 +- .../RowDataToAvroGenericRecordConverter.java | 4 +- .../reader/AvroGenericRecordConverter.java | 4 +- .../apache/iceberg/flink/DataGenerators.java | 105 ++- .../iceberg/flink/TestRowDataWrapper.java | 13 - .../flink/data/TestFlinkOrcReaderWriter.java | 5 + .../flink/data/TestRowDataProjection.java | 21 +- 20 files changed, 1646 insertions(+), 53 deletions(-) create mode 100644 flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/formats/avro/AvroToRowDataConverters.java create mode 100644 flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/formats/avro/JodaConverter.java create mode 100644 
flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/formats/avro/RowDataToAvroConverters.java create mode 100644 flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/formats/avro/typeutils/AvroSchemaConverter.java diff --git a/LICENSE b/LICENSE index 0f907148aa13..573a126294a6 100644 --- a/LICENSE +++ b/LICENSE @@ -336,6 +336,10 @@ This product includes code from Apache Flink. * Parameter provider annotation for parameterized tests in Parameters.java * Parameter field annotation for parameterized tests in Parameter.java * Primary key validation logic in FlinkSchemaUtil.java +* Avro to RowData conversion logic in AvroToRowDataConverters.java +* RowData to Avro conversion logic in RowDataToAvroConverters.java +* Avro schema conversion logic in AvroSchemaConverter.java +* Joda optional dependency encapsulation in JodaConverter.java Copyright: 1999-2022 The Apache Software Foundation. Home page: https://flink.apache.org/ diff --git a/flink/v2.1/build.gradle b/flink/v2.1/build.gradle index 6dc373e6b566..9eb09cf021f9 100644 --- a/flink/v2.1/build.gradle +++ b/flink/v2.1/build.gradle @@ -33,6 +33,7 @@ project(":iceberg-flink:iceberg-flink-${flinkMajorVersion}") { implementation project(':iceberg-hive-metastore') compileOnly libs.flink21.avro + compileOnly 'joda-time:joda-time:2.8.1' // dropwizard histogram metrics (optional in Flink) compileOnly libs.flink21.metrics.dropwizard compileOnly libs.flink21.streaming.java diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java index 377811247233..bfcd34d0b999 100644 --- a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java @@ -138,11 +138,17 @@ public Type visit(TimeType timeType) { @Override public Type visit(TimestampType timestampType) { + if (timestampType.getPrecision() > 6) { + return 
Types.TimestampNanoType.withoutZone(); + } return Types.TimestampType.withoutZone(); } @Override public Type visit(LocalZonedTimestampType localZonedTimestampType) { + if (localZonedTimestampType.getPrecision() > 6) { + return Types.TimestampNanoType.withZone(); + } return Types.TimestampType.withZone(); } diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/RowDataWrapper.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/RowDataWrapper.java index 3ef611f2ded5..920e44b24b31 100644 --- a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/RowDataWrapper.java +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/RowDataWrapper.java @@ -114,19 +114,35 @@ private static PositionalGetter buildGetter(LogicalType logicalType, Type typ case TIMESTAMP_WITHOUT_TIME_ZONE: TimestampType timestampType = (TimestampType) logicalType; - return (row, pos) -> { - LocalDateTime localDateTime = - row.getTimestamp(pos, timestampType.getPrecision()).toLocalDateTime(); - return DateTimeUtil.microsFromTimestamp(localDateTime); - }; + if (type.typeId() == Type.TypeID.TIMESTAMP_NANO) { + return (row, pos) -> { + LocalDateTime localDateTime = + row.getTimestamp(pos, timestampType.getPrecision()).toLocalDateTime(); + return DateTimeUtil.nanosFromTimestamp(localDateTime); + }; + } else { + return (row, pos) -> { + LocalDateTime localDateTime = + row.getTimestamp(pos, timestampType.getPrecision()).toLocalDateTime(); + return DateTimeUtil.microsFromTimestamp(localDateTime); + }; + } case TIMESTAMP_WITH_LOCAL_TIME_ZONE: LocalZonedTimestampType lzTs = (LocalZonedTimestampType) logicalType; - return (row, pos) -> { - TimestampData timestampData = row.getTimestamp(pos, lzTs.getPrecision()); - return timestampData.getMillisecond() * 1000 - + timestampData.getNanoOfMillisecond() / 1000; - }; + if (type.typeId() == Type.TypeID.TIMESTAMP_NANO) { + return (row, pos) -> { + TimestampData timestampData = row.getTimestamp(pos, lzTs.getPrecision()); + return 
timestampData.getMillisecond() * 1_000_000L + + timestampData.getNanoOfMillisecond(); + }; + } else { + return (row, pos) -> { + TimestampData timestampData = row.getTimestamp(pos, lzTs.getPrecision()); + return timestampData.getMillisecond() * 1000L + + timestampData.getNanoOfMillisecond() / 1000; + }; + } case ROW: RowType rowType = (RowType) logicalType; diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java index 65b9d44ad4b8..3e3a29112cf4 100644 --- a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java @@ -112,6 +112,13 @@ public OrcValueReader primitive(Type.PrimitiveType iPrimitive, TypeDescriptio } else { return FlinkOrcReaders.timestamps(); } + case TIMESTAMP_NANO: + Types.TimestampNanoType timestampNanoType = (Types.TimestampNanoType) iPrimitive; + if (timestampNanoType.shouldAdjustToUTC()) { + return FlinkOrcReaders.timestampTzs(); + } else { + return FlinkOrcReaders.timestamps(); + } case STRING: return FlinkOrcReaders.strings(); case UUID: diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java index a467d848337d..c1b46252e18a 100644 --- a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java @@ -145,6 +145,13 @@ public OrcValueWriter primitive(Type.PrimitiveType iPrimitive, LogicalType fl } else { return FlinkOrcWriters.timestamps(); } + case TIMESTAMP_NANO: + Types.TimestampNanoType timestampNanoType = (Types.TimestampNanoType) iPrimitive; + if (timestampNanoType.shouldAdjustToUTC()) { + return FlinkOrcWriters.timestampNanoTzs(); + } else { + return FlinkOrcWriters.timestampNanos(); + } 
case STRING: return FlinkOrcWriters.strings(); case UUID: diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java index 684842aa099c..bf19a46c05fb 100644 --- a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java @@ -70,6 +70,14 @@ static OrcValueWriter timestampTzs() { return TimestampTzWriter.INSTANCE; } + static OrcValueWriter timestampNanos() { + return TimestampNanoWriter.INSTANCE; + } + + static OrcValueWriter timestampNanoTzs() { + return TimestampNanoTzWriter.INSTANCE; + } + static OrcValueWriter decimals(int precision, int scale) { if (precision <= 18) { return new Decimal18Writer(precision, scale); @@ -170,6 +178,35 @@ public void nonNullWrite(int rowId, TimestampData data, ColumnVector output) { } } + private static class TimestampNanoWriter implements OrcValueWriter { + private static final TimestampNanoWriter INSTANCE = new TimestampNanoWriter(); + + @Override + public void nonNullWrite(int rowId, TimestampData data, ColumnVector output) { + TimestampColumnVector cv = (TimestampColumnVector) output; + cv.setIsUTC(true); + // millis + OffsetDateTime offsetDateTime = data.toInstant().atOffset(ZoneOffset.UTC); + cv.time[rowId] = + offsetDateTime.toEpochSecond() * 1_000 + offsetDateTime.getNano() / 1_000_000; + cv.nanos[rowId] = offsetDateTime.getNano(); + } + } + + private static class TimestampNanoTzWriter implements OrcValueWriter { + private static final TimestampNanoTzWriter INSTANCE = new TimestampNanoTzWriter(); + + @SuppressWarnings("JavaInstantGetSecondsGetNano") + @Override + public void nonNullWrite(int rowId, TimestampData data, ColumnVector output) { + TimestampColumnVector cv = (TimestampColumnVector) output; + // millis + Instant instant = data.toInstant(); + cv.time[rowId] = instant.toEpochMilli(); + 
cv.nanos[rowId] = instant.getNano(); + } + } + private static class Decimal18Writer implements OrcValueWriter { private final int precision; private final int scale; diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java index f23a7ee3d0d3..81bb55967992 100644 --- a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java @@ -69,6 +69,8 @@ public static Object convertConstant(Type type, Object value) { return (int) ((Long) value / 1000); case TIMESTAMP: // TimestampData return TimestampData.fromLocalDateTime(DateTimeUtil.timestampFromMicros((Long) value)); + case TIMESTAMP_NANO: + return TimestampData.fromLocalDateTime(DateTimeUtil.timestampFromNanos((Long) value)); case UUID: return UUIDUtil.convert((UUID) value); default: diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/StructRowData.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/StructRowData.java index b93e4346a47a..40a2d91f87f8 100644 --- a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/StructRowData.java +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/StructRowData.java @@ -49,6 +49,7 @@ import org.apache.iceberg.types.Type; import org.apache.iceberg.types.Types; import org.apache.iceberg.util.ByteBuffers; +import org.apache.iceberg.util.DateTimeUtil; @Internal public class StructRowData implements RowData { @@ -121,8 +122,8 @@ public int getInt(int pos) { if (integer instanceof Integer) { return (int) integer; - } else if (integer instanceof LocalDate) { - return (int) ((LocalDate) integer).toEpochDay(); + } else if (integer instanceof LocalDate localDate) { + return (int) localDate.toEpochDay(); } else if (integer instanceof LocalTime) { return (int) (((LocalTime) integer).toNanoOfDay() / 1000_000); } else { @@ 
-186,8 +187,27 @@ private BigDecimal getDecimalInternal(int pos) { @Override public TimestampData getTimestamp(int pos, int precision) { + if (precision > 6) { + Object timeVal = struct.get(pos, Object.class); + if (timeVal instanceof OffsetDateTime) { + OffsetDateTime odt = (OffsetDateTime) timeVal; + return TimestampData.fromEpochMillis( + odt.toInstant().toEpochMilli(), odt.getNano() % 1_000_000); + } else if (timeVal instanceof LocalDateTime) { + LocalDateTime ldt = (LocalDateTime) timeVal; + return TimestampData.fromEpochMillis( + ldt.toInstant(ZoneOffset.UTC).toEpochMilli(), ldt.getNano() % 1_000_000); + } else if (timeVal instanceof Long) { + long timeLong = (Long) timeVal; + return TimestampData.fromEpochMillis( + Math.floorDiv(timeLong, 1_000_000L), (int) Math.floorMod(timeLong, 1_000_000L)); + } else { + throw new IllegalStateException("Unknown type for timestamp_ns: " + timeVal.getClass()); + } + } long timeLong = getLong(pos); - return TimestampData.fromEpochMillis(timeLong / 1000, (int) (timeLong % 1000) * 1000); + return TimestampData.fromEpochMillis( + Math.floorDiv(timeLong, 1000L), (int) Math.floorMod(timeLong, 1000L) * 1000); } @Override @@ -263,9 +283,29 @@ private Object convertValue(Type elementType, Object value) { case DECIMAL: return value; case TIMESTAMP: - long millisecond = (long) value / 1000; - int nanoOfMillisecond = (int) ((Long) value % 1000) * 1000; - return TimestampData.fromEpochMillis(millisecond, nanoOfMillisecond); + long timeMillis; + if (value instanceof LocalDateTime localDateTime) { + timeMillis = DateTimeUtil.microsFromTimestamp(localDateTime) / 1000L; + } else if (value instanceof OffsetDateTime offsetDateTime) { + timeMillis = DateTimeUtil.microsFromTimestamptz(offsetDateTime) / 1000L; + } else { + timeMillis = Math.floorDiv((Long) value, 1000L); + } + return TimestampData.fromEpochMillis( + timeMillis, + (int) Math.floorMod(value instanceof Long ? 
(Long) value : timeMillis * 1000L, 1000L) + * 1000); + case TIMESTAMP_NANO: + long nanoLong; + if (value instanceof LocalDateTime localDateTime) { + nanoLong = DateTimeUtil.nanosFromTimestamp(localDateTime); + } else if (value instanceof OffsetDateTime offsetDateTime) { + nanoLong = DateTimeUtil.nanosFromTimestamptz(offsetDateTime); + } else { + nanoLong = (Long) value; + } + return TimestampData.fromEpochMillis( + Math.floorDiv(nanoLong, 1_000_000L), (int) Math.floorMod(nanoLong, 1_000_000L)); case STRING: return StringData.fromString(value.toString()); case FIXED: diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/formats/avro/AvroToRowDataConverters.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/formats/avro/AvroToRowDataConverters.java new file mode 100644 index 000000000000..0f70e60a1b9f --- /dev/null +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/formats/avro/AvroToRowDataConverters.java @@ -0,0 +1,303 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.formats.avro; + +import java.io.Serializable; +import java.lang.reflect.Array; +import java.nio.ByteBuffer; +import java.time.Instant; +import java.time.LocalDate; +import java.time.LocalDateTime; +import java.time.LocalTime; +import java.time.temporal.ChronoField; +import java.util.List; +import java.util.Map; +import org.apache.avro.generic.GenericFixed; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; +import org.apache.flink.annotation.Internal; +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.data.DecimalData; +import org.apache.flink.table.data.GenericArrayData; +import org.apache.flink.table.data.GenericMapData; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.table.types.logical.ArrayType; +import org.apache.flink.table.types.logical.DecimalType; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.table.types.logical.utils.LogicalTypeUtils; +import org.apache.iceberg.flink.formats.avro.typeutils.AvroSchemaConverter; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; + +/** + * Tool class used to convert from Avro {@link GenericRecord} to {@link RowData}. + * + *

      This class is adapted in Iceberg to add support for nanosecond precision timestamps + * (FLINK-39251). Once that ticket is resolved in Flink, this custom converter may be removed. + */ +@Internal +public class AvroToRowDataConverters { + + private AvroToRowDataConverters() {} + + /** + * Runtime converter that converts Avro data structures into objects of Flink Table & SQL + * internal data structures. + */ + @FunctionalInterface + public interface AvroToRowDataConverter extends Serializable { + Object convert(Object object); + } + + // ------------------------------------------------------------------------------------- + // Runtime Converters + // ------------------------------------------------------------------------------------- + + public static AvroToRowDataConverter createRowConverter(RowType rowType) { + return createRowConverter(rowType, true); + } + + public static AvroToRowDataConverter createRowConverter( + RowType rowType, boolean legacyTimestampMapping) { + final AvroToRowDataConverter[] fieldConverters = + rowType.getFields().stream() + .map(RowType.RowField::getType) + .map(type -> createNullableConverter(type, legacyTimestampMapping)) + .toArray(AvroToRowDataConverter[]::new); + final int arity = rowType.getFieldCount(); + + return avroObject -> { + IndexedRecord record = (IndexedRecord) avroObject; + GenericRowData row = new GenericRowData(arity); + for (int i = 0; i < arity; ++i) { + // avro always deserialize successfully even though the type isn't matched + // so no need to throw exception about which field can't be deserialized + row.setField(i, fieldConverters[i].convert(record.get(i))); + } + return row; + }; + } + + /** Creates a runtime converter which is null safe. 
*/ + private static AvroToRowDataConverter createNullableConverter( + LogicalType type, boolean legacyTimestampMapping) { + final AvroToRowDataConverter converter = createConverter(type, legacyTimestampMapping); + return avroObject -> { + if (avroObject == null) { + return null; + } + return converter.convert(avroObject); + }; + } + + /** Creates a runtime converter which assuming input object is not null. */ + private static AvroToRowDataConverter createConverter( + LogicalType type, boolean legacyTimestampMapping) { + switch (type.getTypeRoot()) { + case NULL: + return avroObject -> null; + case TINYINT: + return avroObject -> ((Integer) avroObject).byteValue(); + case SMALLINT: + return avroObject -> ((Integer) avroObject).shortValue(); + case BOOLEAN: // boolean + case INTEGER: // int + case INTERVAL_YEAR_MONTH: // long + case BIGINT: // long + case INTERVAL_DAY_TIME: // long + case FLOAT: // float + case DOUBLE: // double + return avroObject -> avroObject; + case DATE: + return AvroToRowDataConverters::convertToDate; + case TIME_WITHOUT_TIME_ZONE: + return AvroToRowDataConverters::convertToTime; + case TIMESTAMP_WITHOUT_TIME_ZONE: + // Iceberg: Added support for nanoseconds precision (FLINK-39251) + return avroObject -> convertToTimestamp(avroObject, type); + case TIMESTAMP_WITH_LOCAL_TIME_ZONE: + if (legacyTimestampMapping) { + throw new UnsupportedOperationException("Unsupported type: " + type); + } else { + // Iceberg: Added support for nanoseconds precision (FLINK-39251) + return avroObject -> convertToTimestamp(avroObject, type); + } + case CHAR: + case VARCHAR: + return avroObject -> StringData.fromString(avroObject.toString()); + case BINARY: + case VARBINARY: + return AvroToRowDataConverters::convertToBytes; + case DECIMAL: + return createDecimalConverter((DecimalType) type); + case ARRAY: + return createArrayConverter((ArrayType) type, legacyTimestampMapping); + case ROW: + return createRowConverter((RowType) type); + case MAP: + case MULTISET: + 
return createMapConverter(type, legacyTimestampMapping); + case RAW: + default: + throw new UnsupportedOperationException("Unsupported type: " + type); + } + } + + private static AvroToRowDataConverter createDecimalConverter(DecimalType decimalType) { + final int precision = decimalType.getPrecision(); + final int scale = decimalType.getScale(); + return avroObject -> { + final byte[] bytes; + if (avroObject instanceof GenericFixed) { + bytes = ((GenericFixed) avroObject).bytes(); + } else if (avroObject instanceof ByteBuffer) { + ByteBuffer byteBuffer = (ByteBuffer) avroObject; + bytes = new byte[byteBuffer.remaining()]; + byteBuffer.get(bytes); + } else { + bytes = (byte[]) avroObject; + } + return DecimalData.fromUnscaledBytes(bytes, precision, scale); + }; + } + + private static AvroToRowDataConverter createArrayConverter( + ArrayType arrayType, boolean legacyTimestampMapping) { + final AvroToRowDataConverter elementConverter = + createNullableConverter(arrayType.getElementType(), legacyTimestampMapping); + final Class elementClass = + LogicalTypeUtils.toInternalConversionClass(arrayType.getElementType()); + + return avroObject -> { + final List list = (List) avroObject; + final int length = list.size(); + final Object[] array = (Object[]) Array.newInstance(elementClass, length); + for (int i = 0; i < length; ++i) { + array[i] = elementConverter.convert(list.get(i)); + } + return new GenericArrayData(array); + }; + } + + private static AvroToRowDataConverter createMapConverter( + LogicalType type, boolean legacyTimestampMapping) { + final AvroToRowDataConverter keyConverter = + createConverter(DataTypes.STRING().getLogicalType(), legacyTimestampMapping); + final AvroToRowDataConverter valueConverter = + createNullableConverter( + AvroSchemaConverter.extractValueTypeToAvroMap(type), legacyTimestampMapping); + + return avroObject -> { + final Map map = (Map) avroObject; + Map result = Maps.newHashMap(); + for (Map.Entry entry : map.entrySet()) { + Object key = 
keyConverter.convert(entry.getKey()); + Object value = valueConverter.convert(entry.getValue()); + result.put(key, value); + } + return new GenericMapData(result); + }; + } + + private static TimestampData convertToTimestamp(Object object, LogicalType type) { + int precision = 3; + if (type instanceof org.apache.flink.table.types.logical.TimestampType) { + precision = ((org.apache.flink.table.types.logical.TimestampType) type).getPrecision(); + } else if (type instanceof org.apache.flink.table.types.logical.LocalZonedTimestampType) { + precision = + ((org.apache.flink.table.types.logical.LocalZonedTimestampType) type).getPrecision(); + } + + if (object instanceof Long) { + long timeLong = (Long) object; + if (precision <= 3) { + return TimestampData.fromEpochMillis(timeLong); + } else if (precision <= 6) { + return TimestampData.fromEpochMillis( + Math.floorDiv(timeLong, 1000L), (int) Math.floorMod(timeLong, 1000L) * 1_000_000); + } else { + // Iceberg: Added support for nanoseconds precision (FLINK-39251) + return TimestampData.fromEpochMillis( + Math.floorDiv(timeLong, 1_000_000L), (int) Math.floorMod(timeLong, 1_000_000L)); + } + } else if (object instanceof Instant) { + return TimestampData.fromInstant((Instant) object); + } else if (object instanceof LocalDateTime) { + return TimestampData.fromLocalDateTime((LocalDateTime) object); + } else { + JodaConverter jodaConverter = JodaConverter.getConverter(); + if (jodaConverter != null) { + return TimestampData.fromEpochMillis(jodaConverter.convertTimestamp(object)); + } else { + throw new IllegalArgumentException( + "Unexpected object type for TIMESTAMP logical type. 
Received: " + object); + } + } + } + + private static int convertToDate(Object object) { + if (object instanceof Integer) { + return (Integer) object; + } else if (object instanceof LocalDate) { + return (int) ((LocalDate) object).toEpochDay(); + } else { + JodaConverter jodaConverter = JodaConverter.getConverter(); + if (jodaConverter != null) { + return (int) jodaConverter.convertDate(object); + } else { + throw new IllegalArgumentException( + "Unexpected object type for DATE logical type. Received: " + object); + } + } + } + + private static int convertToTime(Object object) { + final int millis; + if (object instanceof Integer) { + millis = (Integer) object; + } else if (object instanceof LocalTime) { + millis = ((LocalTime) object).get(ChronoField.MILLI_OF_DAY); + } else { + JodaConverter jodaConverter = JodaConverter.getConverter(); + if (jodaConverter != null) { + millis = jodaConverter.convertTime(object); + } else { + throw new IllegalArgumentException( + "Unexpected object type for TIME logical type. Received: " + object); + } + } + return millis; + } + + private static byte[] convertToBytes(Object object) { + if (object instanceof GenericFixed) { + return ((GenericFixed) object).bytes(); + } else if (object instanceof ByteBuffer) { + ByteBuffer byteBuffer = (ByteBuffer) object; + byte[] bytes = new byte[byteBuffer.remaining()]; + byteBuffer.get(bytes); + return bytes; + } else { + return (byte[]) object; + } + } +} diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/formats/avro/JodaConverter.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/formats/avro/JodaConverter.java new file mode 100644 index 000000000000..c30b78023345 --- /dev/null +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/formats/avro/JodaConverter.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.formats.avro; + +import org.joda.time.DateTime; +import org.joda.time.DateTimeFieldType; +import org.joda.time.LocalDate; +import org.joda.time.LocalTime; + +/** + * Encapsulates joda optional dependency. Instantiates this class only if joda is available on the + * classpath. 
+ */ +@SuppressWarnings("JavaUtilDate") +class JodaConverter { + + private static JodaConverter instance; + private static boolean instantiated = false; + + public static JodaConverter getConverter() { + if (instantiated) { + return instance; + } + + try { + Class.forName( + "org.joda.time.DateTime", false, Thread.currentThread().getContextClassLoader()); + instance = new JodaConverter(); + } catch (ClassNotFoundException e) { + instance = null; + } finally { + instantiated = true; + } + return instance; + } + + public long convertDate(Object object) { + final LocalDate value = (LocalDate) object; + return value.toDate().getTime(); + } + + public int convertTime(Object object) { + final LocalTime value = (LocalTime) object; + return value.get(DateTimeFieldType.millisOfDay()); + } + + public long convertTimestamp(Object object) { + final DateTime value = (DateTime) object; + return value.toDate().getTime(); + } + + private JodaConverter() {} +} diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/formats/avro/RowDataToAvroConverters.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/formats/avro/RowDataToAvroConverters.java new file mode 100644 index 000000000000..d4c7e4282d6e --- /dev/null +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/formats/avro/RowDataToAvroConverters.java @@ -0,0 +1,394 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.formats.avro; + +import java.io.Serializable; +import java.nio.ByteBuffer; +import java.time.ZoneOffset; +import java.util.List; +import java.util.Map; +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.util.Utf8; +import org.apache.flink.annotation.Internal; +import org.apache.flink.table.data.ArrayData; +import org.apache.flink.table.data.DecimalData; +import org.apache.flink.table.data.MapData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.table.types.logical.ArrayType; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.util.CollectionUtil; +import org.apache.iceberg.flink.formats.avro.typeutils.AvroSchemaConverter; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; + +/** + * Tool class used to convert from {@link RowData} to Avro {@link GenericRecord}. + * + *

      This class is adapted in Iceberg to add support for nanosecond precision timestamps + * (FLINK-39251). Once that ticket is resolved in Flink, this custom converter may be removed. + */ +@Internal +public class RowDataToAvroConverters { + + private RowDataToAvroConverters() {} + + // -------------------------------------------------------------------------------- + // Runtime Converters + // -------------------------------------------------------------------------------- + + /** + * Runtime converter that converts objects of Flink Table & SQL internal data structures to + * corresponding Avro data structures. + */ + @FunctionalInterface + public interface RowDataToAvroConverter extends Serializable { + Object convert(Schema schema, Object object); + } + + // -------------------------------------------------------------------------------- + // IMPORTANT! We use anonymous classes instead of lambdas for a reason here. It is + // necessary because the maven shade plugin cannot relocate classes in + // SerializedLambdas (MSHADE-260). On the other hand we want to relocate Avro for + // sql-client uber jars. + // -------------------------------------------------------------------------------- + + /** + * Creates a runtime converter according to the given logical type that converts objects of Flink + * Table & SQL internal data structures to corresponding Avro data structures. 
+ */ + public static RowDataToAvroConverter createConverter(LogicalType type) { + return createConverter(type, true); + } + + @SuppressWarnings("checkstyle:MethodLength") + public static RowDataToAvroConverter createConverter( + LogicalType type, boolean legacyTimestampMapping) { + final RowDataToAvroConverter converter; + switch (type.getTypeRoot()) { + case NULL: + converter = + new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + return null; + } + }; + break; + case TINYINT: + converter = + new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + return ((Byte) object).intValue(); + } + }; + break; + case SMALLINT: + converter = + new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + return ((Short) object).intValue(); + } + }; + break; + case BOOLEAN: // boolean + case INTEGER: // int + case INTERVAL_YEAR_MONTH: // long + case BIGINT: // long + case INTERVAL_DAY_TIME: // long + case FLOAT: // float + case DOUBLE: // double + case TIME_WITHOUT_TIME_ZONE: // int + case DATE: // int + converter = + new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + return object; + } + }; + break; + case CHAR: + case VARCHAR: + converter = + new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + return new Utf8(object.toString()); + } + }; + break; + case BINARY: + case VARBINARY: + converter = + new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + return ByteBuffer.wrap((byte[]) object); + 
} + }; + break; + // Iceberg: Added support for nanoseconds precision (FLINK-39251) + case TIMESTAMP_WITHOUT_TIME_ZONE: + final int tzPrecision; + if (type instanceof org.apache.flink.table.types.logical.TimestampType) { + tzPrecision = ((org.apache.flink.table.types.logical.TimestampType) type).getPrecision(); + } else { + tzPrecision = 3; + } + if (legacyTimestampMapping) { + converter = + new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + TimestampData timestampData = (TimestampData) object; + if (tzPrecision <= 3) { + return timestampData.getMillisecond(); + } else if (tzPrecision <= 6) { + return timestampData.getMillisecond() * 1000L + + timestampData.getNanoOfMillisecond() / 1000; + } else { + // Iceberg: Added support for nanoseconds precision (FLINK-39251) + return timestampData.getMillisecond() * 1_000_000L + + timestampData.getNanoOfMillisecond(); + } + } + }; + } else { + converter = + new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + TimestampData timestampData = (TimestampData) object; + java.time.Instant instant = + timestampData.toLocalDateTime().toInstant(ZoneOffset.UTC); + if (tzPrecision <= 3) { + return instant.toEpochMilli(); + } else if (tzPrecision <= 6) { + return instant.getEpochSecond() * 1_000_000L + instant.getNano() / 1000; + } else { + // Iceberg: Added support for nanoseconds precision (FLINK-39251) + return instant.getEpochSecond() * 1_000_000_000L + instant.getNano(); + } + } + }; + } + break; + // Iceberg: Added support for nanoseconds precision (FLINK-39251) + case TIMESTAMP_WITH_LOCAL_TIME_ZONE: + final int ltzPrecision; + if (type instanceof org.apache.flink.table.types.logical.LocalZonedTimestampType) { + ltzPrecision = + ((org.apache.flink.table.types.logical.LocalZonedTimestampType) type).getPrecision(); + } else { + 
ltzPrecision = 3; + } + if (legacyTimestampMapping) { + throw new UnsupportedOperationException("Unsupported type: " + type); + } else { + converter = + new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + TimestampData timestampData = (TimestampData) object; + if (ltzPrecision <= 3) { + return timestampData.getMillisecond(); + } else if (ltzPrecision <= 6) { + return timestampData.getMillisecond() * 1000L + + timestampData.getNanoOfMillisecond() / 1000; + } else { + // Iceberg: Added support for nanoseconds precision (FLINK-39251) + return timestampData.getMillisecond() * 1_000_000L + + timestampData.getNanoOfMillisecond(); + } + } + }; + } + break; + case DECIMAL: + converter = + new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + return ByteBuffer.wrap(((DecimalData) object).toUnscaledBytes()); + } + }; + break; + case ARRAY: + converter = createArrayConverter((ArrayType) type, legacyTimestampMapping); + break; + case ROW: + converter = createRowConverter((RowType) type, legacyTimestampMapping); + break; + case MAP: + case MULTISET: + converter = createMapConverter(type, legacyTimestampMapping); + break; + case RAW: + default: + throw new UnsupportedOperationException("Unsupported type: " + type); + } + + // wrap into nullable converter + return new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + if (object == null) { + return null; + } + + // get actual schema if it is a nullable schema + Schema actualSchema; + if (schema.getType() == Schema.Type.UNION) { + List types = schema.getTypes(); + int size = types.size(); + if (size == 2 && types.get(1).getType() == Schema.Type.NULL) { + actualSchema = types.get(0); + } else if (size == 2 && types.get(0).getType() == 
Schema.Type.NULL) { + actualSchema = types.get(1); + } else { + throw new IllegalArgumentException( + "The Avro schema is not a nullable type: " + schema.toString()); + } + } else { + actualSchema = schema; + } + return converter.convert(actualSchema, object); + } + }; + } + + private static RowDataToAvroConverter createRowConverter( + RowType rowType, boolean legacyTimestampMapping) { + final RowDataToAvroConverter[] fieldConverters = + rowType.getChildren().stream() + .map(legacyType -> createConverter(legacyType, legacyTimestampMapping)) + .toArray(RowDataToAvroConverter[]::new); + final LogicalType[] fieldTypes = + rowType.getFields().stream().map(RowType.RowField::getType).toArray(LogicalType[]::new); + final RowData.FieldGetter[] fieldGetters = new RowData.FieldGetter[fieldTypes.length]; + for (int i = 0; i < fieldTypes.length; i++) { + fieldGetters[i] = RowData.createFieldGetter(fieldTypes[i], i); + } + final int length = rowType.getFieldCount(); + + return new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + final RowData row = (RowData) object; + final List fields = schema.getFields(); + final GenericRecord record = new GenericData.Record(schema); + for (int i = 0; i < length; ++i) { + final Schema.Field schemaField = fields.get(i); + try { + Object avroObject = + fieldConverters[i].convert( + schemaField.schema(), fieldGetters[i].getFieldOrNull(row)); + record.put(i, avroObject); + } catch (Throwable t) { + throw new RuntimeException( + String.format("Fail to serialize at field: %s.", schemaField.name()), t); + } + } + return record; + } + }; + } + + private static RowDataToAvroConverter createArrayConverter( + ArrayType arrayType, boolean legacyTimestampMapping) { + LogicalType elementType = arrayType.getElementType(); + final ArrayData.ElementGetter elementGetter = ArrayData.createElementGetter(elementType); + final RowDataToAvroConverter 
elementConverter = + createConverter(arrayType.getElementType(), legacyTimestampMapping); + + return new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + final Schema elementSchema = schema.getElementType(); + ArrayData arrayData = (ArrayData) object; + List list = Lists.newArrayList(); + for (int i = 0; i < arrayData.size(); ++i) { + list.add( + elementConverter.convert( + elementSchema, elementGetter.getElementOrNull(arrayData, i))); + } + return list; + } + }; + } + + private static RowDataToAvroConverter createMapConverter( + LogicalType type, boolean legacyTimestampMapping) { + LogicalType valueType = AvroSchemaConverter.extractValueTypeToAvroMap(type); + final ArrayData.ElementGetter valueGetter = ArrayData.createElementGetter(valueType); + final RowDataToAvroConverter valueConverter = + createConverter(valueType, legacyTimestampMapping); + + return new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + final Schema valueSchema = schema.getValueType(); + final MapData mapData = (MapData) object; + final ArrayData keyArray = mapData.keyArray(); + final ArrayData valueArray = mapData.valueArray(); + final Map map = CollectionUtil.newHashMapWithExpectedSize(mapData.size()); + for (int i = 0; i < mapData.size(); ++i) { + final String key = keyArray.getString(i).toString(); + final Object value = + valueConverter.convert(valueSchema, valueGetter.getElementOrNull(valueArray, i)); + map.put(key, value); + } + return map; + } + }; + } +} diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/formats/avro/typeutils/AvroSchemaConverter.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/formats/avro/typeutils/AvroSchemaConverter.java new file mode 100644 index 000000000000..fb77c124e504 --- /dev/null +++ 
b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/formats/avro/typeutils/AvroSchemaConverter.java @@ -0,0 +1,625 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.formats.avro.typeutils; + +import java.util.List; +import org.apache.avro.LogicalTypes; +import org.apache.avro.Schema; +import org.apache.avro.SchemaBuilder; +import org.apache.avro.SchemaParseException; +import org.apache.avro.specific.SpecificData; +import org.apache.avro.specific.SpecificRecord; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.common.typeinfo.Types; +import org.apache.flink.api.java.typeutils.RowTypeInfo; +import org.apache.flink.formats.avro.AvroRowDataDeserializationSchema; +import org.apache.flink.formats.avro.AvroRowDataSerializationSchema; +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.legacy.types.logical.TypeInformationRawType; +import org.apache.flink.table.types.AtomicDataType; +import org.apache.flink.table.types.DataType; +import org.apache.flink.table.types.logical.ArrayType; +import org.apache.flink.table.types.logical.DecimalType; +import org.apache.flink.table.types.logical.IntType; +import 
org.apache.flink.table.types.logical.LocalZonedTimestampType; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.LogicalTypeFamily; +import org.apache.flink.table.types.logical.MapType; +import org.apache.flink.table.types.logical.MultisetType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.table.types.logical.TimeType; +import org.apache.flink.table.types.logical.TimestampType; +import org.apache.flink.types.Row; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; + +/** + * Converts an Avro schema into Flink's type information. It uses {@link RowTypeInfo} for + * representing objects and converts Avro types into types that are compatible with Flink's Table + * & SQL API. + * + *

      <p>Note: Changes in this class need to be kept in sync with the corresponding runtime classes + * {@link AvroRowDataDeserializationSchema} and {@link AvroRowDataSerializationSchema}. + * + *

      This class is adapted in Iceberg to support custom 'timestamp-nanos' and + * 'local-timestamp-nanos' logical types (FLINK-39251). Once that ticket is resolved in Flink, these + * custom types may be removed. + */ +public class AvroSchemaConverter { + + private AvroSchemaConverter() { + // private + } + + /** + * Converts an Avro class into a nested row structure with deterministic field order and data + * types that are compatible with Flink's Table & SQL API. + * + * @param avroClass Avro specific record that contains schema information + * @return type information matching the schema + */ + @SuppressWarnings("unchecked") + public static TypeInformation convertToTypeInfo( + Class avroClass) { + return convertToTypeInfo(avroClass, true); + } + + /** + * Converts an Avro class into a nested row structure with deterministic field order and data + * types that are compatible with Flink's Table & SQL API. + * + * @param avroClass Avro specific record that contains schema information + * @param legacyTimestampMapping legacy mapping of timestamp types + * @return type information matching the schema + */ + @SuppressWarnings("unchecked") + public static TypeInformation convertToTypeInfo( + Class avroClass, boolean legacyTimestampMapping) { + Preconditions.checkNotNull(avroClass, "Avro specific record class must not be null."); + // determine schema to retrieve deterministic field order + final Schema schema = SpecificData.get().getSchema(avroClass); + return (TypeInformation) convertToTypeInfo(schema, true); + } + + /** + * Converts an Avro schema string into a nested row structure with deterministic field order and + * data types that are compatible with Flink's Table & SQL API. 
+ * + * @param avroSchemaString Avro schema definition string + * @return type information matching the schema + */ + @SuppressWarnings("unchecked") + public static TypeInformation convertToTypeInfo(String avroSchemaString) { + return convertToTypeInfo(avroSchemaString, true); + } + + /** + * Converts an Avro schema string into a nested row structure with deterministic field order and + * data types that are compatible with Flink's Table & SQL API. + * + * @param avroSchemaString Avro schema definition string + * @param legacyTimestampMapping legacy mapping of timestamp types + * @return type information matching the schema + */ + @SuppressWarnings("unchecked") + public static TypeInformation convertToTypeInfo( + String avroSchemaString, boolean legacyTimestampMapping) { + Preconditions.checkNotNull(avroSchemaString, "Avro schema must not be null."); + final Schema schema; + try { + schema = new Schema.Parser().parse(avroSchemaString); + } catch (SchemaParseException e) { + throw new IllegalArgumentException("Could not parse Avro schema string.", e); + } + return (TypeInformation) convertToTypeInfo(schema, legacyTimestampMapping); + } + + private static TypeInformation convertToTypeInfo( + Schema schema, boolean legacyTimestampMapping) { + switch (schema.getType()) { + case RECORD: + final List fields = schema.getFields(); + + final TypeInformation[] types = new TypeInformation[fields.size()]; + final String[] names = new String[fields.size()]; + for (int i = 0; i < fields.size(); i++) { + final Schema.Field field = fields.get(i); + types[i] = convertToTypeInfo(field.schema(), legacyTimestampMapping); + names[i] = field.name(); + } + return Types.ROW_NAMED(names, types); + case ENUM: + return Types.STRING; + case ARRAY: + // result type might either be ObjectArrayTypeInfo or BasicArrayTypeInfo for Strings + return Types.OBJECT_ARRAY( + convertToTypeInfo(schema.getElementType(), legacyTimestampMapping)); + case MAP: + return Types.MAP( + Types.STRING, 
convertToTypeInfo(schema.getValueType(), legacyTimestampMapping)); + case UNION: + final Schema actualSchema; + if (schema.getTypes().size() == 2 + && schema.getTypes().get(0).getType() == Schema.Type.NULL) { + actualSchema = schema.getTypes().get(1); + } else if (schema.getTypes().size() == 2 + && schema.getTypes().get(1).getType() == Schema.Type.NULL) { + actualSchema = schema.getTypes().get(0); + } else if (schema.getTypes().size() == 1) { + actualSchema = schema.getTypes().get(0); + } else { + // use Kryo for serialization + return Types.GENERIC(Object.class); + } + return convertToTypeInfo(actualSchema, legacyTimestampMapping); + case FIXED: + // logical decimal type + if (schema.getLogicalType() instanceof LogicalTypes.Decimal) { + return Types.BIG_DEC; + } + // convert fixed size binary data to primitive byte arrays + return Types.PRIMITIVE_ARRAY(Types.BYTE); + case STRING: + // convert Avro's Utf8/CharSequence to String + return Types.STRING; + case BYTES: + // logical decimal type + if (schema.getLogicalType() instanceof LogicalTypes.Decimal) { + return Types.BIG_DEC; + } + return Types.PRIMITIVE_ARRAY(Types.BYTE); + case INT: + // logical date and time type + final org.apache.avro.LogicalType logicalType = schema.getLogicalType(); + if (logicalType == LogicalTypes.date()) { + return Types.SQL_DATE; + } else if (logicalType == LogicalTypes.timeMillis()) { + return Types.SQL_TIME; + } + return Types.INT; + case LONG: + if (legacyTimestampMapping) { + if (schema.getLogicalType() == LogicalTypes.timestampMillis() + || schema.getLogicalType() == LogicalTypes.timestampMicros() + // Iceberg: Added support for custom nanosecond logical type (FLINK-39251) + || (schema.getLogicalType() != null + && schema.getLogicalType().getName().equals("timestamp-nanos"))) { + return Types.SQL_TIMESTAMP; + } else if (schema.getLogicalType() == LogicalTypes.timeMicros() + || schema.getLogicalType() == LogicalTypes.timeMillis()) { + return Types.SQL_TIME; + } + } else { + // Avro 
logical timestamp types to Flink DataStream timestamp types + if (schema.getLogicalType() == LogicalTypes.timestampMillis() + || schema.getLogicalType() == LogicalTypes.timestampMicros() + // Iceberg: Added support for custom nanosecond logical type (FLINK-39251) + || (schema.getLogicalType() != null + && schema.getLogicalType().getName().equals("timestamp-nanos"))) { + return Types.INSTANT; + } else if (schema.getLogicalType() == LogicalTypes.localTimestampMillis() + || schema.getLogicalType() == LogicalTypes.localTimestampMicros() + // Iceberg: Added support for custom nanosecond logical type (FLINK-39251) + || (schema.getLogicalType() != null + && schema.getLogicalType().getName().equals("local-timestamp-nanos"))) { + return Types.LOCAL_DATE_TIME; + } else if (schema.getLogicalType() == LogicalTypes.timeMicros() + || schema.getLogicalType() == LogicalTypes.timeMillis()) { + return Types.SQL_TIME; + } + } + return Types.LONG; + case FLOAT: + return Types.FLOAT; + case DOUBLE: + return Types.DOUBLE; + case BOOLEAN: + return Types.BOOLEAN; + case NULL: + return Types.VOID; + } + throw new IllegalArgumentException("Unsupported Avro type '" + schema.getType() + "'."); + } + + /** + * Converts an Avro schema string into a nested row structure with deterministic field order and + * data types that are compatible with Flink's Table & SQL API. + * + * @param avroSchemaString Avro schema definition string + * @return data type matching the schema + */ + public static DataType convertToDataType(String avroSchemaString) { + return convertToDataType(avroSchemaString, true); + } + + /** + * Converts an Avro schema string into a nested row structure with deterministic field order and + * data types that are compatible with Flink's Table & SQL API. 
+ * + * @param avroSchemaString Avro schema definition string + * @param legacyTimestampMapping legacy mapping of local timestamps + * @return data type matching the schema + */ + public static DataType convertToDataType( + String avroSchemaString, boolean legacyTimestampMapping) { + Preconditions.checkNotNull(avroSchemaString, "Avro schema must not be null."); + final Schema schema; + try { + schema = new Schema.Parser().parse(avroSchemaString); + } catch (SchemaParseException e) { + throw new IllegalArgumentException("Could not parse Avro schema string.", e); + } + return convertToDataType(schema, legacyTimestampMapping); + } + + @SuppressWarnings("deprecation") + private static DataType convertToDataType(Schema schema, boolean legacyMapping) { + switch (schema.getType()) { + case RECORD: + final List schemaFields = schema.getFields(); + + final DataTypes.Field[] fields = new DataTypes.Field[schemaFields.size()]; + for (int i = 0; i < schemaFields.size(); i++) { + final Schema.Field field = schemaFields.get(i); + fields[i] = + DataTypes.FIELD(field.name(), convertToDataType(field.schema(), legacyMapping)); + } + return DataTypes.ROW(fields).notNull(); + case ENUM: + return DataTypes.STRING().notNull(); + case ARRAY: + return DataTypes.ARRAY(convertToDataType(schema.getElementType(), legacyMapping)).notNull(); + case MAP: + return DataTypes.MAP( + DataTypes.STRING().notNull(), + convertToDataType(schema.getValueType(), legacyMapping)) + .notNull(); + case UNION: + final Schema actualSchema; + final boolean nullable; + if (schema.getTypes().size() == 2 + && schema.getTypes().get(0).getType() == Schema.Type.NULL) { + actualSchema = schema.getTypes().get(1); + nullable = true; + } else if (schema.getTypes().size() == 2 + && schema.getTypes().get(1).getType() == Schema.Type.NULL) { + actualSchema = schema.getTypes().get(0); + nullable = true; + } else if (schema.getTypes().size() == 1) { + actualSchema = schema.getTypes().get(0); + nullable = false; + } else { + // 
use Kryo for serialization + return new AtomicDataType( + new TypeInformationRawType<>(false, Types.GENERIC(Object.class))); + } + DataType converted = convertToDataType(actualSchema, legacyMapping); + return nullable ? converted.nullable() : converted; + case FIXED: + // logical decimal type + if (schema.getLogicalType() instanceof LogicalTypes.Decimal) { + final LogicalTypes.Decimal decimalType = (LogicalTypes.Decimal) schema.getLogicalType(); + return DataTypes.DECIMAL(decimalType.getPrecision(), decimalType.getScale()).notNull(); + } + // convert fixed size binary data to primitive byte arrays + return DataTypes.VARBINARY(schema.getFixedSize()).notNull(); + case STRING: + // convert Avro's Utf8/CharSequence to String + return DataTypes.STRING().notNull(); + case BYTES: + // logical decimal type + if (schema.getLogicalType() instanceof LogicalTypes.Decimal) { + final LogicalTypes.Decimal decimalType = (LogicalTypes.Decimal) schema.getLogicalType(); + return DataTypes.DECIMAL(decimalType.getPrecision(), decimalType.getScale()).notNull(); + } + return DataTypes.BYTES().notNull(); + case INT: + // logical date and time type + final org.apache.avro.LogicalType logicalType = schema.getLogicalType(); + if (logicalType == LogicalTypes.date()) { + return DataTypes.DATE().notNull(); + } else if (logicalType == LogicalTypes.timeMillis()) { + return DataTypes.TIME(3).notNull(); + } + return DataTypes.INT().notNull(); + case LONG: + if (legacyMapping) { + // Avro logical timestamp types to Flink SQL timestamp types + if (schema.getLogicalType() == LogicalTypes.timestampMillis()) { + return DataTypes.TIMESTAMP(3).notNull(); + } else if (schema.getLogicalType() == LogicalTypes.timestampMicros()) { + return DataTypes.TIMESTAMP(6).notNull(); + } else if (schema.getLogicalType() != null + && schema.getLogicalType().getName().equals("timestamp-nanos")) { + // Iceberg: Added support for custom nanosecond logical type (FLINK-39251) + return DataTypes.TIMESTAMP(9).notNull(); + } 
else if (schema.getLogicalType() == LogicalTypes.timeMillis()) { + return DataTypes.TIME(3).notNull(); + } else if (schema.getLogicalType() == LogicalTypes.timeMicros()) { + return DataTypes.TIME(6).notNull(); + } + } else { + // Avro logical timestamp types to Flink SQL timestamp types + if (schema.getLogicalType() == LogicalTypes.timestampMillis()) { + return DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE(3).notNull(); + } else if (schema.getLogicalType() == LogicalTypes.timestampMicros()) { + return DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE(6).notNull(); + } else if (schema.getLogicalType() != null + && schema.getLogicalType().getName().equals("timestamp-nanos")) { + // Iceberg: Added support for custom nanosecond logical type (FLINK-39251) + return DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE(9).notNull(); + } else if (schema.getLogicalType() == LogicalTypes.timeMillis()) { + return DataTypes.TIME(3).notNull(); + } else if (schema.getLogicalType() == LogicalTypes.timeMicros()) { + return DataTypes.TIME(6).notNull(); + } else if (schema.getLogicalType() == LogicalTypes.localTimestampMillis()) { + return DataTypes.TIMESTAMP(3).notNull(); + } else if (schema.getLogicalType() == LogicalTypes.localTimestampMicros()) { + return DataTypes.TIMESTAMP(6).notNull(); + } else if (schema.getLogicalType() != null + && schema.getLogicalType().getName().equals("local-timestamp-nanos")) { + // Iceberg: Added support for custom nanosecond logical type (FLINK-39251) + return DataTypes.TIMESTAMP(9).notNull(); + } + } + + return DataTypes.BIGINT().notNull(); + case FLOAT: + return DataTypes.FLOAT().notNull(); + case DOUBLE: + return DataTypes.DOUBLE().notNull(); + case BOOLEAN: + return DataTypes.BOOLEAN().notNull(); + case NULL: + return DataTypes.NULL(); + } + throw new IllegalArgumentException("Unsupported Avro type '" + schema.getType() + "'."); + } + + /** + * Converts Flink SQL {@link LogicalType} (can be nested) into an Avro schema. + * + *

      Use "org.apache.flink.avro.generated.record" as the type name.
+   *
+   * <p>Delegates to {@link #convertToSchema(LogicalType, boolean)} with the legacy timestamp
+   * mapping enabled.
+   *
+   * @param schema the schema type, usually it should be the top level record type, e.g. not a
+   *     nested type
+   * @return Avro's {@link Schema} matching this logical type.
+   */
+  public static Schema convertToSchema(LogicalType schema) {
+    return convertToSchema(schema, true);
+  }
+
+  /**
+   * Converts Flink SQL {@link LogicalType} (can be nested) into an Avro schema.
+   *
+   * <p>Use "org.apache.flink.avro.generated.record" as the type name.
+   *
+   * @param schema the schema type, usually it should be the top level record type, e.g. not a
+   *     nested type
+   * @param legacyTimestampMapping whether to use the legacy timestamp mapping
+   * @return Avro's {@link Schema} matching this logical type.
+   */
+  public static Schema convertToSchema(LogicalType schema, boolean legacyTimestampMapping) {
+    return convertToSchema(
+        schema, "org.apache.flink.avro.generated.record", legacyTimestampMapping);
+  }
+
+  /**
+   * Converts Flink SQL {@link LogicalType} (can be nested) into an Avro schema.
+   *
+   * <p>The "{rowName}_" is used as the nested row type name prefix in order to generate the right
+   * schema. Nested record type that only differs with type name is still compatible.
+   *
+   * <p>Delegates to {@link #convertToSchema(LogicalType, String, boolean)} with the legacy
+   * timestamp mapping enabled.
+   *
+   * @param logicalType logical type
+   * @param rowName the record name
+   * @return Avro's {@link Schema} matching this logical type.
+   */
+  public static Schema convertToSchema(LogicalType logicalType, String rowName) {
+    return convertToSchema(logicalType, rowName, true);
+  }
+
+  /**
+   * Converts Flink SQL {@link LogicalType} (can be nested) into an Avro schema.
+   *
+   *

      The "{rowName}_" is used as the nested row type name prefix in order to generate the right + * schema. Nested record type that only differs with type name is still compatible. + * + * @param logicalType logical type + * @param rowName the record name + * @param legacyTimestampMapping whether to use legal timestamp mapping + * @return Avro's {@link Schema} matching this logical type. + */ + public static Schema convertToSchema( + LogicalType logicalType, String rowName, boolean legacyTimestampMapping) { + int precision; + boolean nullable = logicalType.isNullable(); + switch (logicalType.getTypeRoot()) { + case NULL: + return SchemaBuilder.builder().nullType(); + case BOOLEAN: + Schema bool = SchemaBuilder.builder().booleanType(); + return nullable ? nullableSchema(bool) : bool; + case TINYINT: + case SMALLINT: + case INTEGER: + Schema integer = SchemaBuilder.builder().intType(); + return nullable ? nullableSchema(integer) : integer; + case BIGINT: + Schema bigint = SchemaBuilder.builder().longType(); + return nullable ? nullableSchema(bigint) : bigint; + case FLOAT: + Schema floatSchema = SchemaBuilder.builder().floatType(); + return nullable ? nullableSchema(floatSchema) : floatSchema; + case DOUBLE: + Schema doubleSchema = SchemaBuilder.builder().doubleType(); + return nullable ? nullableSchema(doubleSchema) : doubleSchema; + case CHAR: + case VARCHAR: + Schema str = SchemaBuilder.builder().stringType(); + return nullable ? nullableSchema(str) : str; + case BINARY: + case VARBINARY: + Schema binary = SchemaBuilder.builder().bytesType(); + return nullable ? 
nullableSchema(binary) : binary; + case TIMESTAMP_WITHOUT_TIME_ZONE: + // use long to represents Timestamp + final TimestampType timestampType = (TimestampType) logicalType; + precision = timestampType.getPrecision(); + org.apache.avro.LogicalType avroLogicalType; + if (legacyTimestampMapping) { + if (precision <= 3) { + avroLogicalType = LogicalTypes.timestampMillis(); + } else { + throw new IllegalArgumentException( + "Avro does not support TIMESTAMP type " + + "with precision: " + + precision + + ", it only supports precision less than 3."); + } + } else { + if (precision <= 3) { + avroLogicalType = LogicalTypes.localTimestampMillis(); + } else if (precision <= 6) { + avroLogicalType = LogicalTypes.localTimestampMicros(); + } else { + throw new IllegalArgumentException( + "Avro does not support LOCAL TIMESTAMP type " + + "with precision: " + + precision + + ", it only supports precision less than 6."); + } + } + Schema timestamp = avroLogicalType.addToSchema(SchemaBuilder.builder().longType()); + return nullable ? nullableSchema(timestamp) : timestamp; + case TIMESTAMP_WITH_LOCAL_TIME_ZONE: + if (legacyTimestampMapping) { + throw new UnsupportedOperationException( + "Unsupported to derive Schema for type: " + logicalType); + } else { + final LocalZonedTimestampType localZonedTimestampType = + (LocalZonedTimestampType) logicalType; + precision = localZonedTimestampType.getPrecision(); + if (precision <= 3) { + avroLogicalType = LogicalTypes.timestampMillis(); + } else if (precision <= 6) { + avroLogicalType = LogicalTypes.timestampMicros(); + } else { + throw new IllegalArgumentException( + "Avro does not support TIMESTAMP type " + + "with precision: " + + precision + + ", it only supports precision less than 6."); + } + timestamp = avroLogicalType.addToSchema(SchemaBuilder.builder().longType()); + return nullable ? 
nullableSchema(timestamp) : timestamp; + } + case DATE: + // use int to represents Date + Schema date = LogicalTypes.date().addToSchema(SchemaBuilder.builder().intType()); + return nullable ? nullableSchema(date) : date; + case TIME_WITHOUT_TIME_ZONE: + precision = ((TimeType) logicalType).getPrecision(); + if (precision > 3) { + throw new IllegalArgumentException( + "Avro does not support TIME type with precision: " + + precision + + ", it only supports precision less than 3."); + } + // use int to represents Time, we only support millisecond when deserialization + Schema time = LogicalTypes.timeMillis().addToSchema(SchemaBuilder.builder().intType()); + return nullable ? nullableSchema(time) : time; + case DECIMAL: + DecimalType decimalType = (DecimalType) logicalType; + // store BigDecimal as byte[] + Schema decimal = + LogicalTypes.decimal(decimalType.getPrecision(), decimalType.getScale()) + .addToSchema(SchemaBuilder.builder().bytesType()); + return nullable ? nullableSchema(decimal) : decimal; + case ROW: + RowType rowType = (RowType) logicalType; + List fieldNames = rowType.getFieldNames(); + // we have to make sure the record name is different in a Schema + SchemaBuilder.FieldAssembler builder = + SchemaBuilder.builder().record(rowName).fields(); + for (int i = 0; i < rowType.getFieldCount(); i++) { + String fieldName = fieldNames.get(i); + LogicalType fieldType = rowType.getTypeAt(i); + SchemaBuilder.GenericDefault fieldBuilder = + builder + .name(fieldName) + .type( + convertToSchema( + fieldType, rowName + "_" + fieldName, legacyTimestampMapping)); + + if (fieldType.isNullable()) { + builder = fieldBuilder.withDefault(null); + } else { + builder = fieldBuilder.noDefault(); + } + } + Schema record = builder.endRecord(); + return nullable ? nullableSchema(record) : record; + case MULTISET: + case MAP: + Schema map = + SchemaBuilder.builder() + .map() + .values(convertToSchema(extractValueTypeToAvroMap(logicalType), rowName)); + return nullable ? 
nullableSchema(map) : map; + case ARRAY: + ArrayType arrayType = (ArrayType) logicalType; + Schema array = + SchemaBuilder.builder() + .array() + .items(convertToSchema(arrayType.getElementType(), rowName)); + return nullable ? nullableSchema(array) : array; + case RAW: + default: + throw new UnsupportedOperationException( + "Unsupported to derive Schema for type: " + logicalType); + } + } + + public static LogicalType extractValueTypeToAvroMap(LogicalType type) { + LogicalType keyType; + LogicalType valueType; + if (type instanceof MapType) { + MapType mapType = (MapType) type; + keyType = mapType.getKeyType(); + valueType = mapType.getValueType(); + } else { + MultisetType multisetType = (MultisetType) type; + keyType = multisetType.getElementType(); + valueType = new IntType(); + } + if (!keyType.is(LogicalTypeFamily.CHARACTER_STRING)) { + throw new UnsupportedOperationException( + "Avro format doesn't support non-string as key type of map. " + + "The key type is: " + + keyType.asSummaryString()); + } + return valueType; + } + + /** Returns schema with nullable true. */ + private static Schema nullableSchema(Schema schema) { + return schema.isNullable() + ? 
schema + : Schema.createUnion(SchemaBuilder.builder().nullType(), schema); + } +} diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/AvroGenericRecordToRowDataMapper.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/AvroGenericRecordToRowDataMapper.java index f7e8e0c884cf..5f3494330cfc 100644 --- a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/AvroGenericRecordToRowDataMapper.java +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/AvroGenericRecordToRowDataMapper.java @@ -21,14 +21,14 @@ import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; import org.apache.flink.api.common.functions.MapFunction; -import org.apache.flink.formats.avro.AvroToRowDataConverters; -import org.apache.flink.formats.avro.typeutils.AvroSchemaConverter; import org.apache.flink.table.data.RowData; import org.apache.flink.table.types.DataType; import org.apache.flink.table.types.logical.LogicalType; import org.apache.flink.table.types.logical.RowType; import org.apache.flink.table.types.utils.TypeConversions; import org.apache.iceberg.avro.AvroSchemaUtil; +import org.apache.iceberg.flink.formats.avro.AvroToRowDataConverters; +import org.apache.iceberg.flink.formats.avro.typeutils.AvroSchemaConverter; /** * This util class converts Avro GenericRecord to Flink RowData.
      diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/RowDataToAvroGenericRecordConverter.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/RowDataToAvroGenericRecordConverter.java index 8ef1f1fbb833..d74b8b9d620f 100644 --- a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/RowDataToAvroGenericRecordConverter.java +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/RowDataToAvroGenericRecordConverter.java @@ -23,8 +23,6 @@ import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; import org.apache.flink.annotation.Internal; -import org.apache.flink.formats.avro.RowDataToAvroConverters; -import org.apache.flink.formats.avro.typeutils.AvroSchemaConverter; import org.apache.flink.table.data.RowData; import org.apache.flink.table.types.DataType; import org.apache.flink.table.types.logical.LogicalType; @@ -32,6 +30,8 @@ import org.apache.flink.table.types.utils.TypeConversions; import org.apache.iceberg.avro.AvroSchemaUtil; import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.formats.avro.RowDataToAvroConverters; +import org.apache.iceberg.flink.formats.avro.typeutils.AvroSchemaConverter; /** * This is not serializable because Avro {@link Schema} is not actually serializable, even though it diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/AvroGenericRecordConverter.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/AvroGenericRecordConverter.java index b158b0871a53..cfef780a4daa 100644 --- a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/AvroGenericRecordConverter.java +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/reader/AvroGenericRecordConverter.java @@ -21,8 +21,6 @@ import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; import org.apache.flink.api.common.typeinfo.TypeInformation; -import 
org.apache.flink.formats.avro.RowDataToAvroConverters; -import org.apache.flink.formats.avro.typeutils.AvroSchemaConverter; import org.apache.flink.formats.avro.typeutils.GenericRecordAvroTypeInfo; import org.apache.flink.table.data.RowData; import org.apache.flink.table.types.DataType; @@ -31,6 +29,8 @@ import org.apache.flink.table.types.utils.TypeConversions; import org.apache.iceberg.avro.AvroSchemaUtil; import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.formats.avro.RowDataToAvroConverters; +import org.apache.iceberg.flink.formats.avro.typeutils.AvroSchemaConverter; public class AvroGenericRecordConverter implements RowDataConverter { private final Schema avroSchema; diff --git a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/DataGenerators.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/DataGenerators.java index e2cd411d7069..795c4fa5a766 100644 --- a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/DataGenerators.java +++ b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/DataGenerators.java @@ -75,6 +75,11 @@ public static class Primitives implements DataGenerator { OffsetDateTime.of(2022, 1, 10, 0, 0, 0, 0, ZoneOffset.UTC); private static final LocalDateTime JAVA_LOCAL_DATE_TIME_20220110 = LocalDateTime.of(2022, 1, 10, 0, 0, 0); + private static final OffsetDateTime JAVA_OFFSET_DATE_TIME_MAX_NANO = + OffsetDateTime.of(2262, 4, 11, 23, 47, 16, 854_775_807, ZoneOffset.UTC); + private static final LocalDateTime JAVA_LOCAL_DATE_TIME_MAX_NANO = + LocalDateTime.of(2262, 4, 11, 23, 47, 16, 854_775_807); + private static final long ICEBERG_MAX_NANOS_EPOCH = 9223372036854775807L; private static final BigDecimal BIG_DECIMAL_NEGATIVE = new BigDecimal("-1.50"); private static final byte[] FIXED_BYTES = "012345689012345".getBytes(StandardCharsets.UTF_8); @@ -96,7 +101,11 @@ public static class Primitives implements DataGenerator { Types.NestedField.required(12, "uuid_field", Types.UUIDType.get()), 
Types.NestedField.required(13, "binary_field", Types.BinaryType.get()), Types.NestedField.required(14, "decimal_field", Types.DecimalType.of(9, 2)), - Types.NestedField.required(15, "fixed_field", Types.FixedType.ofLength(16))); + Types.NestedField.required(15, "fixed_field", Types.FixedType.ofLength(16)), + Types.NestedField.required( + 16, "ts_ns_with_zone_field", Types.TimestampNanoType.withZone()), + Types.NestedField.required( + 17, "ts_ns_without_zone_field", Types.TimestampNanoType.withoutZone())); private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); @@ -171,6 +180,8 @@ public GenericRecord generateIcebergGenericRecord() { genericRecord.setField("time_field", JAVA_LOCAL_TIME_HOUR8); genericRecord.setField("ts_with_zone_field", JAVA_OFFSET_DATE_TIME_20220110); genericRecord.setField("ts_without_zone_field", JAVA_LOCAL_DATE_TIME_20220110); + genericRecord.setField("ts_ns_with_zone_field", JAVA_OFFSET_DATE_TIME_MAX_NANO); + genericRecord.setField("ts_ns_without_zone_field", JAVA_LOCAL_DATE_TIME_MAX_NANO); byte[] uuidBytes = new byte[16]; for (int i = 0; i < 16; ++i) { @@ -220,7 +231,11 @@ public GenericRowData generateFlinkRowData() { uuidBytes, binaryBytes, DecimalData.fromBigDecimal(BIG_DECIMAL_NEGATIVE, 9, 2), - FIXED_BYTES); + FIXED_BYTES, + TimestampData.fromEpochMillis( + ICEBERG_MAX_NANOS_EPOCH / 1_000_000, (int) (ICEBERG_MAX_NANOS_EPOCH % 1_000_000)), + TimestampData.fromEpochMillis( + ICEBERG_MAX_NANOS_EPOCH / 1_000_000, (int) (ICEBERG_MAX_NANOS_EPOCH % 1_000_000))); } @Override @@ -236,10 +251,12 @@ public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { genericRecord.put("date_field", DAYS_BTW_EPOC_AND_20220110); genericRecord.put("time_field", HOUR_8_IN_MILLI); - // Although Avro logical type for timestamp fields are in micro seconds, - // AvroToRowDataConverters only looks for long value in milliseconds. 
- genericRecord.put("ts_with_zone_field", JODA_DATETIME_20220110.getMillis()); - genericRecord.put("ts_without_zone_field", JODA_DATETIME_20220110.getMillis()); + // Now that AvroToRowDataConverters correctly supports microseconds, + // we must inject correct microsecond scale values into the Avro data. + genericRecord.put("ts_with_zone_field", JODA_DATETIME_20220110.getMillis() * 1000L); + genericRecord.put("ts_without_zone_field", JODA_DATETIME_20220110.getMillis() * 1000L); + genericRecord.put("ts_ns_with_zone_field", ICEBERG_MAX_NANOS_EPOCH); + genericRecord.put("ts_ns_without_zone_field", ICEBERG_MAX_NANOS_EPOCH); byte[] uuidBytes = new byte[16]; for (int i = 0; i < 16; ++i) { @@ -554,7 +571,11 @@ public static class ArrayOfPrimitive implements DataGenerator { new Schema( Types.NestedField.required(1, "row_id", Types.StringType.get()), Types.NestedField.required( - 2, "array_of_int", Types.ListType.ofOptional(101, Types.IntegerType.get()))); + 2, "array_of_int", Types.ListType.ofOptional(101, Types.IntegerType.get())), + Types.NestedField.optional( + 3, + "array_of_ts_ns", + Types.ListType.ofRequired(102, Types.TimestampNanoType.withoutZone()))); private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); @@ -581,13 +602,33 @@ public GenericRecord generateIcebergGenericRecord() { GenericRecord genericRecord = GenericRecord.create(icebergSchema); genericRecord.setField("row_id", "row_id_value"); genericRecord.setField("array_of_int", Arrays.asList(1, 2, 3)); + + LocalDateTime posNanos = LocalDateTime.of(2023, 1, 1, 12, 0, 0, 123456789); + LocalDateTime negNanos = LocalDateTime.of(1969, 12, 31, 23, 59, 59, 987654321); + genericRecord.setField("array_of_ts_ns", Arrays.asList(posNanos, negNanos)); return genericRecord; } @Override public GenericRowData generateFlinkRowData() { Integer[] arr = {1, 2, 3}; - return GenericRowData.of(StringData.fromString("row_id_value"), new GenericArrayData(arr)); + + long posNanos = + 
org.apache.iceberg.util.DateTimeUtil.nanosFromTimestamp( + LocalDateTime.of(2023, 1, 1, 12, 0, 0, 123456789)); + long negNanos = + org.apache.iceberg.util.DateTimeUtil.nanosFromTimestamp( + LocalDateTime.of(1969, 12, 31, 23, 59, 59, 987654321)); + TimestampData[] tsArr = { + TimestampData.fromEpochMillis( + Math.floorDiv(posNanos, 1_000_000L), (int) Math.floorMod(posNanos, 1_000_000L)), + TimestampData.fromEpochMillis( + Math.floorDiv(negNanos, 1_000_000L), (int) Math.floorMod(negNanos, 1_000_000L)) + }; + return GenericRowData.of( + StringData.fromString("row_id_value"), + new GenericArrayData(arr), + new GenericArrayData(tsArr)); } @Override @@ -595,6 +636,14 @@ public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { org.apache.avro.generic.GenericRecord genericRecord = new GenericData.Record(avroSchema); genericRecord.put("row_id", "row_id_value"); genericRecord.put("array_of_int", Arrays.asList(1, 2, 3)); + + long posNanos = + org.apache.iceberg.util.DateTimeUtil.nanosFromTimestamp( + LocalDateTime.of(2023, 1, 1, 12, 0, 0, 123456789)); + long negNanos = + org.apache.iceberg.util.DateTimeUtil.nanosFromTimestamp( + LocalDateTime.of(1969, 12, 31, 23, 59, 59, 987654321)); + genericRecord.put("array_of_ts_ns", Arrays.asList(posNanos, negNanos)); return genericRecord; } } @@ -808,7 +857,12 @@ public static class MapOfPrimitives implements DataGenerator { 2, "map_of_primitives", Types.MapType.ofRequired( - 101, 102, Types.StringType.get(), Types.IntegerType.get()))); + 101, 102, Types.StringType.get(), Types.IntegerType.get())), + Types.NestedField.optional( + 3, + "map_of_ts_ns", + Types.MapType.ofRequired( + 103, 104, Types.StringType.get(), Types.TimestampNanoType.withoutZone()))); private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); @@ -835,15 +889,37 @@ public GenericRecord generateIcebergGenericRecord() { GenericRecord genericRecord = GenericRecord.create(icebergSchema); genericRecord.setField("row_id", "row_id_value"); 
genericRecord.setField("map_of_primitives", ImmutableMap.of("Jane", 1, "Joe", 2)); + + LocalDateTime posNanos = LocalDateTime.of(2023, 1, 1, 12, 0, 0, 123456789); + LocalDateTime negNanos = LocalDateTime.of(1969, 12, 31, 23, 59, 59, 987654321); + genericRecord.setField( + "map_of_ts_ns", ImmutableMap.of("positive", posNanos, "negative", negNanos)); return genericRecord; } @Override public GenericRowData generateFlinkRowData() { + long posNanos = + org.apache.iceberg.util.DateTimeUtil.nanosFromTimestamp( + LocalDateTime.of(2023, 1, 1, 12, 0, 0, 123456789)); + long negNanos = + org.apache.iceberg.util.DateTimeUtil.nanosFromTimestamp( + LocalDateTime.of(1969, 12, 31, 23, 59, 59, 987654321)); + return GenericRowData.of( StringData.fromString("row_id_value"), new GenericMapData( - ImmutableMap.of(StringData.fromString("Jane"), 1, StringData.fromString("Joe"), 2))); + ImmutableMap.of(StringData.fromString("Jane"), 1, StringData.fromString("Joe"), 2)), + new GenericMapData( + ImmutableMap.of( + StringData.fromString("positive"), + TimestampData.fromEpochMillis( + Math.floorDiv(posNanos, 1_000_000L), + (int) Math.floorMod(posNanos, 1_000_000L)), + StringData.fromString("negative"), + TimestampData.fromEpochMillis( + Math.floorDiv(negNanos, 1_000_000L), + (int) Math.floorMod(negNanos, 1_000_000L))))); } @Override @@ -851,6 +927,15 @@ public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { org.apache.avro.generic.GenericRecord genericRecord = new GenericData.Record(avroSchema); genericRecord.put("row_id", "row_id_value"); genericRecord.put("map_of_primitives", ImmutableMap.of("Jane", 1, "Joe", 2)); + + long posNanos = + org.apache.iceberg.util.DateTimeUtil.nanosFromTimestamp( + LocalDateTime.of(2023, 1, 1, 12, 0, 0, 123456789)); + long negNanos = + org.apache.iceberg.util.DateTimeUtil.nanosFromTimestamp( + LocalDateTime.of(1969, 12, 31, 23, 59, 59, 987654321)); + genericRecord.put( + "map_of_ts_ns", ImmutableMap.of("positive", posNanos, "negative", 
negNanos)); return genericRecord; } } diff --git a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/TestRowDataWrapper.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/TestRowDataWrapper.java index abcc2d1da199..0e7635a33e87 100644 --- a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/TestRowDataWrapper.java +++ b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/TestRowDataWrapper.java @@ -30,7 +30,6 @@ import org.apache.iceberg.data.Record; import org.apache.iceberg.flink.data.RandomRowData; import org.apache.iceberg.util.StructLikeWrapper; -import org.junit.jupiter.api.Disabled; public class TestRowDataWrapper extends RecordWrapperTestBase { @@ -92,16 +91,4 @@ protected void generateAndValidate( assertThat(actual).isExhausted(); assertThat(expected).isExhausted(); } - - @Disabled - @Override - public void testTimestampNanoWithoutZone() { - // Flink does not support nanosecond timestamp without zone. - } - - @Disabled - @Override - public void testTimestampNanoWithZone() { - // Flink does not support nanosecond timestamp with zone. 
- } } diff --git a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java index 4a70802f2a2e..b7b0a54156cc 100644 --- a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java +++ b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java @@ -49,6 +49,11 @@ protected boolean allowsWritingNullValuesForRequiredFields() { return true; } + @Override + protected boolean supportsTimestampNanos() { + return true; + } + @Override protected void writeAndValidate(Schema schema) throws IOException { List expectedRecords = RandomGenericData.generate(schema, NUM_RECORDS, 1990L); diff --git a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/data/TestRowDataProjection.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/data/TestRowDataProjection.java index 4e5b38ffb026..a2411da1e344 100644 --- a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/data/TestRowDataProjection.java +++ b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/data/TestRowDataProjection.java @@ -271,18 +271,19 @@ public void testMapOfPrimitivesProjection() { GenericRowData.of( StringData.fromString("row_id_value"), new GenericMapData( - ImmutableMap.of(StringData.fromString("foo"), 1, StringData.fromString("bar"), 2))); + ImmutableMap.of(StringData.fromString("foo"), 1, StringData.fromString("bar"), 2)), + null); testEqualsAndHashCode(schema, idOnly, rowData, copyRowData, otherRowData, true); testEqualsAndHashCode(schema, mapOnly, rowData, copyRowData, otherRowData); testEqualsAndHashCode(schema, schema, rowData, copyRowData, otherRowData); GenericRowData rowDataNullOptionalFields = - GenericRowData.of(StringData.fromString("row_id_value"), null); + GenericRowData.of(StringData.fromString("row_id_value"), null, null); GenericRowData copyRowDataNullOptionalFields = - 
GenericRowData.of(StringData.fromString("row_id_value"), null); + GenericRowData.of(StringData.fromString("row_id_value"), null, null); // modify the map field value GenericRowData otherRowDataNullOptionalFields = - GenericRowData.of(StringData.fromString("other_row_id_value"), null); + GenericRowData.of(StringData.fromString("other_row_id_value"), null, null); testEqualsAndHashCode( schema, idOnly, @@ -432,7 +433,8 @@ public void testArrayOfPrimitiveProjection() { GenericRowData otherRowData = GenericRowData.of( StringData.fromString("other_row_id_value"), - new GenericArrayData(new Integer[] {4, 5, 6})); + new GenericArrayData(new Integer[] {4, 5, 6}), + null); testEqualsAndHashCode(schema, idOnly, rowData, copyRowData, otherRowData); testEqualsAndHashCode(schema, arrayOnly, rowData, copyRowData, otherRowData); testEqualsAndHashCode(schema, schema, rowData, copyRowData, otherRowData); @@ -440,16 +442,19 @@ public void testArrayOfPrimitiveProjection() { GenericRowData rowDataNullOptionalFields = GenericRowData.of( StringData.fromString("row_id_value"), - new GenericArrayData(new Integer[] {1, null, 3})); + new GenericArrayData(new Integer[] {1, null, 3}), + null); GenericRowData copyRowDataNullOptionalFields = GenericRowData.of( StringData.fromString("row_id_value"), - new GenericArrayData(new Integer[] {1, null, 3})); + new GenericArrayData(new Integer[] {1, null, 3}), + null); // modify the map field value GenericRowData otherRowDataNullOptionalFields = GenericRowData.of( StringData.fromString("other_row_id_value"), - new GenericArrayData(new Integer[] {4, null, 6})); + new GenericArrayData(new Integer[] {4, null, 6}), + null); testEqualsAndHashCode( schema, idOnly, From 96d556b8b31ac9e9c2c5ed35a5972e8d040499c3 Mon Sep 17 00:00:00 2001 From: drexler-sky Date: Thu, 30 Apr 2026 09:54:23 -0700 Subject: [PATCH 136/197] Spark 4.1: Migrate SparkWriteBuilder to SupportsOverwriteV2 (#16164) --- .../iceberg/spark/source/SparkWriteBuilder.java | 12 ++++++------ 1 file 
changed, 6 insertions(+), 6 deletions(-) diff --git a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkWriteBuilder.java b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkWriteBuilder.java index 4858f7793c69..79730cd63d4b 100644 --- a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkWriteBuilder.java +++ b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkWriteBuilder.java @@ -27,10 +27,10 @@ import org.apache.iceberg.expressions.Expression; import org.apache.iceberg.expressions.Expressions; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.spark.SparkFilters; import org.apache.iceberg.spark.SparkSchemaUtil; import org.apache.iceberg.spark.SparkTableUtil; import org.apache.iceberg.spark.SparkUtil; +import org.apache.iceberg.spark.SparkV2Filters; import org.apache.iceberg.spark.SparkWriteConf; import org.apache.iceberg.spark.SparkWriteRequirements; import org.apache.iceberg.spark.source.SparkWriteBuilder.Mode.Append; @@ -39,20 +39,20 @@ import org.apache.iceberg.spark.source.SparkWriteBuilder.Mode.OverwriteByFilter; import org.apache.iceberg.types.TypeUtil; import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.connector.expressions.filter.Predicate; import org.apache.spark.sql.connector.read.Scan; import org.apache.spark.sql.connector.write.BatchWrite; import org.apache.spark.sql.connector.write.LogicalWriteInfo; import org.apache.spark.sql.connector.write.RowLevelOperation.Command; import org.apache.spark.sql.connector.write.SupportsDynamicOverwrite; -import org.apache.spark.sql.connector.write.SupportsOverwrite; +import org.apache.spark.sql.connector.write.SupportsOverwriteV2; import org.apache.spark.sql.connector.write.Write; import org.apache.spark.sql.connector.write.WriteBuilder; import org.apache.spark.sql.connector.write.streaming.StreamingWrite; -import org.apache.spark.sql.sources.Filter; import 
org.apache.spark.sql.types.StructField; import org.apache.spark.sql.types.StructType; -class SparkWriteBuilder implements WriteBuilder, SupportsDynamicOverwrite, SupportsOverwrite { +class SparkWriteBuilder implements WriteBuilder, SupportsDynamicOverwrite, SupportsOverwriteV2 { private final SparkSession spark; private final Table table; private final String branch; @@ -91,9 +91,9 @@ public WriteBuilder overwriteDynamicPartitions() { } @Override - public WriteBuilder overwrite(Filter[] filters) { + public WriteBuilder overwrite(Predicate[] predicates) { Preconditions.checkState(mode == null, "Cannot use overwrite by filter with other modes"); - Expression expr = SparkFilters.convert(filters); + Expression expr = SparkV2Filters.convert(predicates); this.mode = useDynamicOverwrite(expr) ? new DynamicOverwrite() : new OverwriteByFilter(expr); return this; } From 185da6b299edcf087b32f0b8d09249427c7ecc72 Mon Sep 17 00:00:00 2001 From: hemanthboyina Date: Fri, 1 May 2026 00:42:15 +0530 Subject: [PATCH 137/197] Core: Avoid unnecessary manifest scanning during snapshot expiration incremental cleanup (#16077) --- .../apache/iceberg/FileCleanupStrategy.java | 3 +- .../apache/iceberg/TestRemoveSnapshots.java | 43 +++++++++++++++++++ 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/core/src/main/java/org/apache/iceberg/FileCleanupStrategy.java b/core/src/main/java/org/apache/iceberg/FileCleanupStrategy.java index b55280a6537f..dd92d33cda79 100644 --- a/core/src/main/java/org/apache/iceberg/FileCleanupStrategy.java +++ b/core/src/main/java/org/apache/iceberg/FileCleanupStrategy.java @@ -84,7 +84,8 @@ public abstract void cleanFiles( "manifest_length", "partition_spec_id", "added_snapshot_id", - "deleted_data_files_count"); + "added_files_count", + "deleted_files_count"); protected CloseableIterable readManifests(Snapshot snapshot) { if (snapshot.manifestListLocation() != null) { diff --git a/core/src/test/java/org/apache/iceberg/TestRemoveSnapshots.java 
b/core/src/test/java/org/apache/iceberg/TestRemoveSnapshots.java index 384b7132ef76..09e9fdd1f722 100644 --- a/core/src/test/java/org/apache/iceberg/TestRemoveSnapshots.java +++ b/core/src/test/java/org/apache/iceberg/TestRemoveSnapshots.java @@ -2207,4 +2207,47 @@ private static PartitionStatisticsFile reusePartitionStatsFile( private static void commitPartitionStats(Table table, PartitionStatisticsFile statisticsFile) { table.updatePartitionStatistics().setPartitionStatistics(statisticsFile).commit(); } + + @TestTemplate + public void testAppendOnlyManifestsNotScannedDuringCleanup() { + assumeThat(incrementalCleanup).isTrue(); + + TestTables.LocalFileIO spyFileIO = Mockito.spy(new TestTables.LocalFileIO()); + String tableName = "testAppendOnlyManifests"; + Table testTable = + TestTables.create( + tableDir, + tableName, + SCHEMA, + SPEC, + SortOrder.unsorted(), + formatVersion, + new TestTables.TestTableOperations(tableName, tableDir, spyFileIO)); + + testTable.newAppend().appendFile(FILE_A).commit(); + Snapshot firstSnapshot = testTable.currentSnapshot(); + + Set appendOnlyManifestPaths = + firstSnapshot.allManifests(testTable.io()).stream() + .map(ManifestFile::path) + .collect(Collectors.toSet()); + + waitUntilAfter(firstSnapshot.timestampMillis()); + + testTable.newAppend().appendFile(FILE_B).commit(); + long tAfterCommits = waitUntilAfter(testTable.currentSnapshot().timestampMillis()); + + Mockito.clearInvocations(spyFileIO); + + Set deletedFiles = Sets.newHashSet(); + removeSnapshots(testTable) + .expireOlderThan(tAfterCommits) + .deleteWith(deletedFiles::add) + .commit(); + + assertThat(deletedFiles).containsExactly(firstSnapshot.manifestListLocation()); + + appendOnlyManifestPaths.forEach( + path -> Mockito.verify(spyFileIO, Mockito.never()).newInputFile(path)); + } } From 6ce50265b0b0b364491314f00606db1fe857ab15 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Thu, 30 Apr 2026 17:43:35 -0400 Subject: [PATCH 138/197] AWS: Fix stale LICENSE entry for Parquet, 
clarify failsafe attribution (#16179) Co-authored-by: Copilot --- aws-bundle/LICENSE | 38 +++++++++++++++++++++++++++++++++++--- 1 file changed, 35 insertions(+), 3 deletions(-) diff --git a/aws-bundle/LICENSE b/aws-bundle/LICENSE index 997d9652a873..46174e959393 100644 --- a/aws-bundle/LICENSE +++ b/aws-bundle/LICENSE @@ -217,9 +217,41 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Apache Parquet. +This product bundles Apache Parquet (bundled by AWS Analytics Accelerator S3). -Project URL: https://parquet.apache.org +Copyright: 2014-2024 The Apache Software Foundation +Project URL: https://parquet.apache.org/ +License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +This product bundles Apache Thrift (bundled by Parquet). + +Copyright: 2006-2017 The Apache Software Foundation. +Project URL: https://thrift.apache.org/ +License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +This product includes code from Daniel Lemire's JavaFastPFOR project (bundled by Parquet). + +Copyright: 2013 Daniel Lemire +Project URL: https://github.com/lemire/JavaFastPFOR +License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +This product bundles fastutil (bundled by Parquet). + +Copyright: 2002-2014 Sebastiano Vigna +Project URL: http://fastutil.di.unimi.it/ +License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +This product bundles Zero-Allocation Hashing (bundled by Parquet). 
+ +Project URL: https://github.com/OpenHFT/Zero-Allocation-Hashing License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 -------------------------------------------------------------------------------- @@ -295,7 +327,7 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles failsafe. +This product bundles failsafe (bundled by AWS Analytics Accelerator S3). Copyright: Jonathan Halterman and friends Project URL: https://failsafe.dev/ From 3f14731e23a7fbe6e3357bf40cd1a4ccc2f35a81 Mon Sep 17 00:00:00 2001 From: Ryan Blue Date: Fri, 1 May 2026 07:45:57 -0700 Subject: [PATCH 139/197] Open API: Remove runtime Jar from build and deploy (#16163) --- deploy.gradle | 1 - 1 file changed, 1 deletion(-) diff --git a/deploy.gradle b/deploy.gradle index 740d0056273b..65836bf1b3f1 100644 --- a/deploy.gradle +++ b/deploy.gradle @@ -75,7 +75,6 @@ subprojects { } else if (isOpenApi) { artifact testJar artifact testFixturesJar - artifact shadowJar } else { if (tasks.matching({task -> task.name == 'shadowJar'}).isEmpty()) { from components.java From 53f1f1a0a7fbdb013676358e8198f2cbd88ef296 Mon Sep 17 00:00:00 2001 From: drexler-sky Date: Fri, 1 May 2026 07:50:18 -0700 Subject: [PATCH 140/197] Spark 3.4, 3.5, 4.0: Migrate SparkWriteBuilder to SupportsOverwriteV2 (#16178) --- .../iceberg/spark/source/SparkWriteBuilder.java | 12 ++++++------ .../iceberg/spark/source/SparkWriteBuilder.java | 12 ++++++------ .../iceberg/spark/source/SparkWriteBuilder.java | 12 ++++++------ 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/source/SparkWriteBuilder.java b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/source/SparkWriteBuilder.java index e608a40b72ad..df4566da0c90 100644 --- 
a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/source/SparkWriteBuilder.java +++ b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/source/SparkWriteBuilder.java @@ -27,27 +27,27 @@ import org.apache.iceberg.expressions.Expression; import org.apache.iceberg.expressions.Expressions; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.spark.SparkFilters; import org.apache.iceberg.spark.SparkSchemaUtil; import org.apache.iceberg.spark.SparkUtil; +import org.apache.iceberg.spark.SparkV2Filters; import org.apache.iceberg.spark.SparkWriteConf; import org.apache.iceberg.spark.SparkWriteRequirements; import org.apache.iceberg.types.TypeUtil; import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.connector.expressions.filter.Predicate; import org.apache.spark.sql.connector.read.Scan; import org.apache.spark.sql.connector.write.BatchWrite; import org.apache.spark.sql.connector.write.LogicalWriteInfo; import org.apache.spark.sql.connector.write.RowLevelOperation.Command; import org.apache.spark.sql.connector.write.SupportsDynamicOverwrite; -import org.apache.spark.sql.connector.write.SupportsOverwrite; +import org.apache.spark.sql.connector.write.SupportsOverwriteV2; import org.apache.spark.sql.connector.write.Write; import org.apache.spark.sql.connector.write.WriteBuilder; import org.apache.spark.sql.connector.write.streaming.StreamingWrite; -import org.apache.spark.sql.sources.Filter; import org.apache.spark.sql.types.LongType$; import org.apache.spark.sql.types.StructType; -class SparkWriteBuilder implements WriteBuilder, SupportsDynamicOverwrite, SupportsOverwrite { +class SparkWriteBuilder implements WriteBuilder, SupportsDynamicOverwrite, SupportsOverwriteV2 { private final SparkSession spark; private final Table table; private final SparkWriteConf writeConf; @@ -100,12 +100,12 @@ public WriteBuilder overwriteDynamicPartitions() { } @Override - public WriteBuilder overwrite(Filter[] 
filters) { + public WriteBuilder overwrite(Predicate[] predicates) { Preconditions.checkState( !overwriteFiles, "Cannot overwrite individual files and using filters"); Preconditions.checkState(rewrittenFileSetId == null, "Cannot overwrite and rewrite"); - this.overwriteExpr = SparkFilters.convert(filters); + this.overwriteExpr = SparkV2Filters.convert(predicates); if (overwriteExpr == Expressions.alwaysTrue() && "dynamic".equals(overwriteMode)) { // use the write option to override truncating the table. use dynamic overwrite instead. this.overwriteDynamic = true; diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkWriteBuilder.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkWriteBuilder.java index e608a40b72ad..df4566da0c90 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkWriteBuilder.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkWriteBuilder.java @@ -27,27 +27,27 @@ import org.apache.iceberg.expressions.Expression; import org.apache.iceberg.expressions.Expressions; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.spark.SparkFilters; import org.apache.iceberg.spark.SparkSchemaUtil; import org.apache.iceberg.spark.SparkUtil; +import org.apache.iceberg.spark.SparkV2Filters; import org.apache.iceberg.spark.SparkWriteConf; import org.apache.iceberg.spark.SparkWriteRequirements; import org.apache.iceberg.types.TypeUtil; import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.connector.expressions.filter.Predicate; import org.apache.spark.sql.connector.read.Scan; import org.apache.spark.sql.connector.write.BatchWrite; import org.apache.spark.sql.connector.write.LogicalWriteInfo; import org.apache.spark.sql.connector.write.RowLevelOperation.Command; import org.apache.spark.sql.connector.write.SupportsDynamicOverwrite; -import org.apache.spark.sql.connector.write.SupportsOverwrite; 
+import org.apache.spark.sql.connector.write.SupportsOverwriteV2; import org.apache.spark.sql.connector.write.Write; import org.apache.spark.sql.connector.write.WriteBuilder; import org.apache.spark.sql.connector.write.streaming.StreamingWrite; -import org.apache.spark.sql.sources.Filter; import org.apache.spark.sql.types.LongType$; import org.apache.spark.sql.types.StructType; -class SparkWriteBuilder implements WriteBuilder, SupportsDynamicOverwrite, SupportsOverwrite { +class SparkWriteBuilder implements WriteBuilder, SupportsDynamicOverwrite, SupportsOverwriteV2 { private final SparkSession spark; private final Table table; private final SparkWriteConf writeConf; @@ -100,12 +100,12 @@ public WriteBuilder overwriteDynamicPartitions() { } @Override - public WriteBuilder overwrite(Filter[] filters) { + public WriteBuilder overwrite(Predicate[] predicates) { Preconditions.checkState( !overwriteFiles, "Cannot overwrite individual files and using filters"); Preconditions.checkState(rewrittenFileSetId == null, "Cannot overwrite and rewrite"); - this.overwriteExpr = SparkFilters.convert(filters); + this.overwriteExpr = SparkV2Filters.convert(predicates); if (overwriteExpr == Expressions.alwaysTrue() && "dynamic".equals(overwriteMode)) { // use the write option to override truncating the table. use dynamic overwrite instead. 
this.overwriteDynamic = true; diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkWriteBuilder.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkWriteBuilder.java index 89af7740d988..c1867433fd8d 100644 --- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkWriteBuilder.java +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkWriteBuilder.java @@ -27,27 +27,27 @@ import org.apache.iceberg.expressions.Expression; import org.apache.iceberg.expressions.Expressions; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.spark.SparkFilters; import org.apache.iceberg.spark.SparkSchemaUtil; import org.apache.iceberg.spark.SparkUtil; +import org.apache.iceberg.spark.SparkV2Filters; import org.apache.iceberg.spark.SparkWriteConf; import org.apache.iceberg.spark.SparkWriteRequirements; import org.apache.iceberg.types.TypeUtil; import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.connector.expressions.filter.Predicate; import org.apache.spark.sql.connector.read.Scan; import org.apache.spark.sql.connector.write.BatchWrite; import org.apache.spark.sql.connector.write.LogicalWriteInfo; import org.apache.spark.sql.connector.write.RowLevelOperation.Command; import org.apache.spark.sql.connector.write.SupportsDynamicOverwrite; -import org.apache.spark.sql.connector.write.SupportsOverwrite; +import org.apache.spark.sql.connector.write.SupportsOverwriteV2; import org.apache.spark.sql.connector.write.Write; import org.apache.spark.sql.connector.write.WriteBuilder; import org.apache.spark.sql.connector.write.streaming.StreamingWrite; -import org.apache.spark.sql.sources.Filter; import org.apache.spark.sql.types.LongType$; import org.apache.spark.sql.types.StructType; -class SparkWriteBuilder implements WriteBuilder, SupportsDynamicOverwrite, SupportsOverwrite { +class SparkWriteBuilder implements WriteBuilder, 
SupportsDynamicOverwrite, SupportsOverwriteV2 { private final SparkSession spark; private final Table table; private final SparkWriteConf writeConf; @@ -100,12 +100,12 @@ public WriteBuilder overwriteDynamicPartitions() { } @Override - public WriteBuilder overwrite(Filter[] filters) { + public WriteBuilder overwrite(Predicate[] predicates) { Preconditions.checkState( !overwriteFiles, "Cannot overwrite individual files and using filters"); Preconditions.checkState(rewrittenFileSetId == null, "Cannot overwrite and rewrite"); - this.overwriteExpr = SparkFilters.convert(filters); + this.overwriteExpr = SparkV2Filters.convert(predicates); if (overwriteExpr == Expressions.alwaysTrue() && "dynamic".equals(overwriteMode)) { // use the write option to override truncating the table. use dynamic overwrite instead. this.overwriteDynamic = true; From 737f043f53b695a387b3e22745a065945ca63256 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 1 May 2026 17:35:02 -0700 Subject: [PATCH 141/197] Build: Bump datamodel-code-generator from 0.56.0 to 0.56.1 (#16114) Bumps [datamodel-code-generator](https://github.com/koxudaxi/datamodel-code-generator) from 0.56.0 to 0.56.1. - [Release notes](https://github.com/koxudaxi/datamodel-code-generator/releases) - [Changelog](https://github.com/koxudaxi/datamodel-code-generator/blob/main/CHANGELOG.md) - [Commits](https://github.com/koxudaxi/datamodel-code-generator/compare/0.56.0...0.56.1) --- updated-dependencies: - dependency-name: datamodel-code-generator dependency-version: 0.56.1 dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- open-api/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/open-api/requirements.txt b/open-api/requirements.txt index 1e19f4b303a7..3a00012d4ad1 100644 --- a/open-api/requirements.txt +++ b/open-api/requirements.txt @@ -16,5 +16,5 @@ # under the License. openapi-spec-validator==0.8.4 -datamodel-code-generator==0.56.0 +datamodel-code-generator==0.56.1 yamllint==1.38.0 From 128f656c0ecc908aaf6d08744f887cdf2f7d3876 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Sat, 2 May 2026 14:31:47 -0400 Subject: [PATCH 142/197] AWS: remove extra/staled LICENSE entry bundled by Parquet (#16180) --- aws-bundle/LICENSE | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/aws-bundle/LICENSE b/aws-bundle/LICENSE index 46174e959393..f34a7e250c39 100644 --- a/aws-bundle/LICENSE +++ b/aws-bundle/LICENSE @@ -233,29 +233,6 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product includes code from Daniel Lemire's JavaFastPFOR project (bundled by Parquet). - -Copyright: 2013 Daniel Lemire -Project URL: https://github.com/lemire/JavaFastPFOR -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles fastutil (bundled by Parquet). - -Copyright: 2002-2014 Sebastiano Vigna -Project URL: http://fastutil.di.unimi.it/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Zero-Allocation Hashing (bundled by Parquet). 
- -Project URL: https://github.com/OpenHFT/Zero-Allocation-Hashing -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - This product bundles Netty. Project URL: https://netty.io/ From 76283899d2c671082b402cd2bb476a29c77ba009 Mon Sep 17 00:00:00 2001 From: Prashant Singh <35593236+singhpk234@users.noreply.github.com> Date: Sat, 2 May 2026 15:34:51 -0700 Subject: [PATCH 143/197] Core: Propagate server error message in failed remote scan planning responses (#16024) --- .../apache/iceberg/rest/RESTTableScan.java | 45 ++++++--- .../rest/responses/ErrorResponseParser.java | 7 +- .../FetchPlanningResultResponse.java | 24 ++++- .../FetchPlanningResultResponseParser.java | 12 +++ .../rest/responses/PlanTableScanResponse.java | 18 ++++ .../PlanTableScanResponseParser.java | 11 +++ .../iceberg/rest/TestRESTScanPlanning.java | 97 +++++++++++++++++++ ...TestFetchPlanningResultResponseParser.java | 66 +++++++++++++ .../TestPlanTableScanResponseParser.java | 68 +++++++++++++ 9 files changed, 331 insertions(+), 17 deletions(-) diff --git a/core/src/main/java/org/apache/iceberg/rest/RESTTableScan.java b/core/src/main/java/org/apache/iceberg/rest/RESTTableScan.java index 2a39bf1105d8..b26e60481b34 100644 --- a/core/src/main/java/org/apache/iceberg/rest/RESTTableScan.java +++ b/core/src/main/java/org/apache/iceberg/rest/RESTTableScan.java @@ -48,6 +48,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.rest.credentials.Credential; import org.apache.iceberg.rest.requests.PlanTableScanRequest; +import org.apache.iceberg.rest.responses.ErrorResponse; import org.apache.iceberg.rest.responses.FetchPlanningResultResponse; import org.apache.iceberg.rest.responses.PlanTableScanResponse; import org.apache.iceberg.types.TypeUtil; @@ -218,11 +219,7 @@ private CloseableIterable planTableScan(PlanTableScanRequest planT 
Endpoint.check(supportedEndpoints, Endpoint.V1_FETCH_TABLE_SCAN_PLAN); return fetchPlanningResult(); case FAILED: - throw new IllegalStateException( - String.format("Received status: %s for planId: %s", PlanStatus.FAILED, planId)); - case CANCELLED: - throw new IllegalStateException( - String.format("Received status: %s for planId: %s", PlanStatus.CANCELLED, planId)); + throw new IllegalStateException(failureMessage(planId, response.errorResponse())); default: throw new IllegalStateException( String.format("Invalid planStatus: %s for planId: %s", planStatus, planId)); @@ -284,15 +281,26 @@ private CloseableIterable fetchPlanningResult() { ErrorHandlers.planErrorHandler(), parserContext); - if (response.planStatus() == PlanStatus.SUBMITTED) { - throw new NotCompleteException(); - } else if (response.planStatus() != PlanStatus.COMPLETED) { - throw new IllegalStateException( - String.format( - "Invalid planStatus: %s for planId: %s", response.planStatus(), id)); + switch (response.planStatus()) { + case COMPLETED: + result.set(response); + break; + case SUBMITTED: + throw new NotCompleteException(); + case FAILED: + throw new IllegalStateException(failureMessage(id, response.errorResponse())); + case CANCELLED: + throw new IllegalStateException( + String.format( + Locale.ROOT, "Remote scan planning cancelled for planId: %s", id)); + default: + throw new IllegalStateException( + String.format( + Locale.ROOT, + "Invalid planStatus: %s for planId: %s", + response.planStatus(), + id)); } - - result.set(response); }); } catch (NotCompleteException e) { throw new RemotePlanTimeoutException( @@ -314,6 +322,17 @@ private CloseableIterable fetchPlanningResult() { return scanTasksIterable(response.planTasks(), response.fileScanTasks()); } + private static String failureMessage(String planId, ErrorResponse error) { + Preconditions.checkArgument(error != null, "Error must be present for failed status"); + return String.format( + Locale.ROOT, + "Remote scan planning failed for 
planId: %s: %s (code=%d): %s", + planId, + error.type(), + error.code(), + error.message()); + } + private CloseableIterable scanTasksIterable( List planTasks, List fileScanTasks) { if (planTasks != null && !planTasks.isEmpty()) { diff --git a/core/src/main/java/org/apache/iceberg/rest/responses/ErrorResponseParser.java b/core/src/main/java/org/apache/iceberg/rest/responses/ErrorResponseParser.java index 31ad0573b107..1329c074ab29 100644 --- a/core/src/main/java/org/apache/iceberg/rest/responses/ErrorResponseParser.java +++ b/core/src/main/java/org/apache/iceberg/rest/responses/ErrorResponseParser.java @@ -46,9 +46,12 @@ public static String toJson(ErrorResponse errorResponse, boolean pretty) { public static void toJson(ErrorResponse errorResponse, JsonGenerator generator) throws IOException { generator.writeStartObject(); + writeError(errorResponse, generator); + generator.writeEndObject(); + } + static void writeError(ErrorResponse errorResponse, JsonGenerator generator) throws IOException { generator.writeObjectFieldStart(ERROR); - generator.writeStringField(MESSAGE, errorResponse.message()); generator.writeStringField(TYPE, errorResponse.type()); generator.writeNumberField(CODE, errorResponse.code()); @@ -57,8 +60,6 @@ public static void toJson(ErrorResponse errorResponse, JsonGenerator generator) } generator.writeEndObject(); - - generator.writeEndObject(); } /** diff --git a/core/src/main/java/org/apache/iceberg/rest/responses/FetchPlanningResultResponse.java b/core/src/main/java/org/apache/iceberg/rest/responses/FetchPlanningResultResponse.java index 59db196244f5..2e176aac653f 100644 --- a/core/src/main/java/org/apache/iceberg/rest/responses/FetchPlanningResultResponse.java +++ b/core/src/main/java/org/apache/iceberg/rest/responses/FetchPlanningResultResponse.java @@ -31,10 +31,12 @@ public class FetchPlanningResultResponse extends BaseScanTaskResponse { private final PlanStatus planStatus; + private final ErrorResponse errorResponse; private final List 
credentials; private FetchPlanningResultResponse( PlanStatus planStatus, + ErrorResponse errorResponse, List planTasks, List fileScanTasks, List deleteFiles, @@ -42,6 +44,7 @@ private FetchPlanningResultResponse( List credentials) { super(planTasks, fileScanTasks, deleteFiles, specsById); this.planStatus = planStatus; + this.errorResponse = errorResponse; this.credentials = credentials; validate(); } @@ -50,6 +53,10 @@ public PlanStatus planStatus() { return planStatus; } + public ErrorResponse errorResponse() { + return errorResponse; + } + public List credentials() { return credentials != null ? credentials : ImmutableList.of(); } @@ -64,6 +71,9 @@ public void validate() { Preconditions.checkArgument( planStatus() == PlanStatus.COMPLETED || (planTasks() == null && fileScanTasks() == null), "Invalid response: tasks can only be returned in a 'completed' status"); + Preconditions.checkArgument( + planStatus() == PlanStatus.FAILED || errorResponse() == null, + "Invalid response: error can only be returned in a 'failed' status"); if (fileScanTasks() == null || fileScanTasks().isEmpty()) { Preconditions.checkArgument( (deleteFiles() == null || deleteFiles().isEmpty()), @@ -76,6 +86,7 @@ public static class Builder private Builder() {} private PlanStatus planStatus; + private ErrorResponse errorResponse; private final List credentials = Lists.newArrayList(); public Builder withPlanStatus(PlanStatus status) { @@ -83,6 +94,11 @@ public Builder withPlanStatus(PlanStatus status) { return this; } + public Builder withErrorResponse(ErrorResponse response) { + this.errorResponse = response; + return this; + } + public Builder withCredentials(List credentialsToAdd) { credentials.addAll(credentialsToAdd); return this; @@ -91,7 +107,13 @@ public Builder withCredentials(List credentialsToAdd) { @Override public FetchPlanningResultResponse build() { return new FetchPlanningResultResponse( - planStatus, planTasks(), fileScanTasks(), deleteFiles(), specsById(), credentials); + 
planStatus, + errorResponse, + planTasks(), + fileScanTasks(), + deleteFiles(), + specsById(), + credentials); } } } diff --git a/core/src/main/java/org/apache/iceberg/rest/responses/FetchPlanningResultResponseParser.java b/core/src/main/java/org/apache/iceberg/rest/responses/FetchPlanningResultResponseParser.java index 4a523d3c023b..aa74049ab9f0 100644 --- a/core/src/main/java/org/apache/iceberg/rest/responses/FetchPlanningResultResponseParser.java +++ b/core/src/main/java/org/apache/iceberg/rest/responses/FetchPlanningResultResponseParser.java @@ -38,6 +38,7 @@ public class FetchPlanningResultResponseParser { private static final String STATUS = "status"; private static final String PLAN_TASKS = "plan-tasks"; private static final String STORAGE_CREDENTIALS = "storage-credentials"; + private static final String ERROR = "error"; private FetchPlanningResultResponseParser() {} @@ -58,6 +59,11 @@ public static void toJson(FetchPlanningResultResponse response, JsonGenerator ge "Cannot serialize fileScanTasks in fetchingPlanningResultResponse without specsById"); gen.writeStartObject(); gen.writeStringField(STATUS, response.planStatus().status()); + + if (response.errorResponse() != null) { + ErrorResponseParser.writeError(response.errorResponse(), gen); + } + if (response.planTasks() != null) { JsonUtil.writeStringArray(PLAN_TASKS, response.planTasks(), gen); } @@ -90,6 +96,11 @@ public static FetchPlanningResultResponse fromJson( json != null && !json.isEmpty(), "Invalid fetchPlanningResult response: null or empty"); PlanStatus planStatus = PlanStatus.fromName(JsonUtil.getString(STATUS, json)); + ErrorResponse errorResponse = null; + if (json.has(ERROR) && json.get(ERROR).isObject()) { + errorResponse = ErrorResponseParser.fromJson(json); + } + List planTasks = JsonUtil.getStringListOrNull(PLAN_TASKS, json); List deleteFiles = TableScanResponseParser.parseDeleteFiles(json, specsById); List fileScanTasks = @@ -98,6 +109,7 @@ public static FetchPlanningResultResponse 
fromJson( FetchPlanningResultResponse.Builder builder = FetchPlanningResultResponse.builder() .withPlanStatus(planStatus) + .withErrorResponse(errorResponse) .withPlanTasks(planTasks) .withFileScanTasks(fileScanTasks) .withSpecsById(specsById); diff --git a/core/src/main/java/org/apache/iceberg/rest/responses/PlanTableScanResponse.java b/core/src/main/java/org/apache/iceberg/rest/responses/PlanTableScanResponse.java index 1b4bb86e65eb..d0ac222c3052 100644 --- a/core/src/main/java/org/apache/iceberg/rest/responses/PlanTableScanResponse.java +++ b/core/src/main/java/org/apache/iceberg/rest/responses/PlanTableScanResponse.java @@ -33,11 +33,13 @@ public class PlanTableScanResponse extends BaseScanTaskResponse { private final PlanStatus planStatus; private final String planId; + private final ErrorResponse errorResponse; private final List credentials; private PlanTableScanResponse( PlanStatus planStatus, String planId, + ErrorResponse errorResponse, List planTasks, List fileScanTasks, List deleteFiles, @@ -46,6 +48,7 @@ private PlanTableScanResponse( super(planTasks, fileScanTasks, deleteFiles, specsById); this.planStatus = planStatus; this.planId = planId; + this.errorResponse = errorResponse; this.credentials = credentials; validate(); } @@ -58,6 +61,10 @@ public String planId() { return planId; } + public ErrorResponse errorResponse() { + return errorResponse; + } + public List credentials() { return credentials != null ? 
credentials : ImmutableList.of(); } @@ -86,6 +93,10 @@ public void validate() { planStatus() == PlanStatus.COMPLETED || (planTasks() == null && fileScanTasks() == null), "Invalid response: tasks can only be defined when status is '%s'", PlanStatus.COMPLETED.status()); + Preconditions.checkArgument( + planStatus() == PlanStatus.FAILED || errorResponse() == null, + "Invalid response: error can only be defined when status is '%s'", + PlanStatus.FAILED.status()); if (null != planId()) { Preconditions.checkArgument( planStatus() == PlanStatus.SUBMITTED || planStatus() == PlanStatus.COMPLETED, @@ -108,6 +119,7 @@ public static Builder builder() { public static class Builder extends BaseScanTaskResponse.Builder { private PlanStatus planStatus; private String planId; + private ErrorResponse errorResponse; private final List credentials = Lists.newArrayList(); /** @@ -127,6 +139,11 @@ public Builder withPlanId(String id) { return this; } + public Builder withErrorResponse(ErrorResponse response) { + this.errorResponse = response; + return this; + } + public Builder withCredentials(List credentialsToAdd) { credentials.addAll(credentialsToAdd); return this; @@ -137,6 +154,7 @@ public PlanTableScanResponse build() { return new PlanTableScanResponse( planStatus, planId, + errorResponse, planTasks(), fileScanTasks(), deleteFiles(), diff --git a/core/src/main/java/org/apache/iceberg/rest/responses/PlanTableScanResponseParser.java b/core/src/main/java/org/apache/iceberg/rest/responses/PlanTableScanResponseParser.java index c2f47b86d3f0..8ca199397ea6 100644 --- a/core/src/main/java/org/apache/iceberg/rest/responses/PlanTableScanResponseParser.java +++ b/core/src/main/java/org/apache/iceberg/rest/responses/PlanTableScanResponseParser.java @@ -39,6 +39,7 @@ public class PlanTableScanResponseParser { private static final String PLAN_ID = "plan-id"; private static final String PLAN_TASKS = "plan-tasks"; private static final String STORAGE_CREDENTIALS = "storage-credentials"; + private 
static final String ERROR = "error"; private PlanTableScanResponseParser() {} @@ -60,6 +61,10 @@ public static void toJson(PlanTableScanResponse response, JsonGenerator gen) thr gen.writeStartObject(); gen.writeStringField(STATUS, response.planStatus().status()); + if (response.errorResponse() != null) { + ErrorResponseParser.writeError(response.errorResponse(), gen); + } + if (response.planId() != null) { gen.writeStringField(PLAN_ID, response.planId()); } @@ -98,6 +103,11 @@ public static PlanTableScanResponse fromJson( "Cannot parse planTableScan response from empty or null object"); PlanStatus planStatus = PlanStatus.fromName(JsonUtil.getString(STATUS, json)); + ErrorResponse errorResponse = null; + if (json.has(ERROR) && json.get(ERROR).isObject()) { + errorResponse = ErrorResponseParser.fromJson(json); + } + String planId = JsonUtil.getStringOrNull(PLAN_ID, json); List planTasks = JsonUtil.getStringListOrNull(PLAN_TASKS, json); List deleteFiles = TableScanResponseParser.parseDeleteFiles(json, specsById); @@ -108,6 +118,7 @@ public static PlanTableScanResponse fromJson( PlanTableScanResponse.builder() .withPlanId(planId) .withPlanStatus(planStatus) + .withErrorResponse(errorResponse) .withPlanTasks(planTasks) .withFileScanTasks(fileScanTasks) .withSpecsById(specsById); diff --git a/core/src/test/java/org/apache/iceberg/rest/TestRESTScanPlanning.java b/core/src/test/java/org/apache/iceberg/rest/TestRESTScanPlanning.java index 6fc67727bf23..214edc6b901e 100644 --- a/core/src/test/java/org/apache/iceberg/rest/TestRESTScanPlanning.java +++ b/core/src/test/java/org/apache/iceberg/rest/TestRESTScanPlanning.java @@ -1284,6 +1284,103 @@ public void asyncPlanningRejectsInvalidTimeout() { .hasMessageContaining("must be positive"); } + @ParameterizedTest + @EnumSource(PlanningMode.class) + public void planningFailsWithServerError( + Function planMode) { + ErrorResponse serverError = + ErrorResponse.builder() + .withMessage("table too large to plan") + 
.withType("IllegalStateException") + .responseCode(500) + .build(); + + TestPlanningBehavior behavior = planMode.apply(TestPlanningBehavior.builder()).build(); + CatalogWithAdapter catalogWithAdapter = + catalogThatFailsPlanning(serverError, behavior, "test-planning-failed"); + + RESTTable table = restTableFor(catalogWithAdapter.catalog, "planning_failed_test"); + setParserContext(table); + RESTTableScan scan = restTableScanFor(table); + + assertThatThrownBy(scan::planFiles) + .isInstanceOf(IllegalStateException.class) + .hasMessageContaining("Remote scan planning failed") + .hasMessageContaining(serverError.type()) + .hasMessageContaining("code=" + serverError.code()) + .hasMessageContaining(serverError.message()); + } + + private CatalogWithAdapter catalogThatFailsPlanning( + ErrorResponse serverError, TestPlanningBehavior behavior, String catalogName) { + List endpoints = + endpointsWithPlanning( + Endpoint.V1_SUBMIT_TABLE_SCAN_PLAN, + Endpoint.V1_FETCH_TABLE_SCAN_PLAN, + Endpoint.V1_CANCEL_TABLE_SCAN_PLAN, + Endpoint.V1_FETCH_TABLE_SCAN_PLAN_TASKS); + + RESTCatalogAdapter adapter = + Mockito.spy( + new RESTCatalogAdapter(backendCatalog) { + @Override + public T execute( + HTTPRequest request, + Class responseType, + Consumer errorHandler, + Consumer> responseHeaders, + ParserContext parserContext) { + if (ResourcePaths.config().equals(request.path())) { + return castResponse( + responseType, ConfigResponse.builder().withEndpoints(endpoints).build()); + } + T response = + super.execute( + request, responseType, errorHandler, responseHeaders, parserContext); + if (response instanceof LoadTableResponse) { + return castResponse( + responseType, + withPlanningMode( + (LoadTableResponse) response, + RESTCatalogProperties.ScanPlanningMode.SERVER.modeName())); + } + // Leave SUBMITTED untouched so async mode polls and hits the fetch below. 
+ if (response instanceof PlanTableScanResponse planResp + && planResp.planStatus() == PlanStatus.COMPLETED) { + return castResponse( + responseType, + PlanTableScanResponse.builder() + .withPlanStatus(PlanStatus.FAILED) + .withErrorResponse(serverError) + .withSpecsById(planResp.specsById()) + .build()); + } + if (response instanceof FetchPlanningResultResponse) { + return castResponse( + responseType, + FetchPlanningResultResponse.builder() + .withPlanStatus(PlanStatus.FAILED) + .withErrorResponse(serverError) + .build()); + } + return response; + } + }); + + adapter.setPlanningBehavior(behavior); + + RESTCatalog catalog = + new RESTCatalog(SessionCatalog.SessionContext.createEmpty(), (config) -> adapter); + catalog.initialize( + catalogName, + ImmutableMap.of( + CatalogProperties.FILE_IO_IMPL, + "org.apache.iceberg.inmemory.InMemoryFileIO", + RESTCatalogProperties.SCAN_PLANNING_MODE, + RESTCatalogProperties.ScanPlanningMode.SERVER.modeName())); + return new CatalogWithAdapter(catalog, adapter); + } + @ParameterizedTest @EnumSource(PlanningMode.class) void fileIOForRemotePlanningIsPropagated( diff --git a/core/src/test/java/org/apache/iceberg/rest/responses/TestFetchPlanningResultResponseParser.java b/core/src/test/java/org/apache/iceberg/rest/responses/TestFetchPlanningResultResponseParser.java index 5fdfdc281f4f..841083f88baf 100644 --- a/core/src/test/java/org/apache/iceberg/rest/responses/TestFetchPlanningResultResponseParser.java +++ b/core/src/test/java/org/apache/iceberg/rest/responses/TestFetchPlanningResultResponseParser.java @@ -330,4 +330,70 @@ public void roundTripSerdeWithCredentials() { assertThat(FetchPlanningResultResponseParser.toJson(copyResponse, true)) .isEqualTo(expectedJson); } + + @Test + public void roundTripSerdeWithFailedStatusAndErrorResponse() { + ErrorResponse errorResponse = + ErrorResponse.builder() + .withMessage("Scan planning failed: table too large to plan") + .withType("IllegalStateException") + .responseCode(500) + .build(); + 
+ FetchPlanningResultResponse response = + FetchPlanningResultResponse.builder() + .withPlanStatus(PlanStatus.FAILED) + .withErrorResponse(errorResponse) + .build(); + + String expectedJson = + "{\"status\":\"failed\"," + + "\"error\":{\"message\":\"Scan planning failed: table too large to plan\"," + + "\"type\":\"IllegalStateException\",\"code\":500}}"; + String json = FetchPlanningResultResponseParser.toJson(response); + assertThat(json).isEqualTo(expectedJson); + + FetchPlanningResultResponse fromResponse = + FetchPlanningResultResponseParser.fromJson(json, PARTITION_SPECS_BY_ID, false); + assertThat(fromResponse.planStatus()).isEqualTo(PlanStatus.FAILED); + assertThat(fromResponse.errorResponse()).isNotNull(); + assertThat(fromResponse.errorResponse().message()) + .isEqualTo("Scan planning failed: table too large to plan"); + assertThat(fromResponse.errorResponse().type()).isEqualTo("IllegalStateException"); + assertThat(fromResponse.errorResponse().code()).isEqualTo(500); + } + + @Test + public void parseFailedStatusWithoutErrorObject() { + // Spec requires an `error` object on failed responses, but parse leniently so + // a non-compliant server still surfaces the failure to the client. 
+ String json = "{\"status\":\"failed\"}"; + FetchPlanningResultResponse response = + FetchPlanningResultResponseParser.fromJson(json, PARTITION_SPECS_BY_ID, false); + assertThat(response.planStatus()).isEqualTo(PlanStatus.FAILED); + assertThat(response.errorResponse()).isNull(); + } + + @Test + public void parseFailedStatusWithPrimitiveErrorField() { + String json = "{\"status\":\"failed\",\"error\":\"oops\"}"; + FetchPlanningResultResponse response = + FetchPlanningResultResponseParser.fromJson(json, PARTITION_SPECS_BY_ID, false); + assertThat(response.planStatus()).isEqualTo(PlanStatus.FAILED); + assertThat(response.errorResponse()).isNull(); + } + + @Test + public void cannotBuildWithErrorResponseWhenStatusIsNotFailed() { + ErrorResponse errorResponse = + ErrorResponse.builder().withMessage("boom").withType("X").responseCode(500).build(); + assertThatThrownBy( + () -> + FetchPlanningResultResponse.builder() + .withPlanStatus(PlanStatus.COMPLETED) + .withErrorResponse(errorResponse) + .build()) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Invalid response: error can only be returned in a 'failed' status"); + } } diff --git a/core/src/test/java/org/apache/iceberg/rest/responses/TestPlanTableScanResponseParser.java b/core/src/test/java/org/apache/iceberg/rest/responses/TestPlanTableScanResponseParser.java index 454e838bcca2..6354e7bf246f 100644 --- a/core/src/test/java/org/apache/iceberg/rest/responses/TestPlanTableScanResponseParser.java +++ b/core/src/test/java/org/apache/iceberg/rest/responses/TestPlanTableScanResponseParser.java @@ -648,4 +648,72 @@ public void roundTripSerdeWithValidStatusAndFileScanTasksAndCredentials() { assertThat(PlanTableScanResponseParser.toJson(copyResponse, true)).isEqualTo(expectedJson); } + + @Test + public void roundTripSerdeWithFailedStatusAndErrorResponse() { + ErrorResponse errorResponse = + ErrorResponse.builder() + .withMessage("Scan planning failed: table too large to plan") + 
.withType("IllegalStateException") + .responseCode(500) + .build(); + + PlanTableScanResponse response = + PlanTableScanResponse.builder() + .withPlanStatus(PlanStatus.FAILED) + .withErrorResponse(errorResponse) + .withSpecsById(PARTITION_SPECS_BY_ID) + .build(); + + String expectedJson = + "{\"status\":\"failed\"," + + "\"error\":{\"message\":\"Scan planning failed: table too large to plan\"," + + "\"type\":\"IllegalStateException\",\"code\":500}}"; + String json = PlanTableScanResponseParser.toJson(response); + assertThat(json).isEqualTo(expectedJson); + + PlanTableScanResponse fromResponse = + PlanTableScanResponseParser.fromJson(json, PARTITION_SPECS_BY_ID, false); + assertThat(fromResponse.planStatus()).isEqualTo(PlanStatus.FAILED); + assertThat(fromResponse.errorResponse()).isNotNull(); + assertThat(fromResponse.errorResponse().message()) + .isEqualTo("Scan planning failed: table too large to plan"); + assertThat(fromResponse.errorResponse().type()).isEqualTo("IllegalStateException"); + assertThat(fromResponse.errorResponse().code()).isEqualTo(500); + } + + @Test + public void parseFailedStatusWithoutErrorObject() { + // Spec requires an `error` object on failed responses, but parse leniently so + // a non-compliant server still surfaces the failure to the client. 
+ String json = "{\"status\":\"failed\"}"; + PlanTableScanResponse response = + PlanTableScanResponseParser.fromJson(json, PARTITION_SPECS_BY_ID, false); + assertThat(response.planStatus()).isEqualTo(PlanStatus.FAILED); + assertThat(response.errorResponse()).isNull(); + } + + @Test + public void parseFailedStatusWithPrimitiveErrorField() { + String json = "{\"status\":\"failed\",\"error\":\"oops\"}"; + PlanTableScanResponse response = + PlanTableScanResponseParser.fromJson(json, PARTITION_SPECS_BY_ID, false); + assertThat(response.planStatus()).isEqualTo(PlanStatus.FAILED); + assertThat(response.errorResponse()).isNull(); + } + + @Test + public void cannotBuildWithErrorResponseWhenStatusIsNotFailed() { + ErrorResponse errorResponse = + ErrorResponse.builder().withMessage("boom").withType("X").responseCode(500).build(); + assertThatThrownBy( + () -> + PlanTableScanResponse.builder() + .withPlanStatus(PlanStatus.COMPLETED) + .withErrorResponse(errorResponse) + .withSpecsById(PARTITION_SPECS_BY_ID) + .build()) + .isInstanceOf(IllegalArgumentException.class) + .hasMessage("Invalid response: error can only be defined when status is 'failed'"); + } } From 6d7ab339aad435d3050c05ed089b17aa508d7268 Mon Sep 17 00:00:00 2001 From: Prashant Singh <35593236+singhpk234@users.noreply.github.com> Date: Sat, 2 May 2026 20:49:19 -0700 Subject: [PATCH 144/197] Core: Surface failed scan planning even when server omits error payload (#16197) * Core: Surface failed scan planning even when server omits error payload Follow-up to #16024. The spec requires an ErrorResponse with a FAILED plan status, but if a server violates that, the client should still give the user a meaningful failure message rather than throw an IllegalArgumentException on top of an already-broken response. Replace the precondition check with per-field fallbacks ("unknown" / code 0), preserving the full message when the server conforms and degrading gracefully otherwise. 
Addresses https://github.com/apache/iceberg/pull/16024#discussion_r3177313116 * Core: Shorten lenient-failure comment per review feedback --------- Co-authored-by: Prashant Singh --- .../apache/iceberg/rest/RESTTableScan.java | 12 +++++++---- .../iceberg/rest/TestRESTScanPlanning.java | 21 +++++++++++++++++++ 2 files changed, 29 insertions(+), 4 deletions(-) diff --git a/core/src/main/java/org/apache/iceberg/rest/RESTTableScan.java b/core/src/main/java/org/apache/iceberg/rest/RESTTableScan.java index b26e60481b34..9fa273ca169f 100644 --- a/core/src/main/java/org/apache/iceberg/rest/RESTTableScan.java +++ b/core/src/main/java/org/apache/iceberg/rest/RESTTableScan.java @@ -323,14 +323,18 @@ private CloseableIterable fetchPlanningResult() { } private static String failureMessage(String planId, ErrorResponse error) { - Preconditions.checkArgument(error != null, "Error must be present for failed status"); + // If a FAILED response lacks the expected error payload, still return a useful error + // message instead of throwing. + String type = error != null ? error.type() : "unknown"; + int code = error != null ? error.code() : 0; + String message = error != null ? 
error.message() : "unknown"; return String.format( Locale.ROOT, "Remote scan planning failed for planId: %s: %s (code=%d): %s", planId, - error.type(), - error.code(), - error.message()); + type, + code, + message); } private CloseableIterable scanTasksIterable( diff --git a/core/src/test/java/org/apache/iceberg/rest/TestRESTScanPlanning.java b/core/src/test/java/org/apache/iceberg/rest/TestRESTScanPlanning.java index 214edc6b901e..9b42d445f585 100644 --- a/core/src/test/java/org/apache/iceberg/rest/TestRESTScanPlanning.java +++ b/core/src/test/java/org/apache/iceberg/rest/TestRESTScanPlanning.java @@ -1311,6 +1311,27 @@ public void planningFailsWithServerError( .hasMessageContaining(serverError.message()); } + @ParameterizedTest + @EnumSource(PlanningMode.class) + public void planningFailsWithoutServerErrorIsStillSurfaced( + Function planMode) { + // Spec requires an error payload with a FAILED status; if a server violates that, + // the client must still surface a meaningful failure rather than throw on top of it. 
+ TestPlanningBehavior behavior = planMode.apply(TestPlanningBehavior.builder()).build(); + CatalogWithAdapter catalogWithAdapter = + catalogThatFailsPlanning(null, behavior, "test-planning-failed-no-error"); + + RESTTable table = restTableFor(catalogWithAdapter.catalog, "planning_failed_no_error_test"); + setParserContext(table); + RESTTableScan scan = restTableScanFor(table); + + assertThatThrownBy(scan::planFiles) + .isInstanceOf(IllegalStateException.class) + .hasMessageContaining("Remote scan planning failed") + .hasMessageContaining("unknown") + .hasMessageContaining("code=0"); + } + private CatalogWithAdapter catalogThatFailsPlanning( ErrorResponse serverError, TestPlanningBehavior behavior, String catalogName) { List endpoints = From 1c1aaf038442c490c9aa4617a5327d240d535bf8 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 2 May 2026 23:04:27 -0700 Subject: [PATCH 145/197] Build: Bump openapi-spec-validator from 0.8.4 to 0.8.5 (#16200) Bumps [openapi-spec-validator](https://github.com/python-openapi/openapi-spec-validator) from 0.8.4 to 0.8.5. - [Release notes](https://github.com/python-openapi/openapi-spec-validator/releases) - [Commits](https://github.com/python-openapi/openapi-spec-validator/compare/0.8.4...0.8.5) --- updated-dependencies: - dependency-name: openapi-spec-validator dependency-version: 0.8.5 dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- open-api/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/open-api/requirements.txt b/open-api/requirements.txt index 3a00012d4ad1..4e75e426b537 100644 --- a/open-api/requirements.txt +++ b/open-api/requirements.txt @@ -15,6 +15,6 @@ # specific language governing permissions and limitations # under the License. 
-openapi-spec-validator==0.8.4 +openapi-spec-validator==0.8.5 datamodel-code-generator==0.56.1 yamllint==1.38.0 From bd96c74ae3602186b0f46dcaa361e76c1f274bcb Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 2 May 2026 23:04:38 -0700 Subject: [PATCH 146/197] Build: Bump testcontainers from 2.0.4 to 2.0.5 (#16201) Bumps `testcontainers` from 2.0.4 to 2.0.5. Updates `org.testcontainers:testcontainers` from 2.0.4 to 2.0.5 - [Release notes](https://github.com/testcontainers/testcontainers-java/releases) - [Changelog](https://github.com/testcontainers/testcontainers-java/blob/main/CHANGELOG.md) - [Commits](https://github.com/testcontainers/testcontainers-java/compare/2.0.4...2.0.5) Updates `org.testcontainers:testcontainers-junit-jupiter` from 2.0.4 to 2.0.5 - [Release notes](https://github.com/testcontainers/testcontainers-java/releases) - [Changelog](https://github.com/testcontainers/testcontainers-java/blob/main/CHANGELOG.md) - [Commits](https://github.com/testcontainers/testcontainers-java/compare/2.0.4...2.0.5) Updates `org.testcontainers:testcontainers-minio` from 2.0.4 to 2.0.5 - [Release notes](https://github.com/testcontainers/testcontainers-java/releases) - [Changelog](https://github.com/testcontainers/testcontainers-java/blob/main/CHANGELOG.md) - [Commits](https://github.com/testcontainers/testcontainers-java/compare/2.0.4...2.0.5) --- updated-dependencies: - dependency-name: org.testcontainers:testcontainers dependency-version: 2.0.5 dependency-type: direct:production update-type: version-update:semver-patch - dependency-name: org.testcontainers:testcontainers-junit-jupiter dependency-version: 2.0.5 dependency-type: direct:production update-type: version-update:semver-patch - dependency-name: org.testcontainers:testcontainers-minio dependency-version: 2.0.5 dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- gradle/libs.versions.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index 8b24d4727436..ae7bf21b8f15 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -90,7 +90,7 @@ spark35 = "3.5.8" spark40 = "4.0.2" spark41 = "4.1.1" sqlite-jdbc = "3.53.0.0" -testcontainers = "2.0.4" +testcontainers = "2.0.5" tez08 = { strictly = "0.8.4"} # see rich version usage explanation above [libraries] From d69d92ce0b5d7cbbe7efb841da9c34bc5b913155 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 2 May 2026 23:04:54 -0700 Subject: [PATCH 147/197] Build: Bump nessie from 0.107.4 to 0.107.5 (#16202) Bumps `nessie` from 0.107.4 to 0.107.5. Updates `org.projectnessie.nessie:nessie-client` from 0.107.4 to 0.107.5 - [Release notes](https://github.com/projectnessie/nessie/releases) - [Changelog](https://github.com/projectnessie/nessie/blob/main/CHANGELOG.md) - [Commits](https://github.com/projectnessie/nessie/compare/nessie-0.107.4...nessie-0.107.5) Updates `org.projectnessie.nessie:nessie-jaxrs-testextension` from 0.107.4 to 0.107.5 - [Release notes](https://github.com/projectnessie/nessie/releases) - [Changelog](https://github.com/projectnessie/nessie/blob/main/CHANGELOG.md) - [Commits](https://github.com/projectnessie/nessie/compare/nessie-0.107.4...nessie-0.107.5) Updates `org.projectnessie.nessie:nessie-versioned-storage-inmemory-tests` from 0.107.4 to 0.107.5 - [Release notes](https://github.com/projectnessie/nessie/releases) - [Changelog](https://github.com/projectnessie/nessie/blob/main/CHANGELOG.md) - [Commits](https://github.com/projectnessie/nessie/compare/nessie-0.107.4...nessie-0.107.5) Updates `org.projectnessie.nessie:nessie-versioned-storage-testextension` from 0.107.4 to 0.107.5 - [Release 
notes](https://github.com/projectnessie/nessie/releases) - [Changelog](https://github.com/projectnessie/nessie/blob/main/CHANGELOG.md) - [Commits](https://github.com/projectnessie/nessie/compare/nessie-0.107.4...nessie-0.107.5) --- updated-dependencies: - dependency-name: org.projectnessie.nessie:nessie-client dependency-version: 0.107.5 dependency-type: direct:production update-type: version-update:semver-patch - dependency-name: org.projectnessie.nessie:nessie-jaxrs-testextension dependency-version: 0.107.5 dependency-type: direct:production update-type: version-update:semver-patch - dependency-name: org.projectnessie.nessie:nessie-versioned-storage-inmemory-tests dependency-version: 0.107.5 dependency-type: direct:production update-type: version-update:semver-patch - dependency-name: org.projectnessie.nessie:nessie-versioned-storage-testextension dependency-version: 0.107.5 dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- gradle/libs.versions.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index ae7bf21b8f15..e7b83a3f7cc3 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -76,7 +76,7 @@ lz4Java = "1.11.0" microprofile-openapi-api = "3.1.2" mockito = "4.11.0" mockserver = "5.15.0" -nessie = "0.107.4" +nessie = "0.107.5" netty-buffer = "4.2.12.Final" object-client-bundle = "3.3.2" orc = "1.9.8" From 8537152cdaa842536bea8f6498a03b04fe214f2f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 3 May 2026 08:06:02 -0700 Subject: [PATCH 148/197] Build: Bump org.apache.httpcomponents.client5:httpclient5 (#16204) Bumps [org.apache.httpcomponents.client5:httpclient5](https://github.com/apache/httpcomponents-client) from 5.6 to 5.6.1. 
- [Changelog](https://github.com/apache/httpcomponents-client/blob/rel/v5.6.1/RELEASE_NOTES.txt) - [Commits](https://github.com/apache/httpcomponents-client/compare/rel/v5.6...rel/v5.6.1) --- updated-dependencies: - dependency-name: org.apache.httpcomponents.client5:httpclient5 dependency-version: 5.6.1 dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- gradle/libs.versions.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index e7b83a3f7cc3..d65397574063 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -55,7 +55,7 @@ google-libraries-bom = "26.80.0" gcs-analytics-core = "1.2.3" guava = "33.6.0-jre" hadoop3 = "3.4.3" -httpcomponents-httpclient5 = "5.6" +httpcomponents-httpclient5 = "5.6.1" hive2 = { strictly = "2.3.10"} # see rich version usage explanation above immutables-value = "2.12.1" jackson-annotations = "2.21" From 33e173c0d6afa96f86d9f2d6da4030d9c691b920 Mon Sep 17 00:00:00 2001 From: Yuya Ebihara Date: Mon, 4 May 2026 10:59:03 +0900 Subject: [PATCH 149/197] Build: Bump software.amazon.awssdk:bom from 2.42.36 to 2.42.41 (#16206) Bumps software.amazon.awssdk:bom from 2.42.36 to 2.42.41. --- updated-dependencies: - dependency-name: software.amazon.awssdk:bom dependency-version: 2.42.41 dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- aws-bundle/runtime-deps.txt | 90 +++++++++---------- gradle/libs.versions.toml | 2 +- .../kafka-connect-runtime/runtime-deps.txt | 80 ++++++++--------- 3 files changed, 86 insertions(+), 86 deletions(-) diff --git a/aws-bundle/runtime-deps.txt b/aws-bundle/runtime-deps.txt index 730259fca97b..73c7e0ef16b9 100644 --- a/aws-bundle/runtime-deps.txt +++ b/aws-bundle/runtime-deps.txt @@ -16,51 +16,51 @@ org.apache.httpcomponents:httpclient:4.5.13 org.apache.httpcomponents:httpcore:4.4.16 org.checkerframework:checker-qual:3.19.0 org.reactivestreams:reactive-streams:1.0.4 -software.amazon.awssdk.crt:aws-crt:0.44.0 -software.amazon.awssdk:annotations:2.42.36 -software.amazon.awssdk:apache-client:2.42.36 -software.amazon.awssdk:arns:2.42.36 -software.amazon.awssdk:auth:2.42.36 -software.amazon.awssdk:aws-core:2.42.36 -software.amazon.awssdk:aws-json-protocol:2.42.36 -software.amazon.awssdk:aws-query-protocol:2.42.36 -software.amazon.awssdk:aws-xml-protocol:2.42.36 -software.amazon.awssdk:checksums-spi:2.42.36 -software.amazon.awssdk:checksums:2.42.36 -software.amazon.awssdk:cloudwatch-metric-publisher:2.42.36 -software.amazon.awssdk:cloudwatch:2.42.36 -software.amazon.awssdk:crt-core:2.42.36 -software.amazon.awssdk:dynamodb:2.42.36 -software.amazon.awssdk:endpoints-spi:2.42.36 -software.amazon.awssdk:glue:2.42.36 -software.amazon.awssdk:http-auth-aws-crt:2.42.36 -software.amazon.awssdk:http-auth-aws-eventstream:2.42.36 -software.amazon.awssdk:http-auth-aws:2.42.36 -software.amazon.awssdk:http-auth-spi:2.42.36 -software.amazon.awssdk:http-auth:2.42.36 -software.amazon.awssdk:http-client-spi:2.42.36 -software.amazon.awssdk:iam:2.42.36 -software.amazon.awssdk:identity-spi:2.42.36 -software.amazon.awssdk:json-utils:2.42.36 -software.amazon.awssdk:kms:2.42.36 -software.amazon.awssdk:lakeformation:2.42.36 -software.amazon.awssdk:metrics-spi:2.42.36 
-software.amazon.awssdk:netty-nio-client:2.42.36 -software.amazon.awssdk:profiles:2.42.36 -software.amazon.awssdk:protocol-core:2.42.36 -software.amazon.awssdk:regions:2.42.36 -software.amazon.awssdk:retries-spi:2.42.36 -software.amazon.awssdk:retries:2.42.36 -software.amazon.awssdk:s3:2.42.36 -software.amazon.awssdk:s3control:2.42.36 -software.amazon.awssdk:sdk-core:2.42.36 -software.amazon.awssdk:smithy-rpcv2-protocol:2.42.36 -software.amazon.awssdk:sso:2.42.36 -software.amazon.awssdk:sts:2.42.36 -software.amazon.awssdk:third-party-jackson-core:2.42.36 -software.amazon.awssdk:third-party-jackson-dataformat-cbor:2.42.36 -software.amazon.awssdk:utils-lite:2.42.36 -software.amazon.awssdk:utils:2.42.36 +software.amazon.awssdk.crt:aws-crt:0.45.1 +software.amazon.awssdk:annotations:2.42.41 +software.amazon.awssdk:apache-client:2.42.41 +software.amazon.awssdk:arns:2.42.41 +software.amazon.awssdk:auth:2.42.41 +software.amazon.awssdk:aws-core:2.42.41 +software.amazon.awssdk:aws-json-protocol:2.42.41 +software.amazon.awssdk:aws-query-protocol:2.42.41 +software.amazon.awssdk:aws-xml-protocol:2.42.41 +software.amazon.awssdk:checksums-spi:2.42.41 +software.amazon.awssdk:checksums:2.42.41 +software.amazon.awssdk:cloudwatch-metric-publisher:2.42.41 +software.amazon.awssdk:cloudwatch:2.42.41 +software.amazon.awssdk:crt-core:2.42.41 +software.amazon.awssdk:dynamodb:2.42.41 +software.amazon.awssdk:endpoints-spi:2.42.41 +software.amazon.awssdk:glue:2.42.41 +software.amazon.awssdk:http-auth-aws-crt:2.42.41 +software.amazon.awssdk:http-auth-aws-eventstream:2.42.41 +software.amazon.awssdk:http-auth-aws:2.42.41 +software.amazon.awssdk:http-auth-spi:2.42.41 +software.amazon.awssdk:http-auth:2.42.41 +software.amazon.awssdk:http-client-spi:2.42.41 +software.amazon.awssdk:iam:2.42.41 +software.amazon.awssdk:identity-spi:2.42.41 +software.amazon.awssdk:json-utils:2.42.41 +software.amazon.awssdk:kms:2.42.41 +software.amazon.awssdk:lakeformation:2.42.41 
+software.amazon.awssdk:metrics-spi:2.42.41 +software.amazon.awssdk:netty-nio-client:2.42.41 +software.amazon.awssdk:profiles:2.42.41 +software.amazon.awssdk:protocol-core:2.42.41 +software.amazon.awssdk:regions:2.42.41 +software.amazon.awssdk:retries-spi:2.42.41 +software.amazon.awssdk:retries:2.42.41 +software.amazon.awssdk:s3:2.42.41 +software.amazon.awssdk:s3control:2.42.41 +software.amazon.awssdk:sdk-core:2.42.41 +software.amazon.awssdk:smithy-rpcv2-protocol:2.42.41 +software.amazon.awssdk:sso:2.42.41 +software.amazon.awssdk:sts:2.42.41 +software.amazon.awssdk:third-party-jackson-core:2.42.41 +software.amazon.awssdk:third-party-jackson-dataformat-cbor:2.42.41 +software.amazon.awssdk:utils-lite:2.42.41 +software.amazon.awssdk:utils:2.42.41 software.amazon.eventstream:eventstream:1.0.1 software.amazon.s3.accessgrants:aws-s3-accessgrants-java-plugin:2.4.1 software.amazon.s3.analyticsaccelerator:analyticsaccelerator-s3:1.3.1 diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index d65397574063..c666281e6eae 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -33,7 +33,7 @@ arrow = "15.0.2" avro = "1.12.1" assertj-core = "3.27.7" awaitility = "4.3.0" -awssdk-bom = "2.42.36" +awssdk-bom = "2.42.41" azuresdk-bom = "1.3.6" awssdk-s3accessgrants = "2.4.1" bouncycastle = "1.84" diff --git a/kafka-connect/kafka-connect-runtime/runtime-deps.txt b/kafka-connect/kafka-connect-runtime/runtime-deps.txt index eb3e45769808..56a880cb6494 100644 --- a/kafka-connect/kafka-connect-runtime/runtime-deps.txt +++ b/kafka-connect/kafka-connect-runtime/runtime-deps.txt @@ -190,44 +190,44 @@ org.slf4j:slf4j-api:2.0.17 org.threeten:threeten-extra:1.8.0 org.threeten:threetenbp:1.7.0 org.xerial.snappy:snappy-java:1.1.10.8 -software.amazon.awssdk.crt:aws-crt:0.44.0 -software.amazon.awssdk:annotations:2.42.36 -software.amazon.awssdk:apache-client:2.42.36 -software.amazon.awssdk:arns:2.42.36 -software.amazon.awssdk:auth:2.42.36 
-software.amazon.awssdk:aws-core:2.42.36 -software.amazon.awssdk:aws-json-protocol:2.42.36 -software.amazon.awssdk:aws-query-protocol:2.42.36 -software.amazon.awssdk:aws-xml-protocol:2.42.36 -software.amazon.awssdk:checksums-spi:2.42.36 -software.amazon.awssdk:checksums:2.42.36 -software.amazon.awssdk:crt-core:2.42.36 -software.amazon.awssdk:dynamodb:2.42.36 -software.amazon.awssdk:endpoints-spi:2.42.36 -software.amazon.awssdk:glue:2.42.36 -software.amazon.awssdk:http-auth-aws-crt:2.42.36 -software.amazon.awssdk:http-auth-aws-eventstream:2.42.36 -software.amazon.awssdk:http-auth-aws:2.42.36 -software.amazon.awssdk:http-auth-spi:2.42.36 -software.amazon.awssdk:http-auth:2.42.36 -software.amazon.awssdk:http-client-spi:2.42.36 -software.amazon.awssdk:iam:2.42.36 -software.amazon.awssdk:identity-spi:2.42.36 -software.amazon.awssdk:json-utils:2.42.36 -software.amazon.awssdk:kms:2.42.36 -software.amazon.awssdk:lakeformation:2.42.36 -software.amazon.awssdk:metrics-spi:2.42.36 -software.amazon.awssdk:netty-nio-client:2.42.36 -software.amazon.awssdk:profiles:2.42.36 -software.amazon.awssdk:protocol-core:2.42.36 -software.amazon.awssdk:regions:2.42.36 -software.amazon.awssdk:retries-spi:2.42.36 -software.amazon.awssdk:retries:2.42.36 -software.amazon.awssdk:s3:2.42.36 -software.amazon.awssdk:sdk-core:2.42.36 -software.amazon.awssdk:sso:2.42.36 -software.amazon.awssdk:sts:2.42.36 -software.amazon.awssdk:third-party-jackson-core:2.42.36 -software.amazon.awssdk:utils-lite:2.42.36 -software.amazon.awssdk:utils:2.42.36 +software.amazon.awssdk.crt:aws-crt:0.45.1 +software.amazon.awssdk:annotations:2.42.41 +software.amazon.awssdk:apache-client:2.42.41 +software.amazon.awssdk:arns:2.42.41 +software.amazon.awssdk:auth:2.42.41 +software.amazon.awssdk:aws-core:2.42.41 +software.amazon.awssdk:aws-json-protocol:2.42.41 +software.amazon.awssdk:aws-query-protocol:2.42.41 +software.amazon.awssdk:aws-xml-protocol:2.42.41 +software.amazon.awssdk:checksums-spi:2.42.41 
+software.amazon.awssdk:checksums:2.42.41 +software.amazon.awssdk:crt-core:2.42.41 +software.amazon.awssdk:dynamodb:2.42.41 +software.amazon.awssdk:endpoints-spi:2.42.41 +software.amazon.awssdk:glue:2.42.41 +software.amazon.awssdk:http-auth-aws-crt:2.42.41 +software.amazon.awssdk:http-auth-aws-eventstream:2.42.41 +software.amazon.awssdk:http-auth-aws:2.42.41 +software.amazon.awssdk:http-auth-spi:2.42.41 +software.amazon.awssdk:http-auth:2.42.41 +software.amazon.awssdk:http-client-spi:2.42.41 +software.amazon.awssdk:iam:2.42.41 +software.amazon.awssdk:identity-spi:2.42.41 +software.amazon.awssdk:json-utils:2.42.41 +software.amazon.awssdk:kms:2.42.41 +software.amazon.awssdk:lakeformation:2.42.41 +software.amazon.awssdk:metrics-spi:2.42.41 +software.amazon.awssdk:netty-nio-client:2.42.41 +software.amazon.awssdk:profiles:2.42.41 +software.amazon.awssdk:protocol-core:2.42.41 +software.amazon.awssdk:regions:2.42.41 +software.amazon.awssdk:retries-spi:2.42.41 +software.amazon.awssdk:retries:2.42.41 +software.amazon.awssdk:s3:2.42.41 +software.amazon.awssdk:sdk-core:2.42.41 +software.amazon.awssdk:sso:2.42.41 +software.amazon.awssdk:sts:2.42.41 +software.amazon.awssdk:third-party-jackson-core:2.42.41 +software.amazon.awssdk:utils-lite:2.42.41 +software.amazon.awssdk:utils:2.42.41 software.amazon.eventstream:eventstream:1.0.1 From 334dd95535b4170fe8b2c75d48204ac3413a5cd6 Mon Sep 17 00:00:00 2001 From: Alexandre Dutra Date: Mon, 4 May 2026 10:30:55 +0200 Subject: [PATCH 150/197] Core, AWS: Adapt code to S3 signing endpoint promotion (#15451) * Core, AWS: Adapt code base to S3 signing endpoint promotion Dev ML discussion: https://lists.apache.org/thread/2kqdqb46j7jww36wwg4txv6pl2hqq9w7 This commit adapts the code base to the REST spec changes in #15450. 
Summary of changes: - Added new signer endpoint to `Endpoint` and `ResourcePaths` - Added new remote signing properties to `RESTCatalogProperties` - Introduced `RemoteSignRequest`, `RemoteSignRequestParser`, `RemoteSignResponse`, `RemoteSignResponseParser` - Deprecated `S3SignRequest`, `S3SignRequestParser`, `S3SignResponse`, `S3SignResponseParser` for removal - Deprecated `S3ObjectMapper` for removal - Added new serializers to `RESTSerializers` - Adapted `S3V4RestSignerClient`: - Deprecated public fields - Changed access methods and `check()` method to account for new properties and deprecated ones. - Included new `provider` request body parameter Test changes: - Refactored `S3SignerServlet` to extract a parent abstract class, `RemoteSignerServlet` (it can now be reused to test other providers) - Moved JSON parser tests from AWS module to Core module - Enhanced `TestS3V4RestSignerClient` --- .../aws/s3/signer/S3SignerServlet.java | 198 +++-------------- .../aws/s3/signer/TestS3RestSigner.java | 16 +- .../iceberg/aws/s3/signer/S3ObjectMapper.java | 4 + .../iceberg/aws/s3/signer/S3SignRequest.java | 32 +-- .../aws/s3/signer/S3SignRequestParser.java | 91 ++------ .../iceberg/aws/s3/signer/S3SignResponse.java | 19 +- .../aws/s3/signer/S3SignResponseParser.java | 44 ++-- .../aws/s3/signer/S3V4RestSignerClient.java | 102 +++++++-- .../s3/signer/TestS3V4RestSignerClient.java | 87 +++++++- .../org/apache/iceberg/rest/Endpoint.java | 2 + .../iceberg/rest/RESTCatalogProperties.java | 13 ++ .../apache/iceberg/rest/RESTSerializers.java | 51 ++++- .../apache/iceberg/rest/ResourcePaths.java | 13 ++ .../rest/requests/RemoteSignRequest.java | 51 +++++ .../requests/RemoteSignRequestParser.java | 141 ++++++++++++ .../rest/responses/RemoteSignResponse.java | 35 +++ .../responses/RemoteSignResponseParser.java | 71 ++++++ .../iceberg/rest/RemoteSignerServlet.java | 202 ++++++++++++++++++ .../iceberg/rest/TestResourcePaths.java | 16 ++ .../requests/TestRemoteSignRequestParser.java | 94 
+++++--- .../TestRemoteSignResponseParser.java | 26 +-- 21 files changed, 919 insertions(+), 389 deletions(-) create mode 100644 core/src/main/java/org/apache/iceberg/rest/requests/RemoteSignRequest.java create mode 100644 core/src/main/java/org/apache/iceberg/rest/requests/RemoteSignRequestParser.java create mode 100644 core/src/main/java/org/apache/iceberg/rest/responses/RemoteSignResponse.java create mode 100644 core/src/main/java/org/apache/iceberg/rest/responses/RemoteSignResponseParser.java create mode 100644 core/src/test/java/org/apache/iceberg/rest/RemoteSignerServlet.java rename aws/src/test/java/org/apache/iceberg/aws/s3/signer/TestS3SignRequestParser.java => core/src/test/java/org/apache/iceberg/rest/requests/TestRemoteSignRequestParser.java (70%) rename aws/src/test/java/org/apache/iceberg/aws/s3/signer/TestS3SignResponseParser.java => core/src/test/java/org/apache/iceberg/rest/responses/TestRemoteSignResponseParser.java (78%) diff --git a/aws/src/integration/java/org/apache/iceberg/aws/s3/signer/S3SignerServlet.java b/aws/src/integration/java/org/apache/iceberg/aws/s3/signer/S3SignerServlet.java index 038d76b03e4b..5d334eafa582 100644 --- a/aws/src/integration/java/org/apache/iceberg/aws/s3/signer/S3SignerServlet.java +++ b/aws/src/integration/java/org/apache/iceberg/aws/s3/signer/S3SignerServlet.java @@ -18,17 +18,6 @@ */ package org.apache.iceberg.aws.s3.signer; -import static java.lang.String.format; -import static org.apache.iceberg.rest.RESTCatalogAdapter.castRequest; -import static org.apache.iceberg.rest.RESTCatalogAdapter.castResponse; -import static org.assertj.core.api.AssertionsForClassTypes.assertThat; - -import com.fasterxml.jackson.databind.ObjectMapper; -import jakarta.servlet.http.HttpServlet; -import jakarta.servlet.http.HttpServletRequest; -import jakarta.servlet.http.HttpServletResponse; -import java.io.InputStreamReader; -import java.io.Reader; import java.time.Clock; import java.time.Instant; import java.time.ZoneId; @@ -37,23 
+26,15 @@ import java.util.Locale; import java.util.Map; import java.util.Set; -import java.util.function.Predicate; import java.util.stream.Collectors; -import java.util.stream.Stream; -import org.apache.hc.core5.http.ContentType; -import org.apache.hc.core5.http.HttpHeaders; -import org.apache.iceberg.exceptions.RESTException; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.apache.iceberg.relocated.com.google.common.io.CharStreams; -import org.apache.iceberg.rest.RESTUtil; -import org.apache.iceberg.rest.ResourcePaths; -import org.apache.iceberg.rest.responses.ErrorResponse; -import org.apache.iceberg.rest.responses.OAuthTokenResponse; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import org.apache.iceberg.rest.HttpMethod; +import org.apache.iceberg.rest.RemoteSignerServlet; +import org.apache.iceberg.rest.requests.RemoteSignRequest; +import org.apache.iceberg.rest.responses.ImmutableRemoteSignResponse; +import org.apache.iceberg.rest.responses.RemoteSignResponse; import software.amazon.awssdk.auth.signer.AwsS3V4Signer; import software.amazon.awssdk.auth.signer.params.AwsS3V4SignerParams; import software.amazon.awssdk.http.SdkHttpFullRequest; @@ -65,113 +46,37 @@ * {@link S3SignerServlet} provides a simple servlet implementation to emulate the server-side * behavior of signing S3 requests and handling OAuth. 
*/ -public class S3SignerServlet extends HttpServlet { - - private static final Logger LOG = LoggerFactory.getLogger(S3SignerServlet.class); +public class S3SignerServlet extends RemoteSignerServlet { static final Clock SIGNING_CLOCK = Clock.fixed(Instant.now(), ZoneId.of("UTC")); static final Set UNSIGNED_HEADERS = Sets.newHashSet( Arrays.asList("range", "x-amz-date", "amz-sdk-invocation-id", "amz-sdk-retry")); - private static final String POST = "POST"; - - private static final Set CACHEABLE_METHODS = - Stream.of(SdkHttpMethod.GET, SdkHttpMethod.HEAD).collect(Collectors.toSet()); - - private final Map responseHeaders = - ImmutableMap.of(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.getMimeType()); - private final ObjectMapper mapper; - - private List s3SignRequestValidators = Lists.newArrayList(); - - /** - * SignRequestValidator is a wrapper class used for validating the contents of the S3SignRequest - * and thus verifying the behavior of the client during testing. - */ - public static class SignRequestValidator { - private final Predicate requestMatcher; - private final Predicate requestExpectation; - private final String assertMessage; - - public SignRequestValidator( - Predicate requestExpectation, - Predicate requestMatcher, - String assertMessage) { - this.requestExpectation = requestExpectation; - this.requestMatcher = requestMatcher; - this.assertMessage = assertMessage; - } - - void validateRequest(S3SignRequest request) { - if (requestMatcher.test(request)) { - assertThat(requestExpectation.test(request)).as(assertMessage).isTrue(); - } - } - } - - public S3SignerServlet(ObjectMapper mapper) { - this.mapper = mapper; - } - - public S3SignerServlet(ObjectMapper mapper, List s3SignRequestValidators) { - this.mapper = mapper; - this.s3SignRequestValidators = s3SignRequestValidators; - } - - @Override - protected void doGet(HttpServletRequest request, HttpServletResponse response) { - execute(request, response); - } - @Override - protected void 
doHead(HttpServletRequest request, HttpServletResponse response) { - execute(request, response); - } + /** A fake remote signing endpoint for testing purposes. */ + static final String S3_SIGNER_ENDPOINT = "v1/namespaces/ns1/tables/t1/sign"; - @Override - protected void doPost(HttpServletRequest request, HttpServletResponse response) { - execute(request, response); + public S3SignerServlet() { + super(S3_SIGNER_ENDPOINT); } @Override - protected void doDelete(HttpServletRequest request, HttpServletResponse response) { - execute(request, response); - } - - private OAuthTokenResponse handleOAuth(Map requestMap) { - String grantType = requestMap.get("grant_type"); - switch (grantType) { - case "client_credentials": - return castResponse( - OAuthTokenResponse.class, - OAuthTokenResponse.builder() - .withToken("client-credentials-token:sub=" + requestMap.get("client_id")) - .withIssuedTokenType("urn:ietf:params:oauth:token-type:access_token") - .withTokenType("Bearer") - .setExpirationInSeconds(10000) - .build()); - - case "urn:ietf:params:oauth:grant-type:token-exchange": - String actor = requestMap.get("actor_token"); - String token = - String.format( - "token-exchange-token:sub=%s%s", - requestMap.get("subject_token"), actor != null ? 
",act=" + actor : ""); - return castResponse( - OAuthTokenResponse.class, - OAuthTokenResponse.builder() - .withToken(token) - .withIssuedTokenType("urn:ietf:params:oauth:token-type:access_token") - .withTokenType("Bearer") - .setExpirationInSeconds(10000) - .build()); - - default: - throw new UnsupportedOperationException("Unsupported grant_type: " + grantType); + protected void validateSignRequest(RemoteSignRequest request) { + Preconditions.checkArgument( + request.provider() == null || "s3".equalsIgnoreCase(request.provider()), + "Unsupported provider: %s", + request.provider()); + if (HttpMethod.POST.name().equalsIgnoreCase(request.method()) + && request.uri().getQuery().contains("delete")) { + String body = request.body(); + Preconditions.checkArgument( + body != null && !body.isEmpty(), + "Sign request for delete objects should have a request body"); } } - private S3SignResponse signRequest(S3SignRequest request) { + @Override + protected RemoteSignResponse signRequest(RemoteSignRequest request) { AwsS3V4SignerParams signingParams = AwsS3V4SignerParams.builder() .awsCredentials(TestS3RestSigner.CREDENTIALS_PROVIDER.resolveCredentials()) @@ -207,59 +112,6 @@ private S3SignResponse signRequest(S3SignRequest request) { Map> headers = Maps.newHashMap(sign.headers()); headers.putAll(unsignedHeaders); - return ImmutableS3SignResponse.builder().uri(request.uri()).headers(headers).build(); - } - - protected void execute(HttpServletRequest request, HttpServletResponse response) { - response.setStatus(HttpServletResponse.SC_OK); - responseHeaders.forEach(response::setHeader); - - String path = request.getRequestURI().substring(1); - Object requestBody; - try { - // we only need to handle oauth tokens & s3 sign request routes here as those are the only - // requests that are being done by the S3V4RestSignerClient - if (POST.equals(request.getMethod()) - && S3V4RestSignerClient.S3_SIGNER_DEFAULT_ENDPOINT.equals(path)) { - S3SignRequest s3SignRequest = - castRequest( - 
S3SignRequest.class, mapper.readValue(request.getReader(), S3SignRequest.class)); - s3SignRequestValidators.forEach(validator -> validator.validateRequest(s3SignRequest)); - S3SignResponse s3SignResponse = signRequest(s3SignRequest); - if (CACHEABLE_METHODS.contains(SdkHttpMethod.fromValue(s3SignRequest.method()))) { - // tell the client this can be cached - response.setHeader( - S3V4RestSignerClient.CACHE_CONTROL, S3V4RestSignerClient.CACHE_CONTROL_PRIVATE); - } else { - response.setHeader( - S3V4RestSignerClient.CACHE_CONTROL, S3V4RestSignerClient.CACHE_CONTROL_NO_CACHE); - } - - mapper.writeValue(response.getWriter(), s3SignResponse); - } else if (POST.equals(request.getMethod()) && ResourcePaths.tokens().equals(path)) { - try (Reader reader = new InputStreamReader(request.getInputStream())) { - requestBody = RESTUtil.decodeFormData(CharStreams.toString(reader)); - } - - OAuthTokenResponse oAuthTokenResponse = - handleOAuth((Map) castRequest(Map.class, requestBody)); - mapper.writeValue(response.getWriter(), oAuthTokenResponse); - } else { - response.setStatus(HttpServletResponse.SC_BAD_REQUEST); - mapper.writeValue( - response.getWriter(), - ErrorResponse.builder() - .responseCode(400) - .withType("BadRequestException") - .withMessage(format("No route for request: %s %s", request.getMethod(), path)) - .build()); - } - } catch (RESTException e) { - LOG.error("Error processing REST request", e); - response.setStatus(HttpServletResponse.SC_INTERNAL_SERVER_ERROR); - } catch (Exception e) { - LOG.error("Unexpected exception when processing REST request", e); - response.setStatus(HttpServletResponse.SC_INTERNAL_SERVER_ERROR); - } + return ImmutableRemoteSignResponse.builder().uri(request.uri()).headers(headers).build(); } } diff --git a/aws/src/integration/java/org/apache/iceberg/aws/s3/signer/TestS3RestSigner.java b/aws/src/integration/java/org/apache/iceberg/aws/s3/signer/TestS3RestSigner.java index d229976d5157..4e5ed3d91870 100644 --- 
a/aws/src/integration/java/org/apache/iceberg/aws/s3/signer/TestS3RestSigner.java +++ b/aws/src/integration/java/org/apache/iceberg/aws/s3/signer/TestS3RestSigner.java @@ -33,8 +33,8 @@ import java.util.stream.Collectors; import javax.annotation.Nonnull; import org.apache.iceberg.aws.s3.MinioUtil; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.rest.RESTCatalogProperties; import org.apache.iceberg.rest.auth.OAuth2Properties; import org.apache.iceberg.util.ThreadPools; import org.eclipse.jetty.compression.gzip.GzipCompression; @@ -108,8 +108,10 @@ public static void beforeClass() throws Exception { ImmutableS3V4RestSignerClient.builder() .properties( ImmutableMap.of( - S3V4RestSignerClient.S3_SIGNER_URI, + RESTCatalogProperties.SIGNER_URI, httpServer.getURI().toString(), + RESTCatalogProperties.SIGNER_ENDPOINT, + S3SignerServlet.S3_SIGNER_ENDPOINT, OAuth2Properties.CREDENTIAL, "catalog:12345")) .build(), @@ -183,15 +185,7 @@ public void before() throws Exception { } private static Server initHttpServer() throws Exception { - S3SignerServlet.SignRequestValidator deleteObjectsWithBody = - new S3SignerServlet.SignRequestValidator( - (s3SignRequest) -> - "post".equalsIgnoreCase(s3SignRequest.method()) - && s3SignRequest.uri().getQuery().contains("delete"), - (s3SignRequest) -> s3SignRequest.body() != null && !s3SignRequest.body().isEmpty(), - "Sign request for delete objects should have a request body"); - S3SignerServlet servlet = - new S3SignerServlet(S3ObjectMapper.mapper(), ImmutableList.of(deleteObjectsWithBody)); + S3SignerServlet servlet = new S3SignerServlet(); ServletContextHandler servletContext = new ServletContextHandler(ServletContextHandler.NO_SESSIONS); servletContext.addServlet(new ServletHolder(servlet), "/*"); diff --git a/aws/src/main/java/org/apache/iceberg/aws/s3/signer/S3ObjectMapper.java 
b/aws/src/main/java/org/apache/iceberg/aws/s3/signer/S3ObjectMapper.java index 89145b2465e5..7f1d6c3cc848 100644 --- a/aws/src/main/java/org/apache/iceberg/aws/s3/signer/S3ObjectMapper.java +++ b/aws/src/main/java/org/apache/iceberg/aws/s3/signer/S3ObjectMapper.java @@ -40,6 +40,10 @@ import org.apache.iceberg.rest.responses.ErrorResponse; import org.apache.iceberg.rest.responses.OAuthTokenResponse; +/** + * @deprecated since 1.11.0, will be removed in 1.12.0; use {@code RESTObjectMapper} instead. + */ +@Deprecated public class S3ObjectMapper { private static final JsonFactory FACTORY = new JsonFactory(); diff --git a/aws/src/main/java/org/apache/iceberg/aws/s3/signer/S3SignRequest.java b/aws/src/main/java/org/apache/iceberg/aws/s3/signer/S3SignRequest.java index 879ce8599352..995f6e7e4860 100644 --- a/aws/src/main/java/org/apache/iceberg/aws/s3/signer/S3SignRequest.java +++ b/aws/src/main/java/org/apache/iceberg/aws/s3/signer/S3SignRequest.java @@ -18,31 +18,13 @@ */ package org.apache.iceberg.aws.s3.signer; -import java.net.URI; -import java.util.List; -import java.util.Map; -import javax.annotation.Nullable; -import org.apache.iceberg.rest.RESTRequest; +import org.apache.iceberg.rest.requests.RemoteSignRequest; import org.immutables.value.Value; +/** + * @deprecated since 1.11.0, will be removed in 1.12.0; use {@link RemoteSignRequest} instead. 
+ */ +@Deprecated @Value.Immutable -public interface S3SignRequest extends RESTRequest { - String region(); - - String method(); - - URI uri(); - - Map> headers(); - - Map properties(); - - @Value.Default - @Nullable - default String body() { - return null; - } - - @Override - default void validate() {} -} +@SuppressWarnings("immutables:subtype") +public interface S3SignRequest extends RemoteSignRequest {} diff --git a/aws/src/main/java/org/apache/iceberg/aws/s3/signer/S3SignRequestParser.java b/aws/src/main/java/org/apache/iceberg/aws/s3/signer/S3SignRequestParser.java index 3b5eb83612e2..5d2a7d684460 100644 --- a/aws/src/main/java/org/apache/iceberg/aws/s3/signer/S3SignRequestParser.java +++ b/aws/src/main/java/org/apache/iceberg/aws/s3/signer/S3SignRequestParser.java @@ -21,108 +21,47 @@ import com.fasterxml.jackson.core.JsonGenerator; import com.fasterxml.jackson.databind.JsonNode; import java.io.IOException; -import java.util.Arrays; import java.util.List; import java.util.Map; -import java.util.Map.Entry; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; -import org.apache.iceberg.util.JsonUtil; +import org.apache.iceberg.rest.requests.RemoteSignRequest; +import org.apache.iceberg.rest.requests.RemoteSignRequestParser; +/** + * @deprecated since 1.11.0, will be removed in 1.12.0; use {@link RemoteSignRequestParser} instead. 
+ */ +@Deprecated public class S3SignRequestParser { - private static final String REGION = "region"; - private static final String METHOD = "method"; - private static final String URI = "uri"; - private static final String HEADERS = "headers"; - private static final String PROPERTIES = "properties"; - private static final String BODY = "body"; - private S3SignRequestParser() {} public static String toJson(S3SignRequest request) { - return toJson(request, false); + return RemoteSignRequestParser.toJson(request, false); } public static String toJson(S3SignRequest request, boolean pretty) { - return JsonUtil.generate(gen -> toJson(request, gen), pretty); + return RemoteSignRequestParser.toJson(request, pretty); } public static void toJson(S3SignRequest request, JsonGenerator gen) throws IOException { - Preconditions.checkArgument(null != request, "Invalid s3 sign request: null"); - - gen.writeStartObject(); - - gen.writeStringField(REGION, request.region()); - gen.writeStringField(METHOD, request.method()); - gen.writeStringField(URI, request.uri().toString()); - headersToJson(HEADERS, request.headers(), gen); - - if (!request.properties().isEmpty()) { - JsonUtil.writeStringMap(PROPERTIES, request.properties(), gen); - } - - if (request.body() != null && !request.body().isEmpty()) { - gen.writeStringField(BODY, request.body()); - } - - gen.writeEndObject(); + RemoteSignRequestParser.toJson(request, gen); } public static S3SignRequest fromJson(String json) { - return JsonUtil.parse(json, S3SignRequestParser::fromJson); + RemoteSignRequest request = RemoteSignRequestParser.fromJson(json); + return ImmutableS3SignRequest.builder().from(request).build(); } public static S3SignRequest fromJson(JsonNode json) { - Preconditions.checkArgument(null != json, "Cannot parse s3 sign request from null object"); - Preconditions.checkArgument( - json.isObject(), "Cannot parse s3 sign request from non-object: %s", json); - - String region = JsonUtil.getString(REGION, json); - String 
method = JsonUtil.getString(METHOD, json); - java.net.URI uri = java.net.URI.create(JsonUtil.getString(URI, json)); - Map> headers = headersFromJson(HEADERS, json); - - ImmutableS3SignRequest.Builder builder = - ImmutableS3SignRequest.builder().region(region).method(method).uri(uri).headers(headers); - - if (json.has(PROPERTIES)) { - builder.properties(JsonUtil.getStringMap(PROPERTIES, json)); - } - - if (json.has(BODY)) { - builder.body(JsonUtil.getString(BODY, json)); - } - - return builder.build(); + RemoteSignRequest request = RemoteSignRequestParser.fromJson(json); + return ImmutableS3SignRequest.builder().from(request).build(); } static void headersToJson(String property, Map> headers, JsonGenerator gen) throws IOException { - gen.writeObjectFieldStart(property); - for (Entry> entry : headers.entrySet()) { - gen.writeFieldName(entry.getKey()); - - gen.writeStartArray(); - for (String val : entry.getValue()) { - gen.writeString(val); - } - gen.writeEndArray(); - } - gen.writeEndObject(); + RemoteSignRequestParser.headersToJson(property, headers, gen); } static Map> headersFromJson(String property, JsonNode json) { - Map> headers = Maps.newHashMap(); - JsonNode headersNode = JsonUtil.get(property, json); - headersNode - .properties() - .forEach( - entry -> { - String key = entry.getKey(); - List values = Arrays.asList(JsonUtil.getStringArray(entry.getValue())); - headers.put(key, values); - }); - return headers; + return RemoteSignRequestParser.headersFromJson(property, json); } } diff --git a/aws/src/main/java/org/apache/iceberg/aws/s3/signer/S3SignResponse.java b/aws/src/main/java/org/apache/iceberg/aws/s3/signer/S3SignResponse.java index 40c2059488f8..6fbaa90fe7af 100644 --- a/aws/src/main/java/org/apache/iceberg/aws/s3/signer/S3SignResponse.java +++ b/aws/src/main/java/org/apache/iceberg/aws/s3/signer/S3SignResponse.java @@ -18,18 +18,13 @@ */ package org.apache.iceberg.aws.s3.signer; -import java.net.URI; -import java.util.List; -import java.util.Map; 
-import org.apache.iceberg.rest.RESTResponse; +import org.apache.iceberg.rest.responses.RemoteSignResponse; import org.immutables.value.Value; +/** + * @deprecated since 1.11.0, will be removed in 1.12.0; use {@link RemoteSignResponse} instead. + */ +@Deprecated @Value.Immutable -public interface S3SignResponse extends RESTResponse { - URI uri(); - - Map> headers(); - - @Override - default void validate() {} -} +@SuppressWarnings("immutables:subtype") +public interface S3SignResponse extends RemoteSignResponse {} diff --git a/aws/src/main/java/org/apache/iceberg/aws/s3/signer/S3SignResponseParser.java b/aws/src/main/java/org/apache/iceberg/aws/s3/signer/S3SignResponseParser.java index 69d6de8f04ac..be63a51b38fb 100644 --- a/aws/src/main/java/org/apache/iceberg/aws/s3/signer/S3SignResponseParser.java +++ b/aws/src/main/java/org/apache/iceberg/aws/s3/signer/S3SignResponseParser.java @@ -21,49 +21,37 @@ import com.fasterxml.jackson.core.JsonGenerator; import com.fasterxml.jackson.databind.JsonNode; import java.io.IOException; -import java.util.List; -import java.util.Map; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; -import org.apache.iceberg.util.JsonUtil; +import org.apache.iceberg.rest.responses.RemoteSignResponse; +import org.apache.iceberg.rest.responses.RemoteSignResponseParser; +/** + * @deprecated since 1.11.0, will be removed in 1.12.0; use {@link RemoteSignResponseParser} + * instead. 
+ */ +@Deprecated public class S3SignResponseParser { - private static final String URI = "uri"; - private static final String HEADERS = "headers"; - private S3SignResponseParser() {} - public static String toJson(S3SignResponse request) { - return toJson(request, false); + public static String toJson(S3SignResponse response) { + return RemoteSignResponseParser.toJson(response, false); } - public static String toJson(S3SignResponse request, boolean pretty) { - return JsonUtil.generate(gen -> toJson(request, gen), pretty); + public static String toJson(S3SignResponse response, boolean pretty) { + return RemoteSignResponseParser.toJson(response, pretty); } public static void toJson(S3SignResponse response, JsonGenerator gen) throws IOException { - Preconditions.checkArgument(null != response, "Invalid s3 sign response: null"); - - gen.writeStartObject(); - - gen.writeStringField(URI, response.uri().toString()); - S3SignRequestParser.headersToJson(HEADERS, response.headers(), gen); - - gen.writeEndObject(); + RemoteSignResponseParser.toJson(response, gen); } public static S3SignResponse fromJson(String json) { - return JsonUtil.parse(json, S3SignResponseParser::fromJson); + RemoteSignResponse result = RemoteSignResponseParser.fromJson(json); + return ImmutableS3SignResponse.builder().from(result).build(); } public static S3SignResponse fromJson(JsonNode json) { - Preconditions.checkArgument(null != json, "Cannot parse s3 sign response from null object"); - Preconditions.checkArgument( - json.isObject(), "Cannot parse s3 sign response from non-object: %s", json); - - java.net.URI uri = java.net.URI.create(JsonUtil.getString(URI, json)); - Map> headers = S3SignRequestParser.headersFromJson(HEADERS, json); - - return ImmutableS3SignResponse.builder().uri(uri).headers(headers).build(); + RemoteSignResponse result = RemoteSignResponseParser.fromJson(json); + return ImmutableS3SignResponse.builder().from(result).build(); } } diff --git 
a/aws/src/main/java/org/apache/iceberg/aws/s3/signer/S3V4RestSignerClient.java b/aws/src/main/java/org/apache/iceberg/aws/s3/signer/S3V4RestSignerClient.java index 84b67bbdafc2..7a463abd3d2d 100644 --- a/aws/src/main/java/org/apache/iceberg/aws/s3/signer/S3V4RestSignerClient.java +++ b/aws/src/main/java/org/apache/iceberg/aws/s3/signer/S3V4RestSignerClient.java @@ -37,6 +37,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.rest.ErrorHandlers; import org.apache.iceberg.rest.HTTPClient; +import org.apache.iceberg.rest.RESTCatalogProperties; import org.apache.iceberg.rest.RESTClient; import org.apache.iceberg.rest.RESTUtil; import org.apache.iceberg.rest.ResourcePaths; @@ -45,6 +46,9 @@ import org.apache.iceberg.rest.auth.AuthSession; import org.apache.iceberg.rest.auth.OAuth2Properties; import org.apache.iceberg.rest.auth.OAuth2Util; +import org.apache.iceberg.rest.requests.ImmutableRemoteSignRequest; +import org.apache.iceberg.rest.requests.RemoteSignRequest; +import org.apache.iceberg.rest.responses.RemoteSignResponse; import org.apache.iceberg.util.PropertyUtil; import org.immutables.value.Value; import org.slf4j.Logger; @@ -64,13 +68,30 @@ public abstract class S3V4RestSignerClient extends AbstractAws4Signer implements AutoCloseable { private static final Logger LOG = LoggerFactory.getLogger(S3V4RestSignerClient.class); - public static final String S3_SIGNER_URI = "s3.signer.uri"; - public static final String S3_SIGNER_ENDPOINT = "s3.signer.endpoint"; - static final String S3_SIGNER_DEFAULT_ENDPOINT = "v1/aws/s3/sign"; - static final String UNSIGNED_PAYLOAD = "UNSIGNED-PAYLOAD"; - static final String CACHE_CONTROL = "Cache-Control"; - static final String CACHE_CONTROL_PRIVATE = "private"; - static final String CACHE_CONTROL_NO_CACHE = "no-cache"; + + public static final String S3_PROVIDER = "s3"; + + /** + * @deprecated since 1.11.0, will be removed in 1.12.0; use {@link + * RESTCatalogProperties#SIGNER_URI} 
instead. + */ + @Deprecated public static final String S3_SIGNER_URI = "s3.signer.uri"; + + /** + * @deprecated since 1.11.0, will be removed in 1.12.0; use {@link + * RESTCatalogProperties#SIGNER_ENDPOINT} instead. + */ + @Deprecated public static final String S3_SIGNER_ENDPOINT = "s3.signer.endpoint"; + + /** + * @deprecated since 1.11.0, will be removed in 1.12.0; there is no replacement. + */ + @Deprecated static final String S3_SIGNER_DEFAULT_ENDPOINT = "v1/aws/s3/sign"; + + @VisibleForTesting static final String UNSIGNED_PAYLOAD = "UNSIGNED-PAYLOAD"; + + private static final String CACHE_CONTROL = "Cache-Control"; + private static final String CACHE_CONTROL_PRIVATE = "private"; private static final Cache SIGNED_COMPONENT_CACHE = Caffeine.newBuilder().expireAfterWrite(30, TimeUnit.SECONDS).maximumSize(100).build(); @@ -94,13 +115,28 @@ public Supplier> requestPropertiesSupplier() { @Value.Lazy public String baseSignerUri() { - return properties().getOrDefault(S3_SIGNER_URI, properties().get(CatalogProperties.URI)); + // TODO remove in 1.12.0 + if (properties().containsKey(S3_SIGNER_URI)) { + return properties().get(S3_SIGNER_URI); + } + + return properties() + .getOrDefault(RESTCatalogProperties.SIGNER_URI, properties().get(CatalogProperties.URI)); } @Value.Lazy public String endpoint() { - return RESTUtil.resolveEndpoint( - baseSignerUri(), properties().getOrDefault(S3_SIGNER_ENDPOINT, S3_SIGNER_DEFAULT_ENDPOINT)); + // TODO remove in 1.12.0 + String endpointPath; + if (properties().containsKey(S3_SIGNER_ENDPOINT)) { + endpointPath = properties().get(S3_SIGNER_ENDPOINT); + } else { + endpointPath = + properties() + .getOrDefault(RESTCatalogProperties.SIGNER_ENDPOINT, S3_SIGNER_DEFAULT_ENDPOINT); + } + + return RESTUtil.resolveEndpoint(baseSignerUri(), endpointPath); } /** A credential to exchange for a token in the OAuth2 client credentials flow.
*/ @@ -160,7 +196,6 @@ private RESTClient httpClient() { httpClient = HTTPClient.builder(properties()) .withHeaders(RESTUtil.configHeaders(properties())) - .withObjectMapper(S3ObjectMapper.mapper()) .build(); } } @@ -197,8 +232,36 @@ private boolean credentialProvided() { @Value.Check protected void check() { Preconditions.checkArgument( - properties().containsKey(S3_SIGNER_URI) || properties().containsKey(CatalogProperties.URI), + properties().containsKey(S3_SIGNER_URI) + || properties().containsKey(RESTCatalogProperties.SIGNER_URI) + || properties().containsKey(CatalogProperties.URI), "S3 signer service URI is required"); + + if (properties().containsKey(S3_SIGNER_URI) + && !properties().containsKey(RESTCatalogProperties.SIGNER_URI)) { + LOG.warn( + "S3 signer URI is configured via deprecated property {}, this won't be supported in future releases. " + + "Please use {} instead.", + S3_SIGNER_URI, + RESTCatalogProperties.SIGNER_URI); + } + + if (properties().containsKey(S3_SIGNER_ENDPOINT) + && !properties().containsKey(RESTCatalogProperties.SIGNER_ENDPOINT)) { + LOG.warn( + "Signer endpoint is configured via deprecated property {}, this won't be supported in future releases. " + + "Please use {} instead.", + S3_SIGNER_ENDPOINT, + RESTCatalogProperties.SIGNER_ENDPOINT); + } + + // TODO change to required in 1.12.0 + if (!properties().containsKey(S3_SIGNER_ENDPOINT) + && !properties().containsKey(RESTCatalogProperties.SIGNER_ENDPOINT)) { + LOG.warn( + "Signer endpoint is not set, this won't be supported in future releases. 
Using deprecated default: {}", + S3_SIGNER_DEFAULT_ENDPOINT); + } } @Override @@ -241,14 +304,15 @@ public SdkHttpFullRequest sign( AwsS3V4SignerParams signerParams = extractSignerParams(AwsS3V4SignerParams.builder(), executionAttributes).build(); - S3SignRequest remoteSigningRequest = - ImmutableS3SignRequest.builder() + RemoteSignRequest remoteSigningRequest = + ImmutableRemoteSignRequest.builder() .method(request.method().name()) .region(signerParams.signingRegion().id()) .uri(request.getUri()) .headers(request.headers()) .properties(requestPropertiesSupplier().get()) .body(bodyAsString(request)) + .provider(S3_PROVIDER) .build(); Key cacheKey = Key.from(remoteSigningRequest); @@ -260,21 +324,21 @@ public SdkHttpFullRequest sign( } else { Map responseHeaders = Maps.newHashMap(); Consumer> responseHeadersConsumer = responseHeaders::putAll; - S3SignResponse s3SignResponse = + RemoteSignResponse remoteSignResponse = httpClient() .withAuthSession(authSession()) .post( endpoint(), remoteSigningRequest, - S3SignResponse.class, + RemoteSignResponse.class, Map.of(), ErrorHandlers.defaultErrorHandler(), responseHeadersConsumer); signedComponent = ImmutableSignedComponent.builder() - .headers(s3SignResponse.headers()) - .signedURI(s3SignResponse.uri()) + .headers(remoteSignResponse.headers()) + .signedURI(remoteSignResponse.uri()) .build(); if (canBeCached(responseHeaders)) { @@ -351,7 +415,7 @@ interface Key { String uri(); - static Key from(S3SignRequest request) { + static Key from(RemoteSignRequest request) { return ImmutableKey.builder() .method(request.method()) .region(request.region()) diff --git a/aws/src/test/java/org/apache/iceberg/aws/s3/signer/TestS3V4RestSignerClient.java b/aws/src/test/java/org/apache/iceberg/aws/s3/signer/TestS3V4RestSignerClient.java index 0bcc77e29fae..aadbf036b567 100644 --- a/aws/src/test/java/org/apache/iceberg/aws/s3/signer/TestS3V4RestSignerClient.java +++ 
b/aws/src/test/java/org/apache/iceberg/aws/s3/signer/TestS3V4RestSignerClient.java @@ -18,13 +18,14 @@ */ package org.apache.iceberg.aws.s3.signer; -import static org.apache.iceberg.aws.s3.signer.S3V4RestSignerClient.S3_SIGNER_URI; import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.InstanceOfAssertFactories.type; import static org.mockito.Mockito.when; import java.util.Map; import java.util.stream.Stream; +import org.apache.iceberg.CatalogProperties; +import org.apache.iceberg.rest.RESTCatalogProperties; import org.apache.iceberg.rest.RESTClient; import org.apache.iceberg.rest.auth.AuthProperties; import org.apache.iceberg.rest.auth.AuthSession; @@ -119,12 +120,21 @@ void authSessionOAuth2(Map properties, String expectedScope, Str public static Stream validOAuth2Properties() { return Stream.of( // No OAuth2 data - Arguments.of(Map.of(S3_SIGNER_URI, "https://signer.com"), "sign", null), + Arguments.of( + Map.of( + RESTCatalogProperties.SIGNER_URI, + "https://signer.com", + RESTCatalogProperties.SIGNER_ENDPOINT, + "v1/sign/s3"), + "sign", + null), // Token only Arguments.of( Map.of( - S3_SIGNER_URI, + RESTCatalogProperties.SIGNER_URI, "https://signer.com", + RESTCatalogProperties.SIGNER_ENDPOINT, + "v1/sign/s3", AuthProperties.AUTH_TYPE, AuthProperties.AUTH_TYPE_OAUTH2, OAuth2Properties.TOKEN, @@ -134,8 +144,10 @@ public static Stream validOAuth2Properties() { // Credential only: expect a token to be fetched Arguments.of( Map.of( - S3_SIGNER_URI, + RESTCatalogProperties.SIGNER_URI, "https://signer.com", + RESTCatalogProperties.SIGNER_ENDPOINT, + "v1/sign/s3", AuthProperties.AUTH_TYPE, AuthProperties.AUTH_TYPE_OAUTH2, OAuth2Properties.CREDENTIAL, @@ -145,8 +157,10 @@ public static Stream validOAuth2Properties() { // Token and credential: should use token as is, not fetch a new one Arguments.of( Map.of( - S3_SIGNER_URI, + RESTCatalogProperties.SIGNER_URI, "https://signer.com", + RESTCatalogProperties.SIGNER_ENDPOINT, + 
"v1/sign/s3", AuthProperties.AUTH_TYPE, AuthProperties.AUTH_TYPE_OAUTH2, OAuth2Properties.TOKEN, @@ -158,8 +172,10 @@ public static Stream validOAuth2Properties() { // Custom scope Arguments.of( Map.of( - S3_SIGNER_URI, + RESTCatalogProperties.SIGNER_URI, "https://signer.com", + RESTCatalogProperties.SIGNER_ENDPOINT, + "v1/sign/s3", AuthProperties.AUTH_TYPE, AuthProperties.AUTH_TYPE_OAUTH2, OAuth2Properties.CREDENTIAL, @@ -169,4 +185,63 @@ public static Stream validOAuth2Properties() { "custom", "token")); } + + @ParameterizedTest + @MethodSource("legacySignerProperties") + void legacySignerProperties( + Map properties, String expectedBaseSignerUri, String expectedEndpoint) + throws Exception { + try (S3V4RestSignerClient client = + ImmutableS3V4RestSignerClient.builder().properties(properties).build()) { + assertThat(client.baseSignerUri()).isEqualTo(expectedBaseSignerUri); + assertThat(client.endpoint()).isEqualTo(expectedEndpoint); + } + } + + @SuppressWarnings("deprecation") + public static Stream legacySignerProperties() { + return Stream.of( + // Only legacy properties + Arguments.of( + Map.of( + CatalogProperties.URI, + "https://catalog.com", + S3V4RestSignerClient.S3_SIGNER_URI, + "https://legacy-signer.com", + S3V4RestSignerClient.S3_SIGNER_ENDPOINT, + "v1/legacy/sign"), + "https://legacy-signer.com", + "https://legacy-signer.com/v1/legacy/sign"), + // Only new properties + Arguments.of( + Map.of( + CatalogProperties.URI, + "https://catalog.com", + RESTCatalogProperties.SIGNER_URI, + "https://new-signer.com", + RESTCatalogProperties.SIGNER_ENDPOINT, + "v1/new/sign"), + "https://new-signer.com", + "https://new-signer.com/v1/new/sign"), + // Mixed properties: legacy properties take precedence + Arguments.of( + Map.of( + CatalogProperties.URI, + "https://catalog.com", + RESTCatalogProperties.SIGNER_URI, + "https://new-signer.com", + RESTCatalogProperties.SIGNER_ENDPOINT, + "v1/new/sign", + S3V4RestSignerClient.S3_SIGNER_URI, + "https://legacy-signer.com", + 
S3V4RestSignerClient.S3_SIGNER_ENDPOINT, + "v1/legacy/sign"), + "https://legacy-signer.com", + "https://legacy-signer.com/v1/legacy/sign"), + // No signer properties: the catalog URI and the deprecated default endpoint are used + Arguments.of( + Map.of(CatalogProperties.URI, "https://catalog.com"), + "https://catalog.com", + "https://catalog.com/" + S3V4RestSignerClient.S3_SIGNER_DEFAULT_ENDPOINT)); + } } diff --git a/core/src/main/java/org/apache/iceberg/rest/Endpoint.java b/core/src/main/java/org/apache/iceberg/rest/Endpoint.java index c2369a0fa57d..d56a14d18954 100644 --- a/core/src/main/java/org/apache/iceberg/rest/Endpoint.java +++ b/core/src/main/java/org/apache/iceberg/rest/Endpoint.java @@ -66,6 +66,8 @@ public class Endpoint { Endpoint.create("POST", ResourcePaths.V1_TABLE_METRICS); public static final Endpoint V1_TABLE_CREDENTIALS = Endpoint.create("GET", ResourcePaths.V1_TABLE_CREDENTIALS); + public static final Endpoint V1_TABLE_REMOTE_SIGN = + Endpoint.create("POST", ResourcePaths.V1_TABLE_REMOTE_SIGN); // table scan plan endpoints public static final Endpoint V1_SUBMIT_TABLE_SCAN_PLAN = diff --git a/core/src/main/java/org/apache/iceberg/rest/RESTCatalogProperties.java b/core/src/main/java/org/apache/iceberg/rest/RESTCatalogProperties.java index 9f4d8835a71f..a5aa10bafc61 100644 --- a/core/src/main/java/org/apache/iceberg/rest/RESTCatalogProperties.java +++ b/core/src/main/java/org/apache/iceberg/rest/RESTCatalogProperties.java @@ -22,6 +22,7 @@ import java.util.Locale; import java.util.concurrent.TimeUnit; import java.util.stream.Collectors; +import org.apache.iceberg.CatalogProperties; public final class RESTCatalogProperties { @@ -95,4 +96,16 @@ public static ScanPlanningMode fromString(String mode) { .collect(Collectors.joining(", ")))); } } + + /** + * The base URI of the remote signer endpoint. Optional, defaults to {@link + * CatalogProperties#URI}. 
+ */ + public static final String SIGNER_URI = "signer.uri"; + + /** + * The endpoint path of the remote signer endpoint. If remote signing has been requested, this + * must be set. + */ + public static final String SIGNER_ENDPOINT = "signer.endpoint"; } diff --git a/core/src/main/java/org/apache/iceberg/rest/RESTSerializers.java b/core/src/main/java/org/apache/iceberg/rest/RESTSerializers.java index a429018c33d5..366bbfa9171b 100644 --- a/core/src/main/java/org/apache/iceberg/rest/RESTSerializers.java +++ b/core/src/main/java/org/apache/iceberg/rest/RESTSerializers.java @@ -53,6 +53,7 @@ import org.apache.iceberg.rest.requests.ImmutableCreateViewRequest; import org.apache.iceberg.rest.requests.ImmutableRegisterTableRequest; import org.apache.iceberg.rest.requests.ImmutableRegisterViewRequest; +import org.apache.iceberg.rest.requests.ImmutableRemoteSignRequest; import org.apache.iceberg.rest.requests.ImmutableReportMetricsRequest; import org.apache.iceberg.rest.requests.PlanTableScanRequest; import org.apache.iceberg.rest.requests.PlanTableScanRequestParser; @@ -60,6 +61,8 @@ import org.apache.iceberg.rest.requests.RegisterTableRequestParser; import org.apache.iceberg.rest.requests.RegisterViewRequest; import org.apache.iceberg.rest.requests.RegisterViewRequestParser; +import org.apache.iceberg.rest.requests.RemoteSignRequest; +import org.apache.iceberg.rest.requests.RemoteSignRequestParser; import org.apache.iceberg.rest.requests.ReportMetricsRequest; import org.apache.iceberg.rest.requests.ReportMetricsRequestParser; import org.apache.iceberg.rest.requests.UpdateTableRequest; @@ -74,6 +77,7 @@ import org.apache.iceberg.rest.responses.FetchScanTasksResponseParser; import org.apache.iceberg.rest.responses.ImmutableLoadCredentialsResponse; import org.apache.iceberg.rest.responses.ImmutableLoadViewResponse; +import org.apache.iceberg.rest.responses.ImmutableRemoteSignResponse; import org.apache.iceberg.rest.responses.LoadCredentialsResponse; import 
org.apache.iceberg.rest.responses.LoadCredentialsResponseParser; import org.apache.iceberg.rest.responses.LoadTableResponse; @@ -83,6 +87,8 @@ import org.apache.iceberg.rest.responses.OAuthTokenResponse; import org.apache.iceberg.rest.responses.PlanTableScanResponse; import org.apache.iceberg.rest.responses.PlanTableScanResponseParser; +import org.apache.iceberg.rest.responses.RemoteSignResponse; +import org.apache.iceberg.rest.responses.RemoteSignResponseParser; import org.apache.iceberg.util.JsonUtil; public class RESTSerializers { @@ -160,7 +166,15 @@ public static void registerAll(ObjectMapper mapper) { ImmutableLoadCredentialsResponse.class, new LoadCredentialsResponseSerializer<>()) .addDeserializer(LoadCredentialsResponse.class, new LoadCredentialsResponseDeserializer<>()) .addDeserializer( - ImmutableLoadCredentialsResponse.class, new LoadCredentialsResponseDeserializer<>()); + ImmutableLoadCredentialsResponse.class, new LoadCredentialsResponseDeserializer<>()) + .addSerializer(RemoteSignRequest.class, new RemoteSignRequestSerializer<>()) + .addSerializer(ImmutableRemoteSignRequest.class, new RemoteSignRequestSerializer<>()) + .addDeserializer(RemoteSignRequest.class, new RemoteSignRequestDeserializer<>()) + .addDeserializer(ImmutableRemoteSignRequest.class, new RemoteSignRequestDeserializer<>()) + .addSerializer(RemoteSignResponse.class, new RemoteSignResponseSerializer<>()) + .addSerializer(ImmutableRemoteSignResponse.class, new RemoteSignResponseSerializer<>()) + .addDeserializer(RemoteSignResponse.class, new RemoteSignResponseDeserializer<>()) + .addDeserializer(ImmutableRemoteSignResponse.class, new RemoteSignResponseDeserializer<>()); mapper.registerModule(module); } @@ -650,4 +664,39 @@ boolean isCaseSensitive() { return caseSensitive; } } + + static class RemoteSignRequestSerializer extends JsonSerializer { + @Override + public void serialize(T request, JsonGenerator gen, SerializerProvider serializers) + throws IOException { + 
RemoteSignRequestParser.toJson(request, gen); + } + } + + static class RemoteSignRequestDeserializer + extends JsonDeserializer { + @Override + public T deserialize(JsonParser p, DeserializationContext context) throws IOException { + JsonNode jsonNode = p.getCodec().readTree(p); + return (T) RemoteSignRequestParser.fromJson(jsonNode); + } + } + + static class RemoteSignResponseSerializer + extends JsonSerializer { + @Override + public void serialize(T response, JsonGenerator gen, SerializerProvider serializers) + throws IOException { + RemoteSignResponseParser.toJson(response, gen); + } + } + + static class RemoteSignResponseDeserializer + extends JsonDeserializer { + @Override + public T deserialize(JsonParser p, DeserializationContext context) throws IOException { + JsonNode jsonNode = p.getCodec().readTree(p); + return (T) RemoteSignResponseParser.fromJson(jsonNode); + } + } } diff --git a/core/src/main/java/org/apache/iceberg/rest/ResourcePaths.java b/core/src/main/java/org/apache/iceberg/rest/ResourcePaths.java index a5dea35bf1c9..be2fde22053d 100644 --- a/core/src/main/java/org/apache/iceberg/rest/ResourcePaths.java +++ b/core/src/main/java/org/apache/iceberg/rest/ResourcePaths.java @@ -35,6 +35,8 @@ public class ResourcePaths { public static final String V1_TABLE = "/v1/{prefix}/namespaces/{namespace}/tables/{table}"; public static final String V1_TABLE_CREDENTIALS = "/v1/{prefix}/namespaces/{namespace}/tables/{table}/credentials"; + public static final String V1_TABLE_REMOTE_SIGN = + "/v1/{prefix}/namespaces/{namespace}/tables/{table}/sign"; public static final String V1_TABLE_REGISTER = "/v1/{prefix}/namespaces/{namespace}/register"; public static final String V1_TABLE_METRICS = "/v1/{prefix}/namespaces/{namespace}/tables/{table}/metrics"; @@ -130,6 +132,17 @@ public String metrics(TableIdentifier identifier) { "metrics"); } + public String remoteSign(TableIdentifier identifier) { + return SLASH.join( + "v1", + prefix, + "namespaces", + 
pathEncode(identifier.namespace()), + "tables", + RESTUtil.encodeString(identifier.name()), + "sign"); + } + public String commitTransaction() { return SLASH.join("v1", prefix, "transactions", "commit"); } diff --git a/core/src/main/java/org/apache/iceberg/rest/requests/RemoteSignRequest.java b/core/src/main/java/org/apache/iceberg/rest/requests/RemoteSignRequest.java new file mode 100644 index 000000000000..561007c480eb --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/rest/requests/RemoteSignRequest.java @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.rest.requests; + +import java.net.URI; +import java.util.List; +import java.util.Map; +import javax.annotation.Nullable; +import org.apache.iceberg.rest.RESTRequest; +import org.immutables.value.Value; + +@Value.Immutable +public interface RemoteSignRequest extends RESTRequest { + String region(); + + String method(); + + URI uri(); + + Map> headers(); + + Map properties(); + + @Value.Default + @Nullable + default String body() { + return null; + } + + @Nullable + String provider(); + + @Override + default void validate() {} +} diff --git a/core/src/main/java/org/apache/iceberg/rest/requests/RemoteSignRequestParser.java b/core/src/main/java/org/apache/iceberg/rest/requests/RemoteSignRequestParser.java new file mode 100644 index 000000000000..61b44cc177d1 --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/rest/requests/RemoteSignRequestParser.java @@ -0,0 +1,141 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.rest.requests; + +import com.fasterxml.jackson.core.JsonGenerator; +import com.fasterxml.jackson.databind.JsonNode; +import java.io.IOException; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.util.JsonUtil; + +public class RemoteSignRequestParser { + + private static final String REGION = "region"; + private static final String METHOD = "method"; + private static final String URI = "uri"; + private static final String HEADERS = "headers"; + private static final String PROPERTIES = "properties"; + private static final String BODY = "body"; + private static final String PROVIDER = "provider"; + + private RemoteSignRequestParser() {} + + public static String toJson(RemoteSignRequest request) { + return toJson(request, false); + } + + public static String toJson(RemoteSignRequest request, boolean pretty) { + return JsonUtil.generate(gen -> toJson(request, gen), pretty); + } + + public static void toJson(RemoteSignRequest request, JsonGenerator gen) throws IOException { + Preconditions.checkArgument(null != request, "Invalid remote sign request: null"); + + gen.writeStartObject(); + + gen.writeStringField(REGION, request.region()); + gen.writeStringField(METHOD, request.method()); + gen.writeStringField(URI, request.uri().toString()); + headersToJson(HEADERS, request.headers(), gen); + + if (!request.properties().isEmpty()) { + JsonUtil.writeStringMap(PROPERTIES, request.properties(), gen); + } + + if (request.body() != null && !request.body().isEmpty()) { + gen.writeStringField(BODY, request.body()); + } + + if (request.provider() != null) { + gen.writeStringField(PROVIDER, request.provider()); + } + + gen.writeEndObject(); + } + + public static RemoteSignRequest fromJson(String json) { + return 
JsonUtil.parse(json, RemoteSignRequestParser::fromJson); + } + + public static RemoteSignRequest fromJson(JsonNode json) { + Preconditions.checkArgument(null != json, "Cannot parse remote sign request from null object"); + Preconditions.checkArgument( + json.isObject(), "Cannot parse remote sign request from non-object: %s", json); + + String region = JsonUtil.getString(REGION, json); + String method = JsonUtil.getString(METHOD, json); + java.net.URI uri = java.net.URI.create(JsonUtil.getString(URI, json)); + Map> headers = headersFromJson(HEADERS, json); + + ImmutableRemoteSignRequest.Builder builder = + ImmutableRemoteSignRequest.builder() + .region(region) + .method(method) + .uri(uri) + .headers(headers); + + if (json.has(PROPERTIES)) { + builder.properties(JsonUtil.getStringMap(PROPERTIES, json)); + } + + if (json.has(BODY)) { + builder.body(JsonUtil.getString(BODY, json)); + } + + if (json.has(PROVIDER)) { + builder.provider(JsonUtil.getString(PROVIDER, json)); + } + + return builder.build(); + } + + public static void headersToJson( + String property, Map> headers, JsonGenerator gen) throws IOException { + gen.writeObjectFieldStart(property); + for (Entry> entry : headers.entrySet()) { + gen.writeFieldName(entry.getKey()); + + gen.writeStartArray(); + for (String val : entry.getValue()) { + gen.writeString(val); + } + gen.writeEndArray(); + } + gen.writeEndObject(); + } + + public static Map> headersFromJson(String property, JsonNode json) { + Map> headers = Maps.newHashMap(); + JsonNode headersNode = JsonUtil.get(property, json); + headersNode + .properties() + .forEach( + entry -> { + String key = entry.getKey(); + List values = Arrays.asList(JsonUtil.getStringArray(entry.getValue())); + headers.put(key, values); + }); + return headers; + } +} diff --git a/core/src/main/java/org/apache/iceberg/rest/responses/RemoteSignResponse.java b/core/src/main/java/org/apache/iceberg/rest/responses/RemoteSignResponse.java new file mode 100644 index 
000000000000..c5009505bf4f --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/rest/responses/RemoteSignResponse.java @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.rest.responses; + +import java.net.URI; +import java.util.List; +import java.util.Map; +import org.apache.iceberg.rest.RESTResponse; +import org.immutables.value.Value; + +@Value.Immutable +public interface RemoteSignResponse extends RESTResponse { + URI uri(); + + Map> headers(); + + @Override + default void validate() {} +} diff --git a/core/src/main/java/org/apache/iceberg/rest/responses/RemoteSignResponseParser.java b/core/src/main/java/org/apache/iceberg/rest/responses/RemoteSignResponseParser.java new file mode 100644 index 000000000000..f53e844c6162 --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/rest/responses/RemoteSignResponseParser.java @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.rest.responses; + +import com.fasterxml.jackson.core.JsonGenerator; +import com.fasterxml.jackson.databind.JsonNode; +import java.io.IOException; +import java.net.URI; +import java.util.List; +import java.util.Map; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.rest.requests.RemoteSignRequestParser; +import org.apache.iceberg.util.JsonUtil; + +public class RemoteSignResponseParser { + + private static final String URI_FIELD = "uri"; + private static final String HEADERS = "headers"; + + private RemoteSignResponseParser() {} + + public static String toJson(RemoteSignResponse response) { + return toJson(response, false); + } + + public static String toJson(RemoteSignResponse response, boolean pretty) { + return JsonUtil.generate(gen -> toJson(response, gen), pretty); + } + + public static void toJson(RemoteSignResponse response, JsonGenerator gen) throws IOException { + Preconditions.checkArgument(null != response, "Invalid remote sign response: null"); + + gen.writeStartObject(); + + gen.writeStringField(URI_FIELD, response.uri().toString()); + RemoteSignRequestParser.headersToJson(HEADERS, response.headers(), gen); + + gen.writeEndObject(); + } + + public static RemoteSignResponse fromJson(String json) { + return JsonUtil.parse(json, RemoteSignResponseParser::fromJson); + } + + public static 
RemoteSignResponse fromJson(JsonNode json) { + Preconditions.checkArgument(null != json, "Cannot parse remote sign response from null object"); + Preconditions.checkArgument( + json.isObject(), "Cannot parse remote sign response from non-object: %s", json); + + URI uri = URI.create(JsonUtil.getString(URI_FIELD, json)); + Map> headers = RemoteSignRequestParser.headersFromJson(HEADERS, json); + + return ImmutableRemoteSignResponse.builder().uri(uri).headers(headers).build(); + } +} diff --git a/core/src/test/java/org/apache/iceberg/rest/RemoteSignerServlet.java b/core/src/test/java/org/apache/iceberg/rest/RemoteSignerServlet.java new file mode 100644 index 000000000000..c55224c00b2f --- /dev/null +++ b/core/src/test/java/org/apache/iceberg/rest/RemoteSignerServlet.java @@ -0,0 +1,202 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.rest; + +import static java.lang.String.format; +import static org.apache.iceberg.rest.RESTCatalogAdapter.castRequest; +import static org.apache.iceberg.rest.RESTCatalogAdapter.castResponse; + +import jakarta.servlet.http.HttpServlet; +import jakarta.servlet.http.HttpServletRequest; +import jakarta.servlet.http.HttpServletResponse; +import java.io.InputStreamReader; +import java.io.Reader; +import java.util.Locale; +import java.util.Map; +import java.util.Set; +import org.apache.hc.core5.http.ContentType; +import org.apache.hc.core5.http.HttpHeaders; +import org.apache.iceberg.exceptions.RESTException; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.io.CharStreams; +import org.apache.iceberg.rest.requests.RemoteSignRequest; +import org.apache.iceberg.rest.responses.OAuthTokenResponse; +import org.apache.iceberg.rest.responses.RemoteSignResponse; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Base servlet for remote signing tests. This servlet handles OAuth token requests and delegates + * signing to subclasses. It does not handle any other requests. + * + *

      Subclasses must implement {@link #signRequest(RemoteSignRequest)} to provide the actual + * signing logic. + */ +public abstract class RemoteSignerServlet extends HttpServlet { + + private static final Logger LOG = LoggerFactory.getLogger(RemoteSignerServlet.class); + private static final String POST = "POST"; + + private static final String CACHE_CONTROL = "Cache-Control"; + private static final String CACHE_CONTROL_PRIVATE = "private"; + private static final String CACHE_CONTROL_NO_CACHE = "no-cache"; + + private static final Set CACHEABLE_METHODS = Set.of("GET", "HEAD"); + + private static final Map RESPONSE_HEADERS = + ImmutableMap.of(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.getMimeType()); + + private final String signEndpoint; + + protected RemoteSignerServlet(String signEndpoint) { + this.signEndpoint = signEndpoint; + } + + @Override + protected void doGet(HttpServletRequest request, HttpServletResponse response) { + execute(request, response); + } + + @Override + protected void doHead(HttpServletRequest request, HttpServletResponse response) { + execute(request, response); + } + + @Override + protected void doPost(HttpServletRequest request, HttpServletResponse response) { + execute(request, response); + } + + @Override + protected void doDelete(HttpServletRequest request, HttpServletResponse response) { + execute(request, response); + } + + /** + * Sign the given request and return the signed response. + * + * @param request the remote sign request + * @return the signed response + */ + protected abstract RemoteSignResponse signRequest(RemoteSignRequest request); + + /** + * Called after a sign request is parsed but before signing. Subclasses can override to add + * additional validation. + * + * @param request the remote sign request + */ + protected void validateSignRequest(RemoteSignRequest request) { + // no-op by default + } + + /** + * Called after signing to allow subclasses to add response headers (e.g., cache control). 
By + * default, this method adds cache control headers based on the request method. + * + * @param request the original sign request + * @param response the HTTP response to add headers to + */ + protected void addSignResponseHeaders(RemoteSignRequest request, HttpServletResponse response) { + if (CACHEABLE_METHODS.contains(request.method().toUpperCase(Locale.ROOT))) { + // tell the client this can be cached + response.setHeader(CACHE_CONTROL, CACHE_CONTROL_PRIVATE); + } else { + response.setHeader(CACHE_CONTROL, CACHE_CONTROL_NO_CACHE); + } + } + + private OAuthTokenResponse handleOAuth(Map requestMap) { + String grantType = requestMap.get("grant_type"); + switch (grantType) { + case "client_credentials": + return castResponse( + OAuthTokenResponse.class, + OAuthTokenResponse.builder() + .withToken("client-credentials-token:sub=" + requestMap.get("client_id")) + .withIssuedTokenType("urn:ietf:params:oauth:token-type:access_token") + .withTokenType("Bearer") + .setExpirationInSeconds(10000) + .build()); + + case "urn:ietf:params:oauth:grant-type:token-exchange": + String actor = requestMap.get("actor_token"); + String token = + String.format( + "token-exchange-token:sub=%s%s", + requestMap.get("subject_token"), actor != null ? 
",act=" + actor : ""); + return castResponse( + OAuthTokenResponse.class, + OAuthTokenResponse.builder() + .withToken(token) + .withIssuedTokenType("urn:ietf:params:oauth:token-type:access_token") + .withTokenType("Bearer") + .setExpirationInSeconds(10000) + .build()); + + default: + throw new UnsupportedOperationException("Unsupported grant_type: " + grantType); + } + } + + protected void execute(HttpServletRequest request, HttpServletResponse response) { + response.setStatus(HttpServletResponse.SC_OK); + RESPONSE_HEADERS.forEach(response::setHeader); + + String path = request.getRequestURI().substring(1); + Object requestBody; + try { + if (POST.equals(request.getMethod()) && signEndpoint.equals(path)) { + RemoteSignRequest signRequest = + castRequest( + RemoteSignRequest.class, + RESTObjectMapper.mapper().readValue(request.getReader(), RemoteSignRequest.class)); + validateSignRequest(signRequest); + RemoteSignResponse signResponse = signRequest(signRequest); + addSignResponseHeaders(signRequest, response); + RESTObjectMapper.mapper().writeValue(response.getWriter(), signResponse); + } else if (POST.equals(request.getMethod()) && ResourcePaths.tokens().equals(path)) { + try (Reader reader = new InputStreamReader(request.getInputStream())) { + requestBody = RESTUtil.decodeFormData(CharStreams.toString(reader)); + } + + @SuppressWarnings("unchecked") + OAuthTokenResponse oAuthTokenResponse = + handleOAuth((Map) castRequest(Map.class, requestBody)); + RESTObjectMapper.mapper().writeValue(response.getWriter(), oAuthTokenResponse); + } else { + response.setStatus(HttpServletResponse.SC_BAD_REQUEST); + RESTObjectMapper.mapper() + .writeValue( + response.getWriter(), + org.apache.iceberg.rest.responses.ErrorResponse.builder() + .responseCode(400) + .withType("BadRequestException") + .withMessage(format("No route for request: %s %s", request.getMethod(), path)) + .build()); + } + } catch (RESTException e) { + LOG.error("Error processing REST request", e); + 
response.setStatus(HttpServletResponse.SC_INTERNAL_SERVER_ERROR); + } catch (Exception e) { + LOG.error("Unexpected exception when processing REST request", e); + response.setStatus(HttpServletResponse.SC_INTERNAL_SERVER_ERROR); + } + } +} diff --git a/core/src/test/java/org/apache/iceberg/rest/TestResourcePaths.java b/core/src/test/java/org/apache/iceberg/rest/TestResourcePaths.java index f40b1302f90e..a742b89a7627 100644 --- a/core/src/test/java/org/apache/iceberg/rest/TestResourcePaths.java +++ b/core/src/test/java/org/apache/iceberg/rest/TestResourcePaths.java @@ -336,4 +336,20 @@ public void cancelPlanEndpointPath() { assertThat(withoutPrefix.plan(complexId, "plan-xyz-789")) .isEqualTo("v1/namespaces/db%1Fschema/tables/my_table/plan/plan-xyz-789"); } + + @Test + public void testRemoteSign() { + TableIdentifier tableId = TableIdentifier.of("test_namespace", "test_table"); + assertThat(withPrefix.remoteSign(tableId)) + .isEqualTo("v1/ws/catalog/namespaces/test_namespace/tables/test_table/sign"); + assertThat(withoutPrefix.remoteSign(tableId)) + .isEqualTo("v1/namespaces/test_namespace/tables/test_table/sign"); + + // Test with different identifiers + TableIdentifier complexId = TableIdentifier.of(Namespace.of("db", "schema"), "my_table"); + assertThat(withPrefix.remoteSign(complexId)) + .isEqualTo("v1/ws/catalog/namespaces/db%1Fschema/tables/my_table/sign"); + assertThat(withoutPrefix.remoteSign(complexId)) + .isEqualTo("v1/namespaces/db%1Fschema/tables/my_table/sign"); + } } diff --git a/aws/src/test/java/org/apache/iceberg/aws/s3/signer/TestS3SignRequestParser.java b/core/src/test/java/org/apache/iceberg/rest/requests/TestRemoteSignRequestParser.java similarity index 70% rename from aws/src/test/java/org/apache/iceberg/aws/s3/signer/TestS3SignRequestParser.java rename to core/src/test/java/org/apache/iceberg/rest/requests/TestRemoteSignRequestParser.java index 75ae2d88cccf..3515588e444d 100644 --- 
a/aws/src/test/java/org/apache/iceberg/aws/s3/signer/TestS3SignRequestParser.java +++ b/core/src/test/java/org/apache/iceberg/rest/requests/TestRemoteSignRequestParser.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -package org.apache.iceberg.aws.s3.signer; +package org.apache.iceberg.rest.requests; import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatThrownBy; @@ -28,37 +28,39 @@ import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.junit.jupiter.api.Test; -public class TestS3SignRequestParser { +public class TestRemoteSignRequestParser { @Test public void nullRequest() { - assertThatThrownBy(() -> S3SignRequestParser.fromJson((JsonNode) null)) + assertThatThrownBy(() -> RemoteSignRequestParser.fromJson((JsonNode) null)) .isInstanceOf(IllegalArgumentException.class) - .hasMessage("Cannot parse s3 sign request from null object"); + .hasMessage("Cannot parse remote sign request from null object"); - assertThatThrownBy(() -> S3SignRequestParser.toJson(null)) + assertThatThrownBy(() -> RemoteSignRequestParser.toJson(null)) .isInstanceOf(IllegalArgumentException.class) - .hasMessage("Invalid s3 sign request: null"); + .hasMessage("Invalid remote sign request: null"); } @Test public void missingFields() { - assertThatThrownBy(() -> S3SignRequestParser.fromJson("{}")) + assertThatThrownBy(() -> RemoteSignRequestParser.fromJson("{}")) .isInstanceOf(IllegalArgumentException.class) .hasMessage("Cannot parse missing string: region"); - assertThatThrownBy(() -> S3SignRequestParser.fromJson("{\"region\":\"us-west-2\"}")) + assertThatThrownBy(() -> RemoteSignRequestParser.fromJson("{\"region\":\"us-west-2\"}")) .isInstanceOf(IllegalArgumentException.class) .hasMessage("Cannot parse missing string: method"); assertThatThrownBy( - () -> S3SignRequestParser.fromJson("{\"region\":\"us-west-2\", \"method\" : \"PUT\"}")) + () -> + 
RemoteSignRequestParser.fromJson( + "{\"region\":\"us-west-2\", \"method\" : \"PUT\"}")) .isInstanceOf(IllegalArgumentException.class) .hasMessage("Cannot parse missing string: uri"); assertThatThrownBy( () -> - S3SignRequestParser.fromJson( + RemoteSignRequestParser.fromJson( "{\n" + " \"region\" : \"us-west-2\",\n" + " \"method\" : \"PUT\",\n" @@ -72,7 +74,7 @@ public void missingFields() { public void invalidMethod() { assertThatThrownBy( () -> - S3SignRequestParser.fromJson( + RemoteSignRequestParser.fromJson( "{\n" + " \"region\" : \"us-west-2\",\n" + " \"method\" : 23,\n" @@ -87,7 +89,7 @@ public void invalidMethod() { public void invalidUri() { assertThatThrownBy( () -> - S3SignRequestParser.fromJson( + RemoteSignRequestParser.fromJson( "{\n" + " \"region\" : \"us-west-2\",\n" + " \"method\" : \"PUT\",\n" @@ -102,7 +104,7 @@ public void invalidUri() { public void invalidRegion() { assertThatThrownBy( () -> - S3SignRequestParser.fromJson( + RemoteSignRequestParser.fromJson( "{\n" + " \"region\" : 23,\n" + " \"method\" : \"PUT\",\n" @@ -115,8 +117,8 @@ public void invalidRegion() { @Test public void roundTripSerde() { - ImmutableS3SignRequest s3SignRequest = - ImmutableS3SignRequest.builder() + RemoteSignRequest request = + ImmutableRemoteSignRequest.builder() .uri(URI.create("http://localhost:49208/iceberg-signer-test")) .method("PUT") .region("us-west-2") @@ -132,8 +134,8 @@ public void roundTripSerde() { Arrays.asList("aws-sdk-java/2.20.18", "Linux/5.4.0-126"))) .build(); - String json = S3SignRequestParser.toJson(s3SignRequest, true); - assertThat(S3SignRequestParser.fromJson(json)).isEqualTo(s3SignRequest); + String json = RemoteSignRequestParser.toJson(request, true); + assertThat(RemoteSignRequestParser.fromJson(json)).isEqualTo(request); assertThat(json) .isEqualTo( "{\n" @@ -151,8 +153,8 @@ public void roundTripSerde() { @Test public void roundTripSerdeWithProperties() { - ImmutableS3SignRequest s3SignRequest = - ImmutableS3SignRequest.builder() + 
RemoteSignRequest request = + ImmutableRemoteSignRequest.builder() .uri(URI.create("http://localhost:49208/iceberg-signer-test")) .method("PUT") .region("us-west-2") @@ -169,8 +171,8 @@ public void roundTripSerdeWithProperties() { .properties(ImmutableMap.of("k1", "v1")) .build(); - String json = S3SignRequestParser.toJson(s3SignRequest, true); - assertThat(S3SignRequestParser.fromJson(json)).isEqualTo(s3SignRequest); + String json = RemoteSignRequestParser.toJson(request, true); + assertThat(RemoteSignRequestParser.fromJson(json)).isEqualTo(request); assertThat(json) .isEqualTo( "{\n" @@ -191,8 +193,8 @@ public void roundTripSerdeWithProperties() { @Test public void roundTripWithBody() { - ImmutableS3SignRequest s3SignRequest = - ImmutableS3SignRequest.builder() + RemoteSignRequest request = + ImmutableRemoteSignRequest.builder() .uri(URI.create("http://localhost:49208/iceberg-signer-test")) .method("PUT") .region("us-west-2") @@ -210,8 +212,8 @@ public void roundTripWithBody() { .body("some-body") .build(); - String json = S3SignRequestParser.toJson(s3SignRequest, true); - assertThat(S3SignRequestParser.fromJson(json)).isEqualTo(s3SignRequest); + String json = RemoteSignRequestParser.toJson(request, true); + assertThat(RemoteSignRequestParser.fromJson(json)).isEqualTo(request); assertThat(json) .isEqualTo( "{\n" @@ -230,4 +232,46 @@ public void roundTripWithBody() { + " \"body\" : \"some-body\"\n" + "}"); } + + @Test + public void roundTripWithProvider() { + RemoteSignRequest request = + ImmutableRemoteSignRequest.builder() + .uri(URI.create("http://localhost:49208/iceberg-signer-test")) + .method("PUT") + .region("us-west-2") + .headers( + ImmutableMap.of( + "amz-sdk-request", + Arrays.asList("attempt=1", "max=4"), + "Content-Length", + Collections.singletonList("191"), + "Content-Type", + Collections.singletonList("application/json"), + "User-Agent", + Arrays.asList("aws-sdk-java/2.20.18", "Linux/5.4.0-126"))) + .properties(ImmutableMap.of("k1", "v1")) + 
.provider("s3") + .build(); + + String json = RemoteSignRequestParser.toJson(request, true); + assertThat(RemoteSignRequestParser.fromJson(json)).isEqualTo(request); + assertThat(json) + .isEqualTo( + "{\n" + + " \"region\" : \"us-west-2\",\n" + + " \"method\" : \"PUT\",\n" + + " \"uri\" : \"http://localhost:49208/iceberg-signer-test\",\n" + + " \"headers\" : {\n" + + " \"amz-sdk-request\" : [ \"attempt=1\", \"max=4\" ],\n" + + " \"Content-Length\" : [ \"191\" ],\n" + + " \"Content-Type\" : [ \"application/json\" ],\n" + + " \"User-Agent\" : [ \"aws-sdk-java/2.20.18\", \"Linux/5.4.0-126\" ]\n" + + " },\n" + + " \"properties\" : {\n" + + " \"k1\" : \"v1\"\n" + + " },\n" + + " \"provider\" : \"s3\"\n" + + "}"); + } } diff --git a/aws/src/test/java/org/apache/iceberg/aws/s3/signer/TestS3SignResponseParser.java b/core/src/test/java/org/apache/iceberg/rest/responses/TestRemoteSignResponseParser.java similarity index 78% rename from aws/src/test/java/org/apache/iceberg/aws/s3/signer/TestS3SignResponseParser.java rename to core/src/test/java/org/apache/iceberg/rest/responses/TestRemoteSignResponseParser.java index 19f2f540d765..b6d1178c3fa1 100644 --- a/aws/src/test/java/org/apache/iceberg/aws/s3/signer/TestS3SignResponseParser.java +++ b/core/src/test/java/org/apache/iceberg/rest/responses/TestRemoteSignResponseParser.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. 
*/ -package org.apache.iceberg.aws.s3.signer; +package org.apache.iceberg.rest.responses; import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatThrownBy; @@ -28,28 +28,28 @@ import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.junit.jupiter.api.Test; -public class TestS3SignResponseParser { +public class TestRemoteSignResponseParser { @Test public void nullResponse() { - assertThatThrownBy(() -> S3SignResponseParser.fromJson((JsonNode) null)) + assertThatThrownBy(() -> RemoteSignResponseParser.fromJson((JsonNode) null)) .isInstanceOf(IllegalArgumentException.class) - .hasMessage("Cannot parse s3 sign response from null object"); + .hasMessage("Cannot parse remote sign response from null object"); - assertThatThrownBy(() -> S3SignResponseParser.toJson(null)) + assertThatThrownBy(() -> RemoteSignResponseParser.toJson(null)) .isInstanceOf(IllegalArgumentException.class) - .hasMessage("Invalid s3 sign response: null"); + .hasMessage("Invalid remote sign response: null"); } @Test public void missingFields() { - assertThatThrownBy(() -> S3SignResponseParser.fromJson("{}")) + assertThatThrownBy(() -> RemoteSignResponseParser.fromJson("{}")) .isInstanceOf(IllegalArgumentException.class) .hasMessage("Cannot parse missing string: uri"); assertThatThrownBy( () -> - S3SignResponseParser.fromJson( + RemoteSignResponseParser.fromJson( "{\"uri\" : \"http://localhost:49208/iceberg-signer-test\"}")) .isInstanceOf(IllegalArgumentException.class) .hasMessage("Cannot parse missing field: headers"); @@ -57,15 +57,15 @@ public void missingFields() { @Test public void invalidUri() { - assertThatThrownBy(() -> S3SignResponseParser.fromJson("{\"uri\" : 45, \"headers\" : {}}}")) + assertThatThrownBy(() -> RemoteSignResponseParser.fromJson("{\"uri\" : 45, \"headers\" : {}}}")) .isInstanceOf(IllegalArgumentException.class) .hasMessage("Cannot parse to a string value: uri: 45"); } @Test public void 
roundTripSerde() { - S3SignResponse s3SignResponse = - ImmutableS3SignResponse.builder() + RemoteSignResponse response = + ImmutableRemoteSignResponse.builder() .uri(URI.create("http://localhost:49208/iceberg-signer-test")) .headers( ImmutableMap.of( @@ -79,8 +79,8 @@ public void roundTripSerde() { Arrays.asList("aws-sdk-java/2.20.18", "Linux/5.4.0-126"))) .build(); - String json = S3SignResponseParser.toJson(s3SignResponse, true); - assertThat(S3SignResponseParser.fromJson(json)).isEqualTo(s3SignResponse); + String json = RemoteSignResponseParser.toJson(response, true); + assertThat(RemoteSignResponseParser.fromJson(json)).isEqualTo(response); assertThat(json) .isEqualTo( "{\n" From f2e7a65678b4c529bb53b299450faf95ef24385b Mon Sep 17 00:00:00 2001 From: Akshay Thorat Date: Mon, 4 May 2026 01:32:05 -0700 Subject: [PATCH 151/197] AWS, GCP: add Kryo round-trip regression test for refreshed storage credentials (#16112) --- .../aws/s3/TestS3FileIOCredentialRefresh.java | 80 +++++++++++++++++++ .../gcs/TestGCSFileIOCredentialRefresh.java | 67 ++++++++++++++++ 2 files changed, 147 insertions(+) diff --git a/aws/src/test/java/org/apache/iceberg/aws/s3/TestS3FileIOCredentialRefresh.java b/aws/src/test/java/org/apache/iceberg/aws/s3/TestS3FileIOCredentialRefresh.java index 0a8b0e084873..170857ca84b4 100644 --- a/aws/src/test/java/org/apache/iceberg/aws/s3/TestS3FileIOCredentialRefresh.java +++ b/aws/src/test/java/org/apache/iceberg/aws/s3/TestS3FileIOCredentialRefresh.java @@ -29,6 +29,7 @@ import java.util.Map; import java.util.concurrent.TimeUnit; import org.apache.iceberg.CatalogProperties; +import org.apache.iceberg.TestHelpers; import org.apache.iceberg.aws.AwsProperties; import org.apache.iceberg.io.StorageCredential; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; @@ -265,4 +266,83 @@ public void credentialRefreshWithinFiveMinuteWindow() { }); } } + + @Test + public void refreshedCredentialsAreKryoSerializable() throws Exception { + // 
Verify that an S3FileIO whose credentials have been refreshed at runtime can still be + // round-tripped through Kryo. The internal storageCredentials list must be backed by a + // collection that Kryo can serialize and deserialize. + String nearExpiryMs = Long.toString(Instant.now().plus(3, ChronoUnit.MINUTES).toEpochMilli()); + + StorageCredential initialCredential = + StorageCredential.create( + "s3://bucket/path", + ImmutableMap.of( + S3FileIOProperties.ACCESS_KEY_ID, + "initialAccessKey", + S3FileIOProperties.SECRET_ACCESS_KEY, + "initialSecretKey", + S3FileIOProperties.SESSION_TOKEN, + "initialToken", + S3FileIOProperties.SESSION_TOKEN_EXPIRES_AT_MS, + nearExpiryMs)); + + String refreshedExpiryMs = + Long.toString(Instant.now().plus(1, ChronoUnit.HOURS).toEpochMilli()); + LoadCredentialsResponse refreshResponse = + ImmutableLoadCredentialsResponse.builder() + .addCredentials( + ImmutableCredential.builder() + .prefix("s3://bucket/path") + .config( + ImmutableMap.of( + S3FileIOProperties.ACCESS_KEY_ID, + "refreshedAccessKey", + S3FileIOProperties.SECRET_ACCESS_KEY, + "refreshedSecretKey", + S3FileIOProperties.SESSION_TOKEN, + "refreshedToken", + S3FileIOProperties.SESSION_TOKEN_EXPIRES_AT_MS, + refreshedExpiryMs)) + .build()) + .build(); + + HttpRequest mockRequest = request("/v1/credentials").withMethod(HttpMethod.GET.name()); + mockServer + .when(mockRequest) + .respond( + response(LoadCredentialsResponseParser.toJson(refreshResponse)).withStatusCode(200)); + + Map properties = + ImmutableMap.of( + AwsProperties.CLIENT_FACTORY, + StaticClientFactory.class.getName(), + VendedCredentialsProvider.URI, + CREDENTIALS_URI, + CatalogProperties.URI, + CATALOG_URI, + "init-creation-stacktrace", + "false"); + + StaticClientFactory.client = null; + try (S3FileIO fileIO = new S3FileIO()) { + fileIO.initialize(properties); + fileIO.setCredentials(List.of(initialCredential)); + + fileIO.client(); + + // Wait for the refresh to update the in-memory credentials + 
Awaitility.await() + .atMost(10, TimeUnit.SECONDS) + .untilAsserted( + () -> + assertThat(fileIO.credentials().get(0).config()) + .containsEntry(S3FileIOProperties.ACCESS_KEY_ID, "refreshedAccessKey")); + + // Round-trip through Kryo and verify the credentials still match + try (S3FileIO deserialized = TestHelpers.KryoHelpers.roundTripSerialize(fileIO)) { + assertThat(deserialized.credentials()).isEqualTo(fileIO.credentials()); + } + } + } } diff --git a/gcp/src/test/java/org/apache/iceberg/gcp/gcs/TestGCSFileIOCredentialRefresh.java b/gcp/src/test/java/org/apache/iceberg/gcp/gcs/TestGCSFileIOCredentialRefresh.java index d0c05483add3..0b9bb37f5f90 100644 --- a/gcp/src/test/java/org/apache/iceberg/gcp/gcs/TestGCSFileIOCredentialRefresh.java +++ b/gcp/src/test/java/org/apache/iceberg/gcp/gcs/TestGCSFileIOCredentialRefresh.java @@ -29,6 +29,7 @@ import java.util.Map; import java.util.concurrent.TimeUnit; import org.apache.iceberg.CatalogProperties; +import org.apache.iceberg.TestHelpers; import org.apache.iceberg.gcp.GCPProperties; import org.apache.iceberg.io.StorageCredential; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; @@ -226,4 +227,70 @@ void credentialRefreshWithinFiveMinuteWindow() { }); } } + + @Test + void refreshedCredentialsAreKryoSerializable() throws Exception { + // Verify that a GCSFileIO whose credentials have been refreshed at runtime can still be + // round-tripped through Kryo. The internal storageCredentials list must be backed by a + // collection that Kryo can serialize and deserialize. 
+ String nearExpiryMs = Long.toString(Instant.now().plus(3, ChronoUnit.MINUTES).toEpochMilli()); + + StorageCredential initialCredential = + StorageCredential.create( + "gs://bucket/path", + ImmutableMap.of( + GCPProperties.GCS_OAUTH2_TOKEN, + "initialToken", + GCPProperties.GCS_OAUTH2_TOKEN_EXPIRES_AT, + nearExpiryMs)); + + String refreshedExpiryMs = + Long.toString(Instant.now().plus(1, ChronoUnit.HOURS).toEpochMilli()); + LoadCredentialsResponse refreshResponse = + ImmutableLoadCredentialsResponse.builder() + .addCredentials( + ImmutableCredential.builder() + .prefix("gs://bucket/path") + .config( + ImmutableMap.of( + GCPProperties.GCS_OAUTH2_TOKEN, + "refreshedToken", + GCPProperties.GCS_OAUTH2_TOKEN_EXPIRES_AT, + refreshedExpiryMs)) + .build()) + .build(); + + HttpRequest mockRequest = request("/v1/credentials").withMethod(HttpMethod.GET.name()); + mockServer + .when(mockRequest) + .respond( + response(LoadCredentialsResponseParser.toJson(refreshResponse)).withStatusCode(200)); + + Map properties = + ImmutableMap.of( + GCPProperties.GCS_OAUTH2_REFRESH_CREDENTIALS_ENDPOINT, + credentialsUri, + CatalogProperties.URI, + catalogUri); + + try (GCSFileIO fileIO = new GCSFileIO()) { + fileIO.initialize(properties); + fileIO.setCredentials(List.of(initialCredential)); + + fileIO.client(); + + // Wait for the refresh to update the in-memory credentials + Awaitility.await() + .atMost(10, TimeUnit.SECONDS) + .untilAsserted( + () -> + assertThat(fileIO.credentials().get(0).config()) + .containsEntry(GCPProperties.GCS_OAUTH2_TOKEN, "refreshedToken")); + + // Round-trip through Kryo and verify the credentials still match + try (GCSFileIO deserialized = TestHelpers.KryoHelpers.roundTripSerialize(fileIO)) { + assertThat(deserialized.credentials()).isEqualTo(fileIO.credentials()); + } + } + } } From 3a98658aef480559ee3d0d72c68d6ed27a32ecb1 Mon Sep 17 00:00:00 2001 From: gaborkaszab Date: Mon, 4 May 2026 16:37:33 +0200 Subject: [PATCH 152/197] Docs: Move catalog properties to 
catalog section (#15848) --- docs/docs/aws.md | 2 +- docs/docs/catalog-properties.md | 145 +++++++++++++++++++++++++++++++ docs/docs/configuration.md | 124 -------------------------- docs/docs/custom-catalog.md | 4 +- docs/docs/java-api-quickstart.md | 2 +- docs/docs/metrics-reporting.md | 2 +- docs/docs/spark-configuration.md | 2 +- docs/mkdocs.yml | 1 + 8 files changed, 152 insertions(+), 130 deletions(-) create mode 100644 docs/docs/catalog-properties.md diff --git a/docs/docs/aws.md b/docs/docs/aws.md index 587de402b069..fba4921f73a5 100644 --- a/docs/docs/aws.md +++ b/docs/docs/aws.md @@ -288,7 +288,7 @@ This feature requires the following lock related catalog properties: 2. Set `lock.table` as the DynamoDB table name you would like to use. If the lock table with the given name does not exist in DynamoDB, a new table is created with billing mode set as [pay-per-request](https://aws.amazon.com/blogs/aws/amazon-dynamodb-on-demand-no-capacity-planning-and-pay-per-request-pricing). Other lock related catalog properties can also be used to adjust locking behaviors such as heartbeat interval. -For more details, please refer to [Lock catalog properties](configuration.md#lock-catalog-properties). +For more details, please refer to [Lock catalog properties](catalog-properties.md#lock-catalog-properties). ## S3 FileIO diff --git a/docs/docs/catalog-properties.md b/docs/docs/catalog-properties.md new file mode 100644 index 000000000000..8609e1b1cefe --- /dev/null +++ b/docs/docs/catalog-properties.md @@ -0,0 +1,145 @@ +--- +title: "Catalog properties" +--- + + +# Catalog properties + +## Common properties + +Iceberg catalogs support using catalog properties to configure catalog behaviors. 
Here is a list of commonly used catalog properties: + +| Property | Default | Description | +| --------------------------------- | ------------------ | ------------------------------------------------------ | +| catalog-impl | null | a custom `Catalog` implementation to use by an engine | +| io-impl | null | a custom `FileIO` implementation to use in a catalog | +| warehouse | null | the root path of the data warehouse | +| uri | null | a URI string, such as Hive metastore URI | +| clients | 2 | client pool size | +| cache-enabled | true | Whether to cache catalog entries | +| cache.expiration-interval-ms | 30000 | How long catalog entries are locally cached, in milliseconds; 0 disables caching, negative values disable expiration | +| metrics-reporter-impl | org.apache.iceberg.metrics.LoggingMetricsReporter | Custom `MetricsReporter` implementation to use in a catalog. See the [Metrics reporting](metrics-reporting.md) section for additional details | +| unique-table-location | false | Whether to use a unique location for new tables | +| encryption.kms-impl | null | a custom `KeyManagementClient` implementation to use in a catalog for interactions with KMS (key management service). See the [Encryption](encryption.md) document for additional details | + +`HadoopCatalog` and `HiveCatalog` can access the properties in their constructors. +Any other custom catalog can access the properties by implementing `Catalog.initialize(catalogName, catalogProperties)`. +The properties can be manually constructed or passed in from a compute engine like Spark or Flink. +Spark uses its session properties as catalog properties, see more details in the [Spark configuration](spark-configuration.md#catalog-configuration) section. +Flink passes in catalog properties through `CREATE CATALOG` statement, see more details in the [Flink](flink.md#adding-catalogs) section. + +## REST Catalog auth properties + +The following catalog properties configure authentication for the REST catalog. 
+They support Basic, OAuth2, SigV4, and Google authentication. + +### REST auth properties + +| Property | Default | Description | +|--------------------------------------|------------------|-------------------------------------------------------------------------------------------------------------------| +| `rest.auth.type` | `none` | Authentication mechanism for REST catalog access. Supported values: `none`, `basic`, `oauth2`, `sigv4`, `google`. | +| `rest.auth.basic.username` | null | Username for Basic authentication. Required if `rest.auth.type` = `basic`. | +| `rest.auth.basic.password` | null | Password for Basic authentication. Required if `rest.auth.type` = `basic`. | +| `rest.auth.sigv4.delegate-auth-type` | `oauth2` | Auth type to delegate to after `sigv4` signing. | + +### OAuth2 auth properties +Required and optional properties to include while using `oauth2` authentication + +| Property | Default | Description | +|-------------------------|-------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `token` | null | A Bearer token to interact with the server. Either `token` or `credential` is required. | +| `credential` | null | Credential string in the form of `client_id:client_secret` to exchange for a token in the OAuth2 client credentials flow. Either `token` or `credential` is required. | +| `oauth2-server-uri` | `v1/oauth/tokens` | OAuth2 token endpoint URI. Required if the REST catalog is not the OAuth2 authentication server. | +| `token-expires-in-ms` | 3600000 (1 hour) | Time in milliseconds after which a bearer token is considered expired. Used to decide when to refresh or re-exchange a token. | +| `token-refresh-enabled` | true | Determines whether tokens are automatically refreshed when expiration details are available. 
| +| `token-exchange-enabled`| true | Determines whether to use the token exchange flow to acquire new tokens. Disabling this will allow fallback to the client credential flow. | +| `scope` | `catalog` | Additional scope for `oauth2`. | +| `audience` | null | Optional param to specify token `audience` | +| `resource` | null | Optional param to specify `resource` | + +### Google auth properties +Required and optional properties to include while using `google` authentication + +| Property | Default | Description | +|----------------------------|--------------------------------------------------|--------------------------------------------------| +| `gcp.auth.credentials-path`| Application Default Credentials (ADC) | Path to a service account JSON key file. | +| `gcp.auth.credentials-json` | Application Default Credentials (ADC) | JSON string of a service account credential. | +| `gcp.auth.scopes` | `https://www.googleapis.com/auth/cloud-platform` | Comma-separated list of OAuth scopes to request. | + +## Lock catalog properties + +Here are the catalog properties related to locking. They are used by some catalog implementations to control the locking behavior during commits. 
+ +| Property | Default | Description | +| --------------------------------- | ------------------ | ------------------------------------------------------ | +| lock-impl | null | a custom implementation of the lock manager, the actual interface depends on the catalog used | +| lock.table | null | an auxiliary table for locking, such as in [AWS DynamoDB lock manager](aws.md#dynamodb-lock-manager) | +| lock.acquire-interval-ms | 5000 (5 s) | the interval to wait between each attempt to acquire a lock | +| lock.acquire-timeout-ms | 180000 (3 min) | the maximum time to try acquiring a lock | +| lock.heartbeat-interval-ms | 3000 (3 s) | the interval to wait between each heartbeat after acquiring a lock | +| lock.heartbeat-timeout-ms | 15000 (15 s) | the maximum time without a heartbeat to consider a lock expired | + +## Hadoop configuration + +### HadoopTables Lock Configuration + +When using `HadoopTables` (tables without a catalog), lock properties from the [Lock catalog properties](#lock-catalog-properties) section can be configured by prefixing them with `iceberg.tables.hadoop.`. This ensures atomic commits on file systems like S3 that lack native write mutual exclusion. + +!!! info + To use DynamoDB as a lock manager with `HadoopTables`, set `iceberg.tables.hadoop.lock-impl` to `org.apache.iceberg.aws.dynamodb.DynamoDbLockManager` and `iceberg.tables.hadoop.lock.table` to your DynamoDB table name. See [DynamoDB Lock Manager](aws.md#dynamodb-lock-manager) for more details. + +### Hive Metastore Configuration + +The following properties from the Hadoop configuration are used by the Hive Metastore connector. +The HMS table locking is a 2-step process: + +1. Lock Creation: Create lock in HMS and queue for acquisition +2. 
Lock Check: Check if lock successfully acquired + +| Property | Default | Description | +|-------------------------------------------|-----------------|------------------------------------------------------------------------------| +| iceberg.hive.client-pool-size | 5 | The size of the Hive client pool when tracking tables in HMS | +| iceberg.hive.lock-creation-timeout-ms | 180000 (3 min) | Maximum time in milliseconds to create a lock in the HMS | +| iceberg.hive.lock-creation-min-wait-ms | 50 | Minimum time in milliseconds between retries of creating the lock in the HMS | +| iceberg.hive.lock-creation-max-wait-ms | 5000 | Maximum time in milliseconds between retries of creating the lock in the HMS | +| iceberg.hive.lock-timeout-ms | 180000 (3 min) | Maximum time in milliseconds to acquire a lock | +| iceberg.hive.lock-check-min-wait-ms | 50 | Minimum time in milliseconds between checking the acquisition of the lock | +| iceberg.hive.lock-check-max-wait-ms | 5000 | Maximum time in milliseconds between checking the acquisition of the lock | +| iceberg.hive.lock-heartbeat-interval-ms | 240000 (4 min) | The heartbeat interval for the HMS locks. | +| iceberg.hive.metadata-refresh-max-retries | 2 | Maximum number of retries when the metadata file is missing | +| iceberg.hive.table-level-lock-evict-ms | 600000 (10 min) | The timeout after which the JVM table-level lock is evicted | +| iceberg.engine.hive.lock-enabled | true | Use HMS locks to ensure atomicity of commits | + +Note: `iceberg.hive.lock-check-max-wait-ms` and `iceberg.hive.lock-heartbeat-interval-ms` should be less than the [transaction timeout](https://cwiki.apache.org/confluence/display/Hive/Configuration+Properties#ConfigurationProperties-hive.txn.timeout) +of the Hive Metastore (`hive.txn.timeout` or `metastore.txn.timeout` in the newer versions). Otherwise, the heartbeats on the lock (which happen during the lock checks) would end up expiring in the +Hive Metastore before the lock is retried from Iceberg.
+ +Warn: Setting `iceberg.engine.hive.lock-enabled`=`false` will cause HiveCatalog to commit to tables without using Hive locks. +This should only be set to `false` if all following conditions are met: + +- [HIVE-26882](https://issues.apache.org/jira/browse/HIVE-26882) +is available on the Hive Metastore server +- [HIVE-28121](https://issues.apache.org/jira/browse/HIVE-28121) +is available on the Hive Metastore server, if it is backed by MySQL or MariaDB +- All other HiveCatalogs committing to tables that this HiveCatalog commits to are also on Iceberg 1.3 or later +- All other HiveCatalogs committing to tables that this HiveCatalog commits to have also disabled Hive locks on commit. + +**Failing to ensure these conditions risks corrupting the table.** + +Even with `iceberg.engine.hive.lock-enabled` set to `false`, a HiveCatalog can still use locks for individual tables by setting the table property `engine.hive.lock-enabled`=`true`. +This is useful in the case where other HiveCatalogs cannot be upgraded and set to commit without using Hive locks. diff --git a/docs/docs/configuration.md b/docs/docs/configuration.md index c1bdc80d11bc..88d9872cc683 100644 --- a/docs/docs/configuration.md +++ b/docs/docs/configuration.md @@ -141,127 +141,3 @@ Informational properties can be set to provide additional context about a table. | Property | Default | Description | | --------------------------------------------- | -------- | ------------------------------------------------------------- | | compatibility.snapshot-id-inheritance.enabled | false | Enables committing snapshots without explicit snapshot IDs (always true if the format version is > 1) | - -## Catalog properties - -Iceberg catalogs support using catalog properties to configure catalog behaviors. 
Here is a list of commonly used catalog properties: - -| Property | Default | Description | -| --------------------------------- | ------------------ | ------------------------------------------------------ | -| catalog-impl | null | a custom `Catalog` implementation to use by an engine | -| io-impl | null | a custom `FileIO` implementation to use in a catalog | -| warehouse | null | the root path of the data warehouse | -| uri | null | a URI string, such as Hive metastore URI | -| clients | 2 | client pool size | -| cache-enabled | true | Whether to cache catalog entries | -| cache.expiration-interval-ms | 30000 | How long catalog entries are locally cached, in milliseconds; 0 disables caching, negative values disable expiration | -| metrics-reporter-impl | org.apache.iceberg.metrics.LoggingMetricsReporter | Custom `MetricsReporter` implementation to use in a catalog. See the [Metrics reporting](metrics-reporting.md) section for additional details | -| unique-table-location | false | Whether to use a unique location for new tables | -| encryption.kms-impl | null | a custom `KeyManagementClient` implementation to use in a catalog for interactions with KMS (key management service). See the [Encryption](encryption.md) document for additional details | - -`HadoopCatalog` and `HiveCatalog` can access the properties in their constructors. -Any other custom catalog can access the properties by implementing `Catalog.initialize(catalogName, catalogProperties)`. -The properties can be manually constructed or passed in from a compute engine like Spark or Flink. -Spark uses its session properties as catalog properties, see more details in the [Spark configuration](spark-configuration.md#catalog-configuration) section. -Flink passes in catalog properties through `CREATE CATALOG` statement, see more details in the [Flink](flink.md#adding-catalogs) section. - -### REST Catalog auth properties - -The following catalog properties configure authentication for the REST catalog. 
-They support Basic, OAuth2, SigV4, and Google authentication. - -#### REST auth properties - -| Property | Default | Description | -|--------------------------------------|------------------|-------------------------------------------------------------------------------------------------------------------| -| `rest.auth.type` | `none` | Authentication mechanism for REST catalog access. Supported values: `none`, `basic`, `oauth2`, `sigv4`, `google`. | -| `rest.auth.basic.username` | null | Username for Basic authentication. Required if `rest.auth.type` = `basic`. | -| `rest.auth.basic.password` | null | Password for Basic authentication. Required if `rest.auth.type` = `basic`. | -| `rest.auth.sigv4.delegate-auth-type` | `oauth2` | Auth type to delegate to after `sigv4` signing. | - -#### OAuth2 auth properties -Required and optional properties to include while using `oauth2` authentication - -| Property | Default | Description | -|-------------------------|-------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `token` | null | A Bearer token to interact with the server. Either `token` or `credential` is required. | -| `credential` | null | Credential string in the form of `client_id:client_secret` to exchange for a token in the OAuth2 client credentials flow. Either `token` or `credential` is required. | -| `oauth2-server-uri` | `v1/oauth/tokens` | OAuth2 token endpoint URI. Required if the REST catalog is not the OAuth2 authentication server. | -| `token-expires-in-ms` | 3600000 (1 hour) | Time in milliseconds after which a bearer token is considered expired. Used to decide when to refresh or re-exchange a token. | -| `token-refresh-enabled` | true | Determines whether tokens are automatically refreshed when expiration details are available. 
| -| `token-exchange-enabled`| true | Determines whether to use the token exchange flow to acquire new tokens. Disabling this will allow fallback to the client credential flow. | -| `scope` | `catalog` | Additional scope for `oauth2`. | -| `audience` | null | Optional param to specify token `audience` | -| `resource` | null | Optional param to specify `resource` | - -#### Google auth properties -Required and optional properties to include while using `google` authentication - -| Property | Default | Description | -|----------------------------|--------------------------------------------------|--------------------------------------------------| -| `gcp.auth.credentials-path`| Application Default Credentials (ADC) | Path to a service account JSON key file. | -| `gcp.auth.credentials-json` | Application Default Credentials (ADC) | JSON string of a service account credential. | -| `gcp.auth.scopes` | `https://www.googleapis.com/auth/cloud-platform` | Comma-separated list of OAuth scopes to request. | - -### Lock catalog properties - -Here are the catalog properties related to locking. They are used by some catalog implementations to control the locking behavior during commits. 
- -| Property | Default | Description | -| --------------------------------- | ------------------ | ------------------------------------------------------ | -| lock-impl | null | a custom implementation of the lock manager, the actual interface depends on the catalog used | -| lock.table | null | an auxiliary table for locking, such as in [AWS DynamoDB lock manager](aws.md#dynamodb-lock-manager) | -| lock.acquire-interval-ms | 5000 (5 s) | the interval to wait between each attempt to acquire a lock | -| lock.acquire-timeout-ms | 180000 (3 min) | the maximum time to try acquiring a lock | -| lock.heartbeat-interval-ms | 3000 (3 s) | the interval to wait between each heartbeat after acquiring a lock | -| lock.heartbeat-timeout-ms | 15000 (15 s) | the maximum time without a heartbeat to consider a lock expired | - -## Hadoop configuration - -### HadoopTables Lock Configuration - -When using `HadoopTables` (tables without a catalog), lock properties from the [Lock catalog properties](#lock-catalog-properties) section can be configured by prefixing them with `iceberg.tables.hadoop.`. This ensures atomic commits on file systems like S3 that lack native write mutual exclusion. - -!!! info - To use DynamoDB as a lock manager with `HadoopTables`, set `iceberg.tables.hadoop.lock-impl` to `org.apache.iceberg.aws.dynamodb.DynamoDbLockManager` and `iceberg.tables.hadoop.lock.table` to your DynamoDB table name. See [DynamoDB Lock Manager](aws.md#dynamodb-lock-manager) for more details. - -### Hive Metastore Configuration - -The following properties from the Hadoop configuration are used by the Hive Metastore connector. -The HMS table locking is a 2-step process: - -1. Lock Creation: Create lock in HMS and queue for acquisition -2. 
Lock Check: Check if lock successfully acquired - -| Property | Default | Description | -|-------------------------------------------|-----------------|------------------------------------------------------------------------------| -| iceberg.hive.client-pool-size | 5 | The size of the Hive client pool when tracking tables in HMS | -| iceberg.hive.lock-creation-timeout-ms | 180000 (3 min) | Maximum time in milliseconds to create a lock in the HMS | -| iceberg.hive.lock-creation-min-wait-ms | 50 | Minimum time in milliseconds between retries of creating the lock in the HMS | -| iceberg.hive.lock-creation-max-wait-ms | 5000 | Maximum time in milliseconds between retries of creating the lock in the HMS | -| iceberg.hive.lock-timeout-ms | 180000 (3 min) | Maximum time in milliseconds to acquire a lock | -| iceberg.hive.lock-check-min-wait-ms | 50 | Minimum time in milliseconds between checking the acquisition of the lock | -| iceberg.hive.lock-check-max-wait-ms | 5000 | Maximum time in milliseconds between checking the acquisition of the lock | -| iceberg.hive.lock-heartbeat-interval-ms | 240000 (4 min) | The heartbeat interval for the HMS locks. | -| iceberg.hive.metadata-refresh-max-retries | 2 | Maximum number of retries when the metadata file is missing | -| iceberg.hive.table-level-lock-evict-ms | 600000 (10 min) | The timeout for the JVM table lock is | -| iceberg.engine.hive.lock-enabled | true | Use HMS locks to ensure atomicity of commits | - -Note: `iceberg.hive.lock-check-max-wait-ms` and `iceberg.hive.lock-heartbeat-interval-ms` should be less than the [transaction timeout](https://cwiki.apache.org/confluence/display/Hive/Configuration+Properties#ConfigurationProperties-hive.txn.timeout) -of the Hive Metastore (`hive.txn.timeout` or `metastore.txn.timeout` in the newer versions). Otherwise, the heartbeats on the lock (which happens during the lock checks) would end up expiring in the -Hive Metastore before the lock is retried from Iceberg. 
- -Warn: Setting `iceberg.engine.hive.lock-enabled`=`false` will cause HiveCatalog to commit to tables without using Hive locks. -This should only be set to `false` if all following conditions are met: - -- [HIVE-26882](https://issues.apache.org/jira/browse/HIVE-26882) -is available on the Hive Metastore server -- [HIVE-28121](https://issues.apache.org/jira/browse/HIVE-28121) -is available on the Hive Metastore server, if it is backed by MySQL or MariaDB -- All other HiveCatalogs committing to tables that this HiveCatalog commits to are also on Iceberg 1.3 or later -- All other HiveCatalogs committing to tables that this HiveCatalog commits to have also disabled Hive locks on commit. - -**Failing to ensure these conditions risks corrupting the table.** - -Even with `iceberg.engine.hive.lock-enabled` set to `false`, a HiveCatalog can still use locks for individual tables by setting the table property `engine.hive.lock-enabled`=`true`. -This is useful in the case where other HiveCatalogs cannot be upgraded and set to commit without using Hive locks. diff --git a/docs/docs/custom-catalog.md b/docs/docs/custom-catalog.md index f0a6b5718a6c..d30a629401aa 100644 --- a/docs/docs/custom-catalog.md +++ b/docs/docs/custom-catalog.md @@ -151,7 +151,7 @@ public class CustomCatalog extends BaseMetastoreCatalog { Catalog implementations can be dynamically loaded in most compute engines. For Spark and Flink, you can specify the `catalog-impl` catalog property to load it. -Read the [Configuration](configuration.md#catalog-properties) section for more details. +Read the [Configuration](catalog-properties.md) section for more details. For MapReduce, implement `org.apache.iceberg.mr.CatalogLoader` and set Hadoop property `iceberg.mr.catalog.loader.class` to load it. If your catalog must read Hadoop configuration to access certain environment properties, make your catalog implement `org.apache.hadoop.conf.Configurable`. 
@@ -199,7 +199,7 @@ public class CustomFileIO implements FileIO { If you are already implementing your own catalog, you can implement `TableOperations.io()` to use your custom `FileIO`. In addition, custom `FileIO` implementations can also be dynamically loaded in `HadoopCatalog` and `HiveCatalog` by specifying the `io-impl` catalog property. -Read the [Configuration](configuration.md#catalog-properties) section for more details. +Read the [Configuration](catalog-properties.md) section for more details. If your `FileIO` must read Hadoop configuration to access certain environment properties, make your `FileIO` implement `org.apache.hadoop.conf.Configurable`. ### Custom location provider implementation diff --git a/docs/docs/java-api-quickstart.md b/docs/docs/java-api-quickstart.md index 31f97bd441c8..74b40feed86d 100644 --- a/docs/docs/java-api-quickstart.md +++ b/docs/docs/java-api-quickstart.md @@ -28,7 +28,7 @@ Tables are created using either a [`Catalog`](../../javadoc/{{ icebergVersion }} The Hive catalog connects to a Hive metastore to keep track of Iceberg tables. You can initialize a Hive catalog with a name and some properties. -(see: [Catalog properties](configuration.md#catalog-properties)) +(see: [Catalog properties](catalog-properties.md)) ```java import java.util.HashMap; diff --git a/docs/docs/metrics-reporting.md b/docs/docs/metrics-reporting.md index e019e2761fe6..4ca452b0d503 100644 --- a/docs/docs/metrics-reporting.md +++ b/docs/docs/metrics-reporting.md @@ -145,7 +145,7 @@ public class InMemoryMetricsReporter implements MetricsReporter { ### Via Catalog Configuration -The [catalog property](configuration.md#catalog-properties) `metrics-reporter-impl` allows registering a given [`MetricsReporter`](https://github.com/apache/iceberg/blob/main/api/src/main/java/org/apache/iceberg/metrics/MetricsReporter.java) by specifying its fully-qualified class name, e.g. `metrics-reporter-impl=org.apache.iceberg.metrics.InMemoryMetricsReporter`. 
+The [catalog property](catalog-properties.md) `metrics-reporter-impl` allows registering a given [`MetricsReporter`](https://github.com/apache/iceberg/blob/main/api/src/main/java/org/apache/iceberg/metrics/MetricsReporter.java) by specifying its fully-qualified class name, e.g. `metrics-reporter-impl=org.apache.iceberg.metrics.InMemoryMetricsReporter`. ### Via the Java API during Scan planning diff --git a/docs/docs/spark-configuration.md b/docs/docs/spark-configuration.md index e8e4f7e3c8c1..2c15c3bbd7a5 100644 --- a/docs/docs/spark-configuration.md +++ b/docs/docs/spark-configuration.md @@ -80,7 +80,7 @@ Both catalogs are configured using properties nested under the catalog name. Com | spark.sql.catalog._catalog-name_.view-override._propertyKey_ | | Enforced Iceberg view property value for property key _propertyKey_, which cannot be overridden on view creation by user | | spark.sql.catalog._catalog-name_.use-nullable-query-schema | `true` or `false` | Whether to preserve fields' nullability when creating the table using CTAS and RTAS. If set to `true`, all fields will be marked as nullable. If set to `false`, fields' nullability will be preserved. The default value is `true`. Available in Spark 3.5 and above. | -Additional properties can be found in common [catalog configuration](configuration.md#catalog-properties). +Additional properties can be found in common [catalog configuration](catalog-properties.md). 
### Using catalogs diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index c1807a6b8542..8e31aba5c98b 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -65,6 +65,7 @@ nav: - Hive Migration: hive-migration.md - Delta Lake Migration: delta-lake-migration.md - Catalogs: + - Catalog properties: catalog-properties.md - AWS Glue: aws/#glue-catalog - AWS DynamoDB: aws/#dynamodb-catalog - HadoopCatalog: https://iceberg.apache.org/javadoc/nightly/org/apache/iceberg/hadoop/HadoopCatalog.html From 54868576cd7458fe7f29ec5e1395c0b9b2e936c5 Mon Sep 17 00:00:00 2001 From: gaborkaszab Date: Mon, 4 May 2026 17:51:24 +0200 Subject: [PATCH 153/197] Docs: Document general REST catalog properties (#15871) --- docs/docs/catalog-properties.md | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/docs/docs/catalog-properties.md b/docs/docs/catalog-properties.md index 8609e1b1cefe..5afae0b98ae2 100644 --- a/docs/docs/catalog-properties.md +++ b/docs/docs/catalog-properties.md @@ -43,12 +43,34 @@ The properties can be manually constructed or passed in from a compute engine li Spark uses its session properties as catalog properties, see more details in the [Spark configuration](spark-configuration.md#catalog-configuration) section. Flink passes in catalog properties through `CREATE CATALOG` statement, see more details in the [Flink](flink.md#adding-catalogs) section. -## REST Catalog auth properties +## REST catalog properties + +The following properties configure the behavior of the REST catalog client. + +| Property | Default | Description | +|---------------------------------------|-------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `snapshot-loading-mode` | `ALL` | Controls how snapshots are loaded from the REST server. 
Supported values: `ALL` (load all snapshots), `REFS` (load only referenced snapshots). | +| `rest-metrics-reporting-enabled` | `true` | Whether to enable metrics reporting to the REST server. | +| `view-endpoints-supported` | `false` | For backwards compatibility with older REST servers. Set to `true` if the server supports view endpoints but doesn't send the `endpoints` field in the ConfigResponse. | +| `rest-page-size` | null | The page size to use when listing namespaces, tables, or other paginated resources. | +| `namespace-separator` | `%1F` | The separator character used for namespace levels when communicating with the REST server. | +| `scan-planning-mode` | `CLIENT` | Controls where scan planning is performed. Supported values: `CLIENT` (client-side planning), `SERVER` (server-side planning). Can be overridden per-table by the server in LoadTableResponse. | + +### Table cache properties + +The following properties configure the table cache used for freshness-aware table loading. Note that this cache is separate from the general catalog-level cache. + +| Property | Default | Description | +|------------------------------------------|-------------------|----------------------------------------------------------------------------------------| +| `rest-table-cache.expire-after-write-ms` | `300000` (5 min) | Time in milliseconds after which cached table entries expire. | +| `rest-table-cache.max-entries` | `100` | Maximum number of table entries to cache. | + +### Auth properties The following catalog properties configure authentication for the REST catalog. They support Basic, OAuth2, SigV4, and Google authentication.
-### REST auth properties +#### REST auth properties | Property | Default | Description | |--------------------------------------|------------------|-------------------------------------------------------------------------------------------------------------------| @@ -57,7 +79,7 @@ They support Basic, OAuth2, SigV4, and Google authentication. | `rest.auth.basic.password` | null | Password for Basic authentication. Required if `rest.auth.type` = `basic`. | | `rest.auth.sigv4.delegate-auth-type` | `oauth2` | Auth type to delegate to after `sigv4` signing. | -### OAuth2 auth properties +#### OAuth2 auth properties Required and optional properties to include while using `oauth2` authentication | Property | Default | Description | @@ -72,7 +94,7 @@ Required and optional properties to include while using `oauth2` authentication | `audience` | null | Optional param to specify token `audience` | | `resource` | null | Optional param to specify `resource` | -### Google auth properties +#### Google auth properties Required and optional properties to include while using `google` authentication | Property | Default | Description | From 2d54125734ddc9b9fb87db147ff255918108fa2c Mon Sep 17 00:00:00 2001 From: milleniax <43858877+milleniax@users.noreply.github.com> Date: Mon, 4 May 2026 19:08:17 +0300 Subject: [PATCH 154/197] Spark: Support TimestampNTZ in SparkZOrderUDF (#15778) Co-authored-by: abdullin.marsel9 --- .../iceberg/spark/actions/SparkZOrderUDF.java | 28 +++++++++++++++++++ .../actions/TestRewriteDataFilesAction.java | 19 +++++++++++++ .../iceberg/spark/actions/SparkZOrderUDF.java | 28 +++++++++++++++++++ .../actions/TestRewriteDataFilesAction.java | 19 +++++++++++++ .../iceberg/spark/actions/SparkZOrderUDF.java | 28 +++++++++++++++++++ .../actions/TestRewriteDataFilesAction.java | 17 +++++++++++ .../iceberg/spark/actions/SparkZOrderUDF.java | 28 +++++++++++++++++++ .../actions/TestRewriteDataFilesAction.java | 17 +++++++++++ 8 files changed, 184 insertions(+) diff 
--git a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/actions/SparkZOrderUDF.java b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/actions/SparkZOrderUDF.java index db359fdd62fc..bf80dcb10b30 100644 --- a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/actions/SparkZOrderUDF.java +++ b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/actions/SparkZOrderUDF.java @@ -24,6 +24,8 @@ import java.nio.ByteBuffer; import java.nio.charset.CharsetEncoder; import java.nio.charset.StandardCharsets; +import java.time.LocalDateTime; +import org.apache.iceberg.util.DateTimeUtil; import org.apache.iceberg.util.ZOrderByteUtils; import org.apache.spark.sql.Column; import org.apache.spark.sql.expressions.UserDefinedFunction; @@ -40,6 +42,7 @@ import org.apache.spark.sql.types.LongType; import org.apache.spark.sql.types.ShortType; import org.apache.spark.sql.types.StringType; +import org.apache.spark.sql.types.TimestampNTZType; import org.apache.spark.sql.types.TimestampType; import scala.collection.JavaConverters; import scala.collection.Seq; @@ -180,6 +183,29 @@ value, inputBuffer(position, ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE)) return udf; } + private UserDefinedFunction timestampNtzToOrderedBytesUDF() { + int position = inputCol; + UserDefinedFunction udf = + functions + .udf( + (LocalDateTime value) -> { + if (value == null) { + return PRIMITIVE_EMPTY; + } + long micros = DateTimeUtil.microsFromTimestamp(value); + return ZOrderByteUtils.longToOrderedBytes( + micros, inputBuffer(position, ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE)) + .array(); + }, + DataTypes.BinaryType) + .withName("TIMESTAMP_NTZ_ORDERED_BYTES"); + + this.inputCol++; + increaseOutputSize(ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE); + + return udf; + } + private UserDefinedFunction floatToOrderedBytesUDF() { int position = inputCol; UserDefinedFunction udf = @@ -309,6 +335,8 @@ Column sortedLexicographically(Column column, DataType type) { return booleanToOrderedBytesUDF().apply(column); 
} else if (type instanceof TimestampType) { return longToOrderedBytesUDF().apply(column.cast(DataTypes.LongType)); + } else if (type instanceof TimestampNTZType) { + return timestampNtzToOrderedBytesUDF().apply(column); } else if (type instanceof DateType) { return longToOrderedBytesUDF().apply(column.cast(DataTypes.LongType)); } else { diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java index 411b7e78116f..6abce5b24da0 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java @@ -22,6 +22,7 @@ import static org.apache.iceberg.data.FileHelpers.encrypt; import static org.apache.iceberg.types.Types.NestedField.optional; import static org.apache.iceberg.types.Types.NestedField.required; +import static org.apache.spark.sql.functions.col; import static org.apache.spark.sql.functions.current_date; import static org.apache.spark.sql.functions.date_add; import static org.apache.spark.sql.functions.expr; @@ -127,6 +128,7 @@ import org.apache.spark.sql.Row; import org.apache.spark.sql.catalyst.analysis.NoSuchTableException; import org.apache.spark.sql.internal.SQLConf; +import org.apache.spark.sql.types.DataTypes; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.TestTemplate; @@ -2573,6 +2575,23 @@ public void testExecutorCacheForDeleteFilesDisabled() { .isFalse(); } + @TestTemplate + public void testZOrderUDFWithTimestampNTZType() { + SparkZOrderUDF zorderUDF = new SparkZOrderUDF(1, 16, 1024); + Dataset result = + spark + .sql("SELECT timestamp_ntz '2025-01-01 12:00:00' as test_col") + .withColumn( + "zorder_result", + zorderUDF.sortedLexicographically(col("test_col"), DataTypes.TimestampNTZType)); + + 
assertThat(result.schema().apply("zorder_result").dataType()).isEqualTo(DataTypes.BinaryType); + List rows = result.collectAsList(); + Row row = rows.get(0); + byte[] zorderBytes = row.getAs("zorder_result"); + assertThat(zorderBytes).isNotNull().isNotEmpty(); + } + private double percentFilesRequired(Table table, String col, String value) { return percentFilesRequired(table, new String[] {col}, new String[] {value}); } diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkZOrderUDF.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkZOrderUDF.java index db359fdd62fc..bf80dcb10b30 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkZOrderUDF.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/SparkZOrderUDF.java @@ -24,6 +24,8 @@ import java.nio.ByteBuffer; import java.nio.charset.CharsetEncoder; import java.nio.charset.StandardCharsets; +import java.time.LocalDateTime; +import org.apache.iceberg.util.DateTimeUtil; import org.apache.iceberg.util.ZOrderByteUtils; import org.apache.spark.sql.Column; import org.apache.spark.sql.expressions.UserDefinedFunction; @@ -40,6 +42,7 @@ import org.apache.spark.sql.types.LongType; import org.apache.spark.sql.types.ShortType; import org.apache.spark.sql.types.StringType; +import org.apache.spark.sql.types.TimestampNTZType; import org.apache.spark.sql.types.TimestampType; import scala.collection.JavaConverters; import scala.collection.Seq; @@ -180,6 +183,29 @@ value, inputBuffer(position, ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE)) return udf; } + private UserDefinedFunction timestampNtzToOrderedBytesUDF() { + int position = inputCol; + UserDefinedFunction udf = + functions + .udf( + (LocalDateTime value) -> { + if (value == null) { + return PRIMITIVE_EMPTY; + } + long micros = DateTimeUtil.microsFromTimestamp(value); + return ZOrderByteUtils.longToOrderedBytes( + micros, inputBuffer(position, 
ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE)) + .array(); + }, + DataTypes.BinaryType) + .withName("TIMESTAMP_NTZ_ORDERED_BYTES"); + + this.inputCol++; + increaseOutputSize(ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE); + + return udf; + } + private UserDefinedFunction floatToOrderedBytesUDF() { int position = inputCol; UserDefinedFunction udf = @@ -309,6 +335,8 @@ Column sortedLexicographically(Column column, DataType type) { return booleanToOrderedBytesUDF().apply(column); } else if (type instanceof TimestampType) { return longToOrderedBytesUDF().apply(column.cast(DataTypes.LongType)); + } else if (type instanceof TimestampNTZType) { + return timestampNtzToOrderedBytesUDF().apply(column); } else if (type instanceof DateType) { return longToOrderedBytesUDF().apply(column.cast(DataTypes.LongType)); } else { diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java index bcaa40d13c8a..d74d8a29f994 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java @@ -22,6 +22,7 @@ import static org.apache.iceberg.data.FileHelpers.encrypt; import static org.apache.iceberg.types.Types.NestedField.optional; import static org.apache.iceberg.types.Types.NestedField.required; +import static org.apache.spark.sql.functions.col; import static org.apache.spark.sql.functions.current_date; import static org.apache.spark.sql.functions.date_add; import static org.apache.spark.sql.functions.expr; @@ -127,6 +128,7 @@ import org.apache.spark.sql.Row; import org.apache.spark.sql.catalyst.analysis.NoSuchTableException; import org.apache.spark.sql.internal.SQLConf; +import org.apache.spark.sql.types.DataTypes; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeEach; import 
org.junit.jupiter.api.TestTemplate; @@ -2607,6 +2609,23 @@ public void testExecutorCacheForDeleteFilesDisabled() { .isFalse(); } + @TestTemplate + public void testZOrderUDFWithTimestampNTZType() { + SparkZOrderUDF zorderUDF = new SparkZOrderUDF(1, 16, 1024); + Dataset result = + spark + .sql("SELECT timestamp_ntz '2025-01-01 12:00:00' as test_col") + .withColumn( + "zorder_result", + zorderUDF.sortedLexicographically(col("test_col"), DataTypes.TimestampNTZType)); + + assertThat(result.schema().apply("zorder_result").dataType()).isEqualTo(DataTypes.BinaryType); + List rows = result.collectAsList(); + Row row = rows.get(0); + byte[] zorderBytes = row.getAs("zorder_result"); + assertThat(zorderBytes).isNotNull().isNotEmpty(); + } + private double percentFilesRequired(Table table, String col, String value) { return percentFilesRequired(table, new String[] {col}, new String[] {value}); } diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/actions/SparkZOrderUDF.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/actions/SparkZOrderUDF.java index d142e3fd1aee..cf9cc8fd511a 100644 --- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/actions/SparkZOrderUDF.java +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/actions/SparkZOrderUDF.java @@ -24,6 +24,8 @@ import java.nio.ByteBuffer; import java.nio.charset.CharsetEncoder; import java.nio.charset.StandardCharsets; +import java.time.LocalDateTime; +import org.apache.iceberg.util.DateTimeUtil; import org.apache.iceberg.util.ZOrderByteUtils; import org.apache.spark.sql.Column; import org.apache.spark.sql.expressions.UserDefinedFunction; @@ -40,6 +42,7 @@ import org.apache.spark.sql.types.LongType; import org.apache.spark.sql.types.ShortType; import org.apache.spark.sql.types.StringType; +import org.apache.spark.sql.types.TimestampNTZType; import org.apache.spark.sql.types.TimestampType; import scala.collection.JavaConverters; import scala.collection.Seq; @@ -180,6 +183,29 @@ 
value, inputBuffer(position, ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE)) return udf; } + private UserDefinedFunction timestampNtzToOrderedBytesUDF() { + int position = inputCol; + UserDefinedFunction udf = + functions + .udf( + (LocalDateTime value) -> { + if (value == null) { + return PRIMITIVE_EMPTY; + } + long micros = DateTimeUtil.microsFromTimestamp(value); + return ZOrderByteUtils.longToOrderedBytes( + micros, inputBuffer(position, ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE)) + .array(); + }, + DataTypes.BinaryType) + .withName("TIMESTAMP_NTZ_ORDERED_BYTES"); + + this.inputCol++; + increaseOutputSize(ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE); + + return udf; + } + private UserDefinedFunction floatToOrderedBytesUDF() { int position = inputCol; UserDefinedFunction udf = @@ -309,6 +335,8 @@ Column sortedLexicographically(Column column, DataType type) { return booleanToOrderedBytesUDF().apply(column); } else if (type instanceof TimestampType) { return longToOrderedBytesUDF().apply(column.cast(DataTypes.LongType)); + } else if (type instanceof TimestampNTZType) { + return timestampNtzToOrderedBytesUDF().apply(column); } else if (type instanceof DateType) { return longToOrderedBytesUDF().apply(functions.unix_date(column).cast(DataTypes.LongType)); } else { diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java index b18f20c44427..38ddefd26a45 100644 --- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java @@ -2129,6 +2129,23 @@ public void testZOrderUDFWithDateType() { assertThat(zorderBytes).isNotNull().isNotEmpty(); } + @TestTemplate + public void testZOrderUDFWithTimestampNTZType() { + SparkZOrderUDF zorderUDF = new SparkZOrderUDF(1, 16, 1024); + Dataset result = + spark + 
.sql("SELECT timestamp_ntz '2025-01-01 12:00:00' as test_col") + .withColumn( + "zorder_result", + zorderUDF.sortedLexicographically(col("test_col"), DataTypes.TimestampNTZType)); + + assertThat(result.schema().apply("zorder_result").dataType()).isEqualTo(DataTypes.BinaryType); + List rows = result.collectAsList(); + Row row = rows.get(0); + byte[] zorderBytes = row.getAs("zorder_result"); + assertThat(zorderBytes).isNotNull().isNotEmpty(); + } + protected void shouldRewriteDataFilesWithPartitionSpec(Table table, int outputSpecId) { List rewrittenFiles = currentDataFiles(table); assertThat(rewrittenFiles).allMatch(file -> file.specId() == outputSpecId); diff --git a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/actions/SparkZOrderUDF.java b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/actions/SparkZOrderUDF.java index d142e3fd1aee..cf9cc8fd511a 100644 --- a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/actions/SparkZOrderUDF.java +++ b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/actions/SparkZOrderUDF.java @@ -24,6 +24,8 @@ import java.nio.ByteBuffer; import java.nio.charset.CharsetEncoder; import java.nio.charset.StandardCharsets; +import java.time.LocalDateTime; +import org.apache.iceberg.util.DateTimeUtil; import org.apache.iceberg.util.ZOrderByteUtils; import org.apache.spark.sql.Column; import org.apache.spark.sql.expressions.UserDefinedFunction; @@ -40,6 +42,7 @@ import org.apache.spark.sql.types.LongType; import org.apache.spark.sql.types.ShortType; import org.apache.spark.sql.types.StringType; +import org.apache.spark.sql.types.TimestampNTZType; import org.apache.spark.sql.types.TimestampType; import scala.collection.JavaConverters; import scala.collection.Seq; @@ -180,6 +183,29 @@ value, inputBuffer(position, ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE)) return udf; } + private UserDefinedFunction timestampNtzToOrderedBytesUDF() { + int position = inputCol; + UserDefinedFunction udf = + functions + .udf( + 
(LocalDateTime value) -> { + if (value == null) { + return PRIMITIVE_EMPTY; + } + long micros = DateTimeUtil.microsFromTimestamp(value); + return ZOrderByteUtils.longToOrderedBytes( + micros, inputBuffer(position, ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE)) + .array(); + }, + DataTypes.BinaryType) + .withName("TIMESTAMP_NTZ_ORDERED_BYTES"); + + this.inputCol++; + increaseOutputSize(ZOrderByteUtils.PRIMITIVE_BUFFER_SIZE); + + return udf; + } + private UserDefinedFunction floatToOrderedBytesUDF() { int position = inputCol; UserDefinedFunction udf = @@ -309,6 +335,8 @@ Column sortedLexicographically(Column column, DataType type) { return booleanToOrderedBytesUDF().apply(column); } else if (type instanceof TimestampType) { return longToOrderedBytesUDF().apply(column.cast(DataTypes.LongType)); + } else if (type instanceof TimestampNTZType) { + return timestampNtzToOrderedBytesUDF().apply(column); } else if (type instanceof DateType) { return longToOrderedBytesUDF().apply(functions.unix_date(column).cast(DataTypes.LongType)); } else { diff --git a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java index 9524b0e7167d..110e43ede1f9 100644 --- a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java +++ b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteDataFilesAction.java @@ -2129,6 +2129,23 @@ public void testZOrderUDFWithDateType() { assertThat(zorderBytes).isNotNull().isNotEmpty(); } + @TestTemplate + public void testZOrderUDFWithTimestampNTZType() { + SparkZOrderUDF zorderUDF = new SparkZOrderUDF(1, 16, 1024); + Dataset result = + spark + .sql("SELECT timestamp_ntz '2025-01-01 12:00:00' as test_col") + .withColumn( + "zorder_result", + zorderUDF.sortedLexicographically(col("test_col"), DataTypes.TimestampNTZType)); + + 
assertThat(result.schema().apply("zorder_result").dataType()).isEqualTo(DataTypes.BinaryType); + List rows = result.collectAsList(); + Row row = rows.get(0); + byte[] zorderBytes = row.getAs("zorder_result"); + assertThat(zorderBytes).isNotNull().isNotEmpty(); + } + protected void shouldRewriteDataFilesWithPartitionSpec(Table table, int outputSpecId) { List rewrittenFiles = currentDataFiles(table); assertThat(rewrittenFiles).allMatch(file -> file.specId() == outputSpecId); From 7830efec3635893e39255823590b2422fde9171c Mon Sep 17 00:00:00 2001 From: Kurtis Wright Date: Tue, 5 May 2026 06:04:34 -0700 Subject: [PATCH 155/197] Spark: Add unknown type support to Spark 3.4 and 3.5 (#16066) * Add unknown type support to Spark 3.4 and 3.5 Map Iceberg's UnknownType to Spark's NullType in both directions: - TypeToSparkType: UNKNOWN -> NullType (Iceberg to Spark) - SparkTypeToType: NullType -> UnknownType (Spark to Iceberg) This aligns Spark 3.x with the existing Spark 4.x behavior and allows reading v3 tables with unknown-typed columns without throwing UnsupportedOperationException. Spark has supported NullType since 2.x. 
--- .../spark/PruneColumnsWithoutReordering.java | 2 + .../apache/iceberg/spark/SparkTypeToType.java | 3 + .../apache/iceberg/spark/TypeToSparkType.java | 5 +- .../data/ParquetWithSparkSchemaVisitor.java | 31 ++++++---- .../iceberg/spark/data/SparkOrcWriter.java | 21 +++++-- .../spark/data/SparkParquetWriters.java | 38 ++++++++++--- .../vectorized/VectorizedSparkOrcReaders.java | 2 + .../iceberg/spark/TestSparkSchemaUtil.java | 16 ++++++ .../iceberg/spark/data/AvroDataTestBase.java | 56 +++++++++++++++++-- .../spark/data/TestSparkOrcReader.java | 17 ++++++ .../spark/data/TestSparkParquetReader.java | 16 ++++++ .../data/TestSparkRecordOrcReaderWriter.java | 17 ++++++ .../iceberg/spark/source/ScanTestBase.java | 7 +-- .../spark/source/TestORCDataFrameWrite.java | 24 ++++++++ .../source/TestParquetDataFrameWrite.java | 24 ++++++++ .../iceberg/spark/source/TestParquetScan.java | 18 ++++++ .../spark/PruneColumnsWithoutReordering.java | 2 + .../apache/iceberg/spark/SparkTypeToType.java | 3 + .../apache/iceberg/spark/TypeToSparkType.java | 5 +- .../data/ParquetWithSparkSchemaVisitor.java | 31 ++++++---- .../iceberg/spark/data/SparkOrcWriter.java | 21 +++++-- .../spark/data/SparkParquetWriters.java | 39 ++++++++++--- .../iceberg/spark/TestSparkSchemaUtil.java | 16 ++++++ .../iceberg/spark/data/AvroDataTestBase.java | 56 +++++++++++++++++-- .../spark/data/TestSparkOrcReader.java | 17 ++++++ .../spark/data/TestSparkParquetReader.java | 16 ++++++ .../data/TestSparkRecordOrcReaderWriter.java | 17 ++++++ .../iceberg/spark/source/ScanTestBase.java | 6 +- .../spark/source/TestORCDataFrameWrite.java | 24 ++++++++ .../source/TestParquetDataFrameWrite.java | 24 ++++++++ .../iceberg/spark/source/TestParquetScan.java | 18 ++++++ 31 files changed, 524 insertions(+), 68 deletions(-) diff --git a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/PruneColumnsWithoutReordering.java b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/PruneColumnsWithoutReordering.java index 
fbd21f737450..fec413ca079a 100644 --- a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/PruneColumnsWithoutReordering.java +++ b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/PruneColumnsWithoutReordering.java @@ -41,6 +41,7 @@ import org.apache.spark.sql.types.IntegerType$; import org.apache.spark.sql.types.LongType$; import org.apache.spark.sql.types.MapType; +import org.apache.spark.sql.types.NullType$; import org.apache.spark.sql.types.StringType$; import org.apache.spark.sql.types.StructField; import org.apache.spark.sql.types.StructType; @@ -238,5 +239,6 @@ public Type primitive(Type.PrimitiveType primitive) { .put(TypeID.STRING, ImmutableSet.of(StringType$.class)) .put(TypeID.FIXED, ImmutableSet.of(BinaryType$.class)) .put(TypeID.BINARY, ImmutableSet.of(BinaryType$.class)) + .put(TypeID.UNKNOWN, ImmutableSet.of(NullType$.class)) .buildOrThrow(); } diff --git a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/SparkTypeToType.java b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/SparkTypeToType.java index 8beaefc5cc8f..b7ed31c274d7 100644 --- a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/SparkTypeToType.java +++ b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/SparkTypeToType.java @@ -35,6 +35,7 @@ import org.apache.spark.sql.types.IntegerType; import org.apache.spark.sql.types.LongType; import org.apache.spark.sql.types.MapType; +import org.apache.spark.sql.types.NullType; import org.apache.spark.sql.types.ShortType; import org.apache.spark.sql.types.StringType; import org.apache.spark.sql.types.StructField; @@ -155,6 +156,8 @@ public Type atomic(DataType atomic) { ((DecimalType) atomic).precision(), ((DecimalType) atomic).scale()); } else if (atomic instanceof BinaryType) { return Types.BinaryType.get(); + } else if (atomic instanceof NullType) { + return Types.UnknownType.get(); } throw new UnsupportedOperationException("Not a supported type: " + atomic.catalogString()); diff --git 
a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/TypeToSparkType.java b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/TypeToSparkType.java index dfb9b30be603..d33632bbbd54 100644 --- a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/TypeToSparkType.java +++ b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/TypeToSparkType.java @@ -38,6 +38,7 @@ import org.apache.spark.sql.types.MapType$; import org.apache.spark.sql.types.Metadata; import org.apache.spark.sql.types.MetadataBuilder; +import org.apache.spark.sql.types.NullType$; import org.apache.spark.sql.types.StringType$; import org.apache.spark.sql.types.StructField; import org.apache.spark.sql.types.StructType$; @@ -124,9 +125,11 @@ public DataType primitive(Type.PrimitiveType primitive) { case DECIMAL: Types.DecimalType decimal = (Types.DecimalType) primitive; return DecimalType$.MODULE$.apply(decimal.precision(), decimal.scale()); + case UNKNOWN: + return NullType$.MODULE$; default: throw new UnsupportedOperationException( - "Cannot convert unknown type to Spark: " + primitive); + "Cannot convert unsupported type to Spark: " + primitive); } } diff --git a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/ParquetWithSparkSchemaVisitor.java b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/ParquetWithSparkSchemaVisitor.java index d74a76f94e87..2a2eef198b76 100644 --- a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/ParquetWithSparkSchemaVisitor.java +++ b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/ParquetWithSparkSchemaVisitor.java @@ -31,6 +31,7 @@ import org.apache.parquet.schema.Type.Repetition; import org.apache.spark.sql.types.ArrayType; import org.apache.spark.sql.types.DataType; +import org.apache.spark.sql.types.DataTypes; import org.apache.spark.sql.types.MapType; import org.apache.spark.sql.types.Metadata; import org.apache.spark.sql.types.StructField; @@ -181,21 +182,27 @@ private static T visitField( private 
static List visitFields( StructType struct, GroupType group, ParquetWithSparkSchemaVisitor visitor) { - StructField[] sFields = struct.fields(); - Preconditions.checkArgument( - sFields.length == group.getFieldCount(), "Structs do not match: %s and %s", struct, group); List results = Lists.newArrayListWithExpectedSize(group.getFieldCount()); - for (int i = 0; i < sFields.length; i += 1) { - Type field = group.getFields().get(i); - StructField sField = sFields[i]; - Preconditions.checkArgument( - field.getName().equals(AvroSchemaUtil.makeCompatibleName(sField.name())), - "Structs do not match: field %s != %s", - field.getName(), - sField.name()); - results.add(visitField(sField, field, visitor)); + + int fieldIndex = 0; + for (StructField sField : struct.fields()) { + if (sField.dataType() != DataTypes.NullType) { + Type field = group.getFields().get(fieldIndex); + Preconditions.checkArgument( + field.getName().equals(AvroSchemaUtil.makeCompatibleName(sField.name())), + "Structs do not match: field %s != %s", + field.getName(), + sField.name()); + results.add(visitField(sField, field, visitor)); + + fieldIndex += 1; + } } + // All the group fields should have been visited + Preconditions.checkArgument( + fieldIndex == group.getFieldCount(), "Structs do not match: %s and %s", struct, group); + return results; } diff --git a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcWriter.java b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcWriter.java index 6b799e677bf4..6fc8849c82b2 100644 --- a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcWriter.java +++ b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcWriter.java @@ -20,6 +20,8 @@ import java.io.Serializable; import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; import java.util.stream.Stream; import javax.annotation.Nullable; import org.apache.iceberg.FieldMetrics; @@ -77,7 +79,7 @@ public OrcValueWriter 
record( TypeDescription record, List names, List> fields) { - return new InternalRowWriter(fields, record.getChildren()); + return new InternalRowWriter(fields, iStruct, record.getChildren()); } @Override @@ -133,12 +135,16 @@ public OrcValueWriter primitive(Type.PrimitiveType iPrimitive, TypeDescriptio private static class InternalRowWriter extends GenericOrcWriters.StructWriter { private final List> fieldGetters; - InternalRowWriter(List> writers, List orcTypes) { - super(writers); + InternalRowWriter( + List> writers, Types.StructType iStruct, List orcTypes) { + super(iStruct, writers); this.fieldGetters = Lists.newArrayListWithExpectedSize(orcTypes.size()); - for (TypeDescription orcType : orcTypes) { - fieldGetters.add(createFieldGetter(orcType)); + Map idToType = + orcTypes.stream().collect(Collectors.toMap(ORCSchemaUtil::fieldId, s -> s)); + + for (Types.NestedField iField : iStruct.fields()) { + fieldGetters.add(createFieldGetter(idToType.get(iField.fieldId()))); } } @@ -149,6 +155,11 @@ protected Object get(InternalRow struct, int index) { } static FieldGetter createFieldGetter(TypeDescription fieldType) { + // In the case of an UnknownType + if (fieldType == null) { + return (row, ordinal) -> null; + } + final FieldGetter fieldGetter; switch (fieldType.getCategory()) { case BOOLEAN: diff --git a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java index f4ae6114c8ab..a1ed6c66f337 100644 --- a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java +++ b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java @@ -26,6 +26,7 @@ import java.util.NoSuchElementException; import java.util.Optional; import java.util.UUID; +import java.util.stream.IntStream; import org.apache.iceberg.Schema; import org.apache.iceberg.parquet.ParquetValueReaders.ReusableEntry; import 
org.apache.iceberg.parquet.ParquetValueWriter; @@ -55,6 +56,7 @@ import org.apache.spark.sql.types.DataType; import org.apache.spark.sql.types.Decimal; import org.apache.spark.sql.types.MapType; +import org.apache.spark.sql.types.NullType; import org.apache.spark.sql.types.ShortType; import org.apache.spark.sql.types.StructField; import org.apache.spark.sql.types.StructType; @@ -94,15 +96,18 @@ public ParquetValueWriter message( public ParquetValueWriter struct( StructType sStruct, GroupType struct, List> fieldWriters) { List fields = struct.getFields(); - StructField[] sparkFields = sStruct.fields(); List> writers = Lists.newArrayListWithExpectedSize(fieldWriters.size()); - List sparkTypes = Lists.newArrayList(); for (int i = 0; i < fields.size(); i += 1) { writers.add(newOption(struct.getType(i), fieldWriters.get(i))); - sparkTypes.add(sparkFields[i].dataType()); } - return new InternalRowWriter(writers, sparkTypes); + StructField[] sFields = sStruct.fields(); + DataType[] types = new DataType[sFields.length]; + for (int i = 0; i < sFields.length; i += 1) { + types[i] = sFields[i].dataType(); + } + + return new InternalRowWriter(writers, types); } @Override @@ -566,14 +571,33 @@ public Map.Entry next() { private static class InternalRowWriter extends ParquetValueWriters.StructWriter { private final DataType[] types; - private InternalRowWriter(List> writers, List types) { - super(writers); - this.types = types.toArray(new DataType[0]); + private InternalRowWriter(List> writers, DataType[] types) { + super(writerToFieldIndex(types, writers.size()), writers); + this.types = types; } @Override protected Object get(InternalRow struct, int index) { return struct.get(index, types[index]); } + + /** Returns a mapping from writer index to field index, skipping Unknown columns. 
*/ + private static int[] writerToFieldIndex(DataType[] types, int numWriters) { + if (null == types) { + return IntStream.rangeClosed(0, numWriters).toArray(); + } + + // value writer index to record field index + int[] indexes = new int[numWriters]; + int writerIndex = 0; + for (int pos = 0; pos < types.length; pos += 1) { + if (!(types[pos] instanceof NullType)) { + indexes[writerIndex] = pos; + writerIndex += 1; + } + } + + return indexes; + } } } diff --git a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkOrcReaders.java b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkOrcReaders.java index 8dceb075e604..4f324239881e 100644 --- a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkOrcReaders.java +++ b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkOrcReaders.java @@ -465,6 +465,8 @@ public ColumnVector convert( DeletedColumnVector deletedVector = new DeletedColumnVector(field.type()); deletedVector.setValue(new boolean[batchSize]); fieldVectors.add(deletedVector); + } else if (field.type().equals(Types.UnknownType.get())) { + fieldVectors.add(new ConstantColumnVector(field.type(), batchSize, null)); } else { fieldVectors.add( fieldConverters diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/TestSparkSchemaUtil.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/TestSparkSchemaUtil.java index 9b5b207a5b6b..0846cf6f1161 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/TestSparkSchemaUtil.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/TestSparkSchemaUtil.java @@ -24,9 +24,11 @@ import java.util.List; import org.apache.iceberg.MetadataColumns; import org.apache.iceberg.Schema; +import org.apache.iceberg.types.Type; import org.apache.iceberg.types.Types; import org.apache.spark.sql.catalyst.expressions.AttributeReference; import 
org.apache.spark.sql.catalyst.expressions.MetadataAttribute; +import org.apache.spark.sql.types.DataTypes; import org.apache.spark.sql.types.StructType; import org.junit.jupiter.api.Test; @@ -79,4 +81,18 @@ public void testSchemaConversionWithMetaDataColumnSchema() { } } } + + @Test + public void testUnknownTypeToSpark() { + Schema schema = new Schema(optional(1, "col", Types.UnknownType.get())); + StructType sparkType = SparkSchemaUtil.convert(schema); + assertThat(sparkType.fields()[0].dataType()).isEqualTo(DataTypes.NullType); + } + + @Test + public void testNullTypeToIceberg() { + StructType sparkType = new StructType().add("col", DataTypes.NullType, true); + Type icebergType = SparkSchemaUtil.convert(sparkType).findField("col").type(); + assertThat(icebergType).isEqualTo(Types.UnknownType.get()); + } } diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/data/AvroDataTestBase.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/data/AvroDataTestBase.java index 0db6a65fd394..45053c1a4f1f 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/data/AvroDataTestBase.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/data/AvroDataTestBase.java @@ -32,6 +32,7 @@ import java.util.Map; import java.util.UUID; import java.util.concurrent.atomic.AtomicInteger; +import java.util.stream.Collectors; import java.util.stream.Stream; import org.apache.iceberg.MetadataColumns; import org.apache.iceberg.Schema; @@ -108,8 +109,8 @@ protected boolean supportsRowLineage() { required(114, "dec_9_0", Types.DecimalType.of(9, 0)), // int encoded required(115, "dec_11_2", Types.DecimalType.of(11, 2)), // long encoded required(116, "dec_20_5", Types.DecimalType.of(20, 5)), // requires padding - required(117, "dec_38_10", Types.DecimalType.of(38, 10)) // Spark's maximum precision - ); + required(117, "dec_38_10", Types.DecimalType.of(38, 10)), // Spark's maximum precision + optional(118, "unk", Types.UnknownType.get())); @TempDir 
protected Path temp; @@ -120,10 +121,13 @@ public void testSimpleStruct() throws IOException { @Test public void testStructWithRequiredFields() throws IOException { + List supportedPrimitives = + SUPPORTED_PRIMITIVES.fields().stream() + .filter(f -> f.type().typeId() != Type.TypeID.UNKNOWN) + .collect(Collectors.toList()); writeAndValidate( TypeUtil.assignIncreasingFreshIds( - new Schema( - Lists.transform(SUPPORTED_PRIMITIVES.fields(), Types.NestedField::asRequired)))); + new Schema(Lists.transform(supportedPrimitives, Types.NestedField::asRequired)))); } @Test @@ -603,4 +607,48 @@ public void testRowLineage() throws Exception { record.copy(Map.of("id", 4L, "data", "d", "_row_id", 1_001L)), record.copy(Map.of("id", 5L, "data", "e")))); } + + @Test + public void testUnknownNestedLevel() throws IOException { + assumeThat(supportsNestedTypes()).isTrue(); + + Schema schema = + new Schema( + required(1, "id", LongType.get()), + optional( + 2, + "nested", + Types.StructType.of( + required(20, "int", Types.IntegerType.get()), + optional(21, "unk", Types.UnknownType.get())))); + + writeAndValidate(schema); + } + + @Test + public void testUnknownListType() throws IOException { + assumeThat(supportsNestedTypes()).isTrue(); + + Schema schema = + new Schema( + required(0, "id", LongType.get()), + optional(1, "data", ListType.ofOptional(2, Types.UnknownType.get()))); + + writeAndValidate(schema); + } + + @Test + public void testUnknownMapType() throws IOException { + assumeThat(supportsNestedTypes()).isTrue(); + + Schema schema = + new Schema( + required(0, "id", LongType.get()), + optional( + 1, + "data", + MapType.ofOptional(2, 3, Types.StringType.get(), Types.UnknownType.get()))); + + writeAndValidate(schema); + } } diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkOrcReader.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkOrcReader.java index 546a44fc77bb..a1f71848b14e 100644 --- 
a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkOrcReader.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkOrcReader.java @@ -21,6 +21,7 @@ import static org.apache.iceberg.spark.data.TestHelpers.assertEquals; import static org.apache.iceberg.types.Types.NestedField.required; import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; import java.io.File; import java.io.IOException; @@ -106,4 +107,20 @@ private void writeAndValidateRecords(Schema schema, Iterable expect private Iterator batchesToRows(Iterator batches) { return Iterators.concat(Iterators.transform(batches, ColumnarBatch::rowIterator)); } + + @Test + @Override + public void testUnknownListType() { + assertThatThrownBy(super::testUnknownListType) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageStartingWith("Cannot create ListType with unknown element type"); + } + + @Test + @Override + public void testUnknownMapType() { + assertThatThrownBy(super::testUnknownMapType) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageStartingWith("Cannot create MapType with unknown value type"); + } } diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReader.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReader.java index 9ae8b8cbe530..993dc868bba8 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReader.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReader.java @@ -249,4 +249,20 @@ public void testMissingRequiredWithoutDefault() { .isInstanceOf(IllegalArgumentException.class) .hasMessage("Missing required field: missing_str"); } + + @Test + @Override + public void testUnknownListType() { + assertThatThrownBy(super::testUnknownListType) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageStartingWith("Cannot 
convert element Parquet: unknown"); + } + + @Test + @Override + public void testUnknownMapType() { + assertThatThrownBy(super::testUnknownMapType) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageStartingWith("Cannot convert value Parquet: unknown"); + } } diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkRecordOrcReaderWriter.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkRecordOrcReaderWriter.java index 8e1f860085c6..3c88db139e47 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkRecordOrcReaderWriter.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkRecordOrcReaderWriter.java @@ -20,6 +20,7 @@ import static org.apache.iceberg.types.Types.NestedField.required; import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; import java.io.File; import java.io.IOException; @@ -152,4 +153,20 @@ private static void assertEqualsUnsafe( .isFalse(); assertThat(actualIter.hasNext()).as("Actual iterator should not have any extra rows").isFalse(); } + + @Test + @Override + public void testUnknownListType() { + assertThatThrownBy(super::testUnknownListType) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageStartingWith("Cannot create ListType with unknown element type"); + } + + @Test + @Override + public void testUnknownMapType() { + assertThatThrownBy(super::testUnknownMapType) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageStartingWith("Cannot create MapType with unknown value type"); + } } diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/ScanTestBase.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/ScanTestBase.java index c368c4a815fe..91d07e3647c9 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/ScanTestBase.java +++ 
b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/ScanTestBase.java @@ -95,14 +95,11 @@ protected void writeAndValidate(Schema writeSchema, Schema expectedSchema) throw HadoopTables tables = new HadoopTables(CONF); // If V3 spec features are used, set the format version to 3 - Map tableProperties = - writeSchema.columns().stream() - .anyMatch(f -> f.initialDefaultLiteral() != null || f.writeDefaultLiteral() != null) - ? ImmutableMap.of(TableProperties.FORMAT_VERSION, "3") - : ImmutableMap.of(); + Map tableProperties = ImmutableMap.of(TableProperties.FORMAT_VERSION, "3"); Table table = tables.create( writeSchema, PartitionSpec.unpartitioned(), tableProperties, location.toString()); + configureTable(table); // Important: use the table's schema for the rest of the test // When tables are created, the column ids are reassigned. diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestORCDataFrameWrite.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestORCDataFrameWrite.java index 35be6423ee23..892e260f66f0 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestORCDataFrameWrite.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestORCDataFrameWrite.java @@ -18,9 +18,13 @@ */ package org.apache.iceberg.spark.source; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + import org.apache.iceberg.FileFormat; import org.apache.iceberg.Table; import org.apache.iceberg.TableProperties; +import org.apache.spark.SparkException; +import org.junit.jupiter.api.Test; public class TestORCDataFrameWrite extends DataFrameWriteTestBase { @Override @@ -30,4 +34,24 @@ protected void configureTable(Table table) { .set(TableProperties.DEFAULT_FILE_FORMAT, FileFormat.ORC.toString()) .commit(); } + + @Test + @Override + public void testUnknownListType() { + assertThatThrownBy(super::testUnknownListType) + .isInstanceOf(SparkException.class) + .cause() + 
.isInstanceOf(IllegalArgumentException.class) + .hasMessageStartingWith("Cannot create ListType with unknown element type"); + } + + @Test + @Override + public void testUnknownMapType() { + assertThatThrownBy(super::testUnknownMapType) + .isInstanceOf(SparkException.class) + .cause() + .isInstanceOf(IllegalArgumentException.class) + .hasMessageStartingWith("Cannot create MapType with unknown value type"); + } } diff --git a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestParquetDataFrameWrite.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestParquetDataFrameWrite.java index 90a9ac48a486..c24d92ef30af 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestParquetDataFrameWrite.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestParquetDataFrameWrite.java @@ -18,9 +18,13 @@ */ package org.apache.iceberg.spark.source; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + import org.apache.iceberg.FileFormat; import org.apache.iceberg.Table; import org.apache.iceberg.TableProperties; +import org.apache.spark.SparkException; +import org.junit.jupiter.api.Test; public class TestParquetDataFrameWrite extends DataFrameWriteTestBase { @Override @@ -30,4 +34,24 @@ protected void configureTable(Table table) { .set(TableProperties.DEFAULT_FILE_FORMAT, FileFormat.PARQUET.toString()) .commit(); } + + @Test + @Override + public void testUnknownListType() { + assertThatThrownBy(super::testUnknownListType) + .isInstanceOf(SparkException.class) + .cause() + .isInstanceOf(IllegalArgumentException.class) + .hasMessageStartingWith("Cannot convert element Parquet: unknown"); + } + + @Test + @Override + public void testUnknownMapType() { + assertThatThrownBy(super::testUnknownMapType) + .isInstanceOf(SparkException.class) + .cause() + .isInstanceOf(IllegalArgumentException.class) + .hasMessageStartingWith("Cannot convert value Parquet: unknown"); + } } diff --git 
a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestParquetScan.java b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestParquetScan.java index 6b9ec85b7f0b..6056f1a7929d 100644 --- a/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestParquetScan.java +++ b/spark/v3.4/spark/src/test/java/org/apache/iceberg/spark/source/TestParquetScan.java @@ -19,6 +19,7 @@ package org.apache.iceberg.spark.source; import static org.apache.iceberg.Files.localOutput; +import static org.assertj.core.api.Assertions.assertThatThrownBy; import static org.assertj.core.api.Assumptions.assumeThat; import java.io.File; @@ -37,6 +38,7 @@ import org.apache.iceberg.parquet.Parquet; import org.apache.iceberg.types.TypeUtil; import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.Test; public class TestParquetScan extends ScanTestBase { protected boolean vectorized() { @@ -84,4 +86,20 @@ protected void writeAndValidate(Schema writeSchema, Schema expectedSchema) throw super.writeAndValidate(writeSchema, expectedSchema); } + + @Test + @Override + public void testUnknownListType() { + assertThatThrownBy(super::testUnknownListType) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageStartingWith("Cannot convert element Parquet: unknown"); + } + + @Test + @Override + public void testUnknownMapType() { + assertThatThrownBy(super::testUnknownMapType) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageStartingWith("Cannot convert value Parquet: unknown"); + } } diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/PruneColumnsWithoutReordering.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/PruneColumnsWithoutReordering.java index fbd21f737450..fec413ca079a 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/PruneColumnsWithoutReordering.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/PruneColumnsWithoutReordering.java @@ -41,6 +41,7 @@ import 
org.apache.spark.sql.types.IntegerType$; import org.apache.spark.sql.types.LongType$; import org.apache.spark.sql.types.MapType; +import org.apache.spark.sql.types.NullType$; import org.apache.spark.sql.types.StringType$; import org.apache.spark.sql.types.StructField; import org.apache.spark.sql.types.StructType; @@ -238,5 +239,6 @@ public Type primitive(Type.PrimitiveType primitive) { .put(TypeID.STRING, ImmutableSet.of(StringType$.class)) .put(TypeID.FIXED, ImmutableSet.of(BinaryType$.class)) .put(TypeID.BINARY, ImmutableSet.of(BinaryType$.class)) + .put(TypeID.UNKNOWN, ImmutableSet.of(NullType$.class)) .buildOrThrow(); } diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/SparkTypeToType.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/SparkTypeToType.java index 8beaefc5cc8f..b7ed31c274d7 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/SparkTypeToType.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/SparkTypeToType.java @@ -35,6 +35,7 @@ import org.apache.spark.sql.types.IntegerType; import org.apache.spark.sql.types.LongType; import org.apache.spark.sql.types.MapType; +import org.apache.spark.sql.types.NullType; import org.apache.spark.sql.types.ShortType; import org.apache.spark.sql.types.StringType; import org.apache.spark.sql.types.StructField; @@ -155,6 +156,8 @@ public Type atomic(DataType atomic) { ((DecimalType) atomic).precision(), ((DecimalType) atomic).scale()); } else if (atomic instanceof BinaryType) { return Types.BinaryType.get(); + } else if (atomic instanceof NullType) { + return Types.UnknownType.get(); } throw new UnsupportedOperationException("Not a supported type: " + atomic.catalogString()); diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/TypeToSparkType.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/TypeToSparkType.java index dfb9b30be603..d33632bbbd54 100644 --- 
a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/TypeToSparkType.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/TypeToSparkType.java @@ -38,6 +38,7 @@ import org.apache.spark.sql.types.MapType$; import org.apache.spark.sql.types.Metadata; import org.apache.spark.sql.types.MetadataBuilder; +import org.apache.spark.sql.types.NullType$; import org.apache.spark.sql.types.StringType$; import org.apache.spark.sql.types.StructField; import org.apache.spark.sql.types.StructType$; @@ -124,9 +125,11 @@ public DataType primitive(Type.PrimitiveType primitive) { case DECIMAL: Types.DecimalType decimal = (Types.DecimalType) primitive; return DecimalType$.MODULE$.apply(decimal.precision(), decimal.scale()); + case UNKNOWN: + return NullType$.MODULE$; default: throw new UnsupportedOperationException( - "Cannot convert unknown type to Spark: " + primitive); + "Cannot convert unsupported type to Spark: " + primitive); } } diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/ParquetWithSparkSchemaVisitor.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/ParquetWithSparkSchemaVisitor.java index 9480385d5452..e11a85d538a6 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/ParquetWithSparkSchemaVisitor.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/ParquetWithSparkSchemaVisitor.java @@ -31,6 +31,7 @@ import org.apache.parquet.schema.Type.Repetition; import org.apache.spark.sql.types.ArrayType; import org.apache.spark.sql.types.DataType; +import org.apache.spark.sql.types.DataTypes; import org.apache.spark.sql.types.MapType; import org.apache.spark.sql.types.Metadata; import org.apache.spark.sql.types.StructField; @@ -173,21 +174,27 @@ private static T visitField( private static List visitFields( StructType struct, GroupType group, ParquetWithSparkSchemaVisitor visitor) { - StructField[] sFields = struct.fields(); - Preconditions.checkArgument( - sFields.length == 
group.getFieldCount(), "Structs do not match: %s and %s", struct, group); List results = Lists.newArrayListWithExpectedSize(group.getFieldCount()); - for (int i = 0; i < sFields.length; i += 1) { - Type field = group.getFields().get(i); - StructField sField = sFields[i]; - Preconditions.checkArgument( - field.getName().equals(AvroSchemaUtil.makeCompatibleName(sField.name())), - "Structs do not match: field %s != %s", - field.getName(), - sField.name()); - results.add(visitField(sField, field, visitor)); + + int fieldIndex = 0; + for (StructField sField : struct.fields()) { + if (sField.dataType() != DataTypes.NullType) { + Type field = group.getFields().get(fieldIndex); + Preconditions.checkArgument( + field.getName().equals(AvroSchemaUtil.makeCompatibleName(sField.name())), + "Structs do not match: field %s != %s", + field.getName(), + sField.name()); + results.add(visitField(sField, field, visitor)); + + fieldIndex += 1; + } } + // All the group fields should have been visited + Preconditions.checkArgument( + fieldIndex == group.getFieldCount(), "Structs do not match: %s and %s", struct, group); + return results; } diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcWriter.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcWriter.java index 6b799e677bf4..6fc8849c82b2 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcWriter.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcWriter.java @@ -20,6 +20,8 @@ import java.io.Serializable; import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; import java.util.stream.Stream; import javax.annotation.Nullable; import org.apache.iceberg.FieldMetrics; @@ -77,7 +79,7 @@ public OrcValueWriter record( TypeDescription record, List names, List> fields) { - return new InternalRowWriter(fields, record.getChildren()); + return new InternalRowWriter(fields, iStruct, record.getChildren()); } 
@Override @@ -133,12 +135,16 @@ public OrcValueWriter primitive(Type.PrimitiveType iPrimitive, TypeDescriptio private static class InternalRowWriter extends GenericOrcWriters.StructWriter { private final List> fieldGetters; - InternalRowWriter(List> writers, List orcTypes) { - super(writers); + InternalRowWriter( + List> writers, Types.StructType iStruct, List orcTypes) { + super(iStruct, writers); this.fieldGetters = Lists.newArrayListWithExpectedSize(orcTypes.size()); - for (TypeDescription orcType : orcTypes) { - fieldGetters.add(createFieldGetter(orcType)); + Map idToType = + orcTypes.stream().collect(Collectors.toMap(ORCSchemaUtil::fieldId, s -> s)); + + for (Types.NestedField iField : iStruct.fields()) { + fieldGetters.add(createFieldGetter(idToType.get(iField.fieldId()))); } } @@ -149,6 +155,11 @@ protected Object get(InternalRow struct, int index) { } static FieldGetter createFieldGetter(TypeDescription fieldType) { + // In the case of an UnknownType + if (fieldType == null) { + return (row, ordinal) -> null; + } + final FieldGetter fieldGetter; switch (fieldType.getCategory()) { case BOOLEAN: diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java index 58be7f610c81..a1ed6c66f337 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/SparkParquetWriters.java @@ -26,6 +26,7 @@ import java.util.NoSuchElementException; import java.util.Optional; import java.util.UUID; +import java.util.stream.IntStream; import org.apache.iceberg.Schema; import org.apache.iceberg.parquet.ParquetValueReaders.ReusableEntry; import org.apache.iceberg.parquet.ParquetValueWriter; @@ -55,6 +56,7 @@ import org.apache.spark.sql.types.DataType; import org.apache.spark.sql.types.Decimal; import org.apache.spark.sql.types.MapType; +import 
org.apache.spark.sql.types.NullType; import org.apache.spark.sql.types.ShortType; import org.apache.spark.sql.types.StructField; import org.apache.spark.sql.types.StructType; @@ -94,14 +96,18 @@ public ParquetValueWriter message( public ParquetValueWriter struct( StructType sStruct, GroupType struct, List> fieldWriters) { List fields = struct.getFields(); - StructField[] sparkFields = sStruct.fields(); List> writers = Lists.newArrayListWithExpectedSize(fieldWriters.size()); - List sparkTypes = Lists.newArrayList(); for (int i = 0; i < fields.size(); i += 1) { writers.add(newOption(struct.getType(i), fieldWriters.get(i))); - sparkTypes.add(sparkFields[i].dataType()); } - return new InternalRowWriter(writers, sparkTypes); + + StructField[] sFields = sStruct.fields(); + DataType[] types = new DataType[sFields.length]; + for (int i = 0; i < sFields.length; i += 1) { + types[i] = sFields[i].dataType(); + } + + return new InternalRowWriter(writers, types); } @Override @@ -565,14 +571,33 @@ public Map.Entry next() { private static class InternalRowWriter extends ParquetValueWriters.StructWriter { private final DataType[] types; - private InternalRowWriter(List> writers, List types) { - super(writers); - this.types = types.toArray(new DataType[0]); + private InternalRowWriter(List> writers, DataType[] types) { + super(writerToFieldIndex(types, writers.size()), writers); + this.types = types; } @Override protected Object get(InternalRow struct, int index) { return struct.get(index, types[index]); } + + /** Returns a mapping from writer index to field index, skipping Unknown columns. 
*/ + private static int[] writerToFieldIndex(DataType[] types, int numWriters) { + if (null == types) { + return IntStream.rangeClosed(0, numWriters).toArray(); + } + + // value writer index to record field index + int[] indexes = new int[numWriters]; + int writerIndex = 0; + for (int pos = 0; pos < types.length; pos += 1) { + if (!(types[pos] instanceof NullType)) { + indexes[writerIndex] = pos; + writerIndex += 1; + } + } + + return indexes; + } } } diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/TestSparkSchemaUtil.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/TestSparkSchemaUtil.java index 4045847d5a4a..b8f436cf2d86 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/TestSparkSchemaUtil.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/TestSparkSchemaUtil.java @@ -24,10 +24,12 @@ import java.util.List; import org.apache.iceberg.MetadataColumns; import org.apache.iceberg.Schema; +import org.apache.iceberg.types.Type; import org.apache.iceberg.types.Types; import org.apache.spark.sql.catalyst.expressions.AttributeReference; import org.apache.spark.sql.catalyst.expressions.MetadataAttribute; import org.apache.spark.sql.catalyst.types.DataTypeUtils; +import org.apache.spark.sql.types.DataTypes; import org.apache.spark.sql.types.StructType; import org.junit.jupiter.api.Test; @@ -80,4 +82,18 @@ public void testSchemaConversionWithMetaDataColumnSchema() { } } } + + @Test + public void testUnknownTypeToSpark() { + Schema schema = new Schema(optional(1, "col", Types.UnknownType.get())); + StructType sparkType = SparkSchemaUtil.convert(schema); + assertThat(sparkType.fields()[0].dataType()).isEqualTo(DataTypes.NullType); + } + + @Test + public void testNullTypeToIceberg() { + StructType sparkType = new StructType().add("col", DataTypes.NullType, true); + Type icebergType = SparkSchemaUtil.convert(sparkType).findField("col").type(); + assertThat(icebergType).isEqualTo(Types.UnknownType.get()); + } } 
diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/AvroDataTestBase.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/AvroDataTestBase.java index 0db6a65fd394..45053c1a4f1f 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/AvroDataTestBase.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/AvroDataTestBase.java @@ -32,6 +32,7 @@ import java.util.Map; import java.util.UUID; import java.util.concurrent.atomic.AtomicInteger; +import java.util.stream.Collectors; import java.util.stream.Stream; import org.apache.iceberg.MetadataColumns; import org.apache.iceberg.Schema; @@ -108,8 +109,8 @@ protected boolean supportsRowLineage() { required(114, "dec_9_0", Types.DecimalType.of(9, 0)), // int encoded required(115, "dec_11_2", Types.DecimalType.of(11, 2)), // long encoded required(116, "dec_20_5", Types.DecimalType.of(20, 5)), // requires padding - required(117, "dec_38_10", Types.DecimalType.of(38, 10)) // Spark's maximum precision - ); + required(117, "dec_38_10", Types.DecimalType.of(38, 10)), // Spark's maximum precision + optional(118, "unk", Types.UnknownType.get())); @TempDir protected Path temp; @@ -120,10 +121,13 @@ public void testSimpleStruct() throws IOException { @Test public void testStructWithRequiredFields() throws IOException { + List supportedPrimitives = + SUPPORTED_PRIMITIVES.fields().stream() + .filter(f -> f.type().typeId() != Type.TypeID.UNKNOWN) + .collect(Collectors.toList()); writeAndValidate( TypeUtil.assignIncreasingFreshIds( - new Schema( - Lists.transform(SUPPORTED_PRIMITIVES.fields(), Types.NestedField::asRequired)))); + new Schema(Lists.transform(supportedPrimitives, Types.NestedField::asRequired)))); } @Test @@ -603,4 +607,48 @@ public void testRowLineage() throws Exception { record.copy(Map.of("id", 4L, "data", "d", "_row_id", 1_001L)), record.copy(Map.of("id", 5L, "data", "e")))); } + + @Test + public void testUnknownNestedLevel() throws IOException { + 
assumeThat(supportsNestedTypes()).isTrue(); + + Schema schema = + new Schema( + required(1, "id", LongType.get()), + optional( + 2, + "nested", + Types.StructType.of( + required(20, "int", Types.IntegerType.get()), + optional(21, "unk", Types.UnknownType.get())))); + + writeAndValidate(schema); + } + + @Test + public void testUnknownListType() throws IOException { + assumeThat(supportsNestedTypes()).isTrue(); + + Schema schema = + new Schema( + required(0, "id", LongType.get()), + optional(1, "data", ListType.ofOptional(2, Types.UnknownType.get()))); + + writeAndValidate(schema); + } + + @Test + public void testUnknownMapType() throws IOException { + assumeThat(supportsNestedTypes()).isTrue(); + + Schema schema = + new Schema( + required(0, "id", LongType.get()), + optional( + 1, + "data", + MapType.ofOptional(2, 3, Types.StringType.get(), Types.UnknownType.get()))); + + writeAndValidate(schema); + } } diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkOrcReader.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkOrcReader.java index 3b68a830b088..3fcfe6845c99 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkOrcReader.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkOrcReader.java @@ -21,6 +21,7 @@ import static org.apache.iceberg.spark.data.TestHelpers.assertEquals; import static org.apache.iceberg.types.Types.NestedField.required; import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; import java.io.File; import java.io.IOException; @@ -107,4 +108,20 @@ private void writeAndValidateRecords(Schema schema, Iterable expect private Iterator batchesToRows(Iterator batches) { return Iterators.concat(Iterators.transform(batches, ColumnarBatch::rowIterator)); } + + @Test + @Override + public void testUnknownListType() { + assertThatThrownBy(super::testUnknownListType) + 
.isInstanceOf(IllegalArgumentException.class) + .hasMessageStartingWith("Cannot create ListType with unknown element type"); + } + + @Test + @Override + public void testUnknownMapType() { + assertThatThrownBy(super::testUnknownMapType) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageStartingWith("Cannot create MapType with unknown value type"); + } } diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReader.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReader.java index 328dcaa0014c..bc4b77059d43 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReader.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkParquetReader.java @@ -248,4 +248,20 @@ public void testMissingRequiredWithoutDefault() { .isInstanceOf(IllegalArgumentException.class) .hasMessage("Missing required field: missing_str"); } + + @Test + @Override + public void testUnknownListType() { + assertThatThrownBy(super::testUnknownListType) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageStartingWith("Cannot convert element Parquet: unknown"); + } + + @Test + @Override + public void testUnknownMapType() { + assertThatThrownBy(super::testUnknownMapType) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageStartingWith("Cannot convert value Parquet: unknown"); + } } diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkRecordOrcReaderWriter.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkRecordOrcReaderWriter.java index bf738be59cb8..634327a81d86 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkRecordOrcReaderWriter.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/data/TestSparkRecordOrcReaderWriter.java @@ -20,6 +20,7 @@ import static org.apache.iceberg.types.Types.NestedField.required; import static 
org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; import java.io.File; import java.io.IOException; @@ -150,4 +151,20 @@ private static void assertEqualsUnsafe( assertThat(expectedIter).as("Expected iterator should not have any extra rows.").isExhausted(); assertThat(actualIter).as("Actual iterator should not have any extra rows.").isExhausted(); } + + @Test + @Override + public void testUnknownListType() { + assertThatThrownBy(super::testUnknownListType) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageStartingWith("Cannot create ListType with unknown element type"); + } + + @Test + @Override + public void testUnknownMapType() { + assertThatThrownBy(super::testUnknownMapType) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageStartingWith("Cannot create MapType with unknown value type"); + } } diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/ScanTestBase.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/ScanTestBase.java index 39ea25ae6f54..91d07e3647c9 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/ScanTestBase.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/ScanTestBase.java @@ -95,11 +95,7 @@ protected void writeAndValidate(Schema writeSchema, Schema expectedSchema) throw HadoopTables tables = new HadoopTables(CONF); // If V3 spec features are used, set the format version to 3 - Map tableProperties = - writeSchema.columns().stream() - .anyMatch(f -> f.initialDefaultLiteral() != null || f.writeDefaultLiteral() != null) - ? 
ImmutableMap.of(TableProperties.FORMAT_VERSION, "3") - : ImmutableMap.of(); + Map tableProperties = ImmutableMap.of(TableProperties.FORMAT_VERSION, "3"); Table table = tables.create( writeSchema, PartitionSpec.unpartitioned(), tableProperties, location.toString()); diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestORCDataFrameWrite.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestORCDataFrameWrite.java index 35be6423ee23..892e260f66f0 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestORCDataFrameWrite.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestORCDataFrameWrite.java @@ -18,9 +18,13 @@ */ package org.apache.iceberg.spark.source; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + import org.apache.iceberg.FileFormat; import org.apache.iceberg.Table; import org.apache.iceberg.TableProperties; +import org.apache.spark.SparkException; +import org.junit.jupiter.api.Test; public class TestORCDataFrameWrite extends DataFrameWriteTestBase { @Override @@ -30,4 +34,24 @@ protected void configureTable(Table table) { .set(TableProperties.DEFAULT_FILE_FORMAT, FileFormat.ORC.toString()) .commit(); } + + @Test + @Override + public void testUnknownListType() { + assertThatThrownBy(super::testUnknownListType) + .isInstanceOf(SparkException.class) + .cause() + .isInstanceOf(IllegalArgumentException.class) + .hasMessageStartingWith("Cannot create ListType with unknown element type"); + } + + @Test + @Override + public void testUnknownMapType() { + assertThatThrownBy(super::testUnknownMapType) + .isInstanceOf(SparkException.class) + .cause() + .isInstanceOf(IllegalArgumentException.class) + .hasMessageStartingWith("Cannot create MapType with unknown value type"); + } } diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestParquetDataFrameWrite.java 
b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestParquetDataFrameWrite.java index 90a9ac48a486..c24d92ef30af 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestParquetDataFrameWrite.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestParquetDataFrameWrite.java @@ -18,9 +18,13 @@ */ package org.apache.iceberg.spark.source; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + import org.apache.iceberg.FileFormat; import org.apache.iceberg.Table; import org.apache.iceberg.TableProperties; +import org.apache.spark.SparkException; +import org.junit.jupiter.api.Test; public class TestParquetDataFrameWrite extends DataFrameWriteTestBase { @Override @@ -30,4 +34,24 @@ protected void configureTable(Table table) { .set(TableProperties.DEFAULT_FILE_FORMAT, FileFormat.PARQUET.toString()) .commit(); } + + @Test + @Override + public void testUnknownListType() { + assertThatThrownBy(super::testUnknownListType) + .isInstanceOf(SparkException.class) + .cause() + .isInstanceOf(IllegalArgumentException.class) + .hasMessageStartingWith("Cannot convert element Parquet: unknown"); + } + + @Test + @Override + public void testUnknownMapType() { + assertThatThrownBy(super::testUnknownMapType) + .isInstanceOf(SparkException.class) + .cause() + .isInstanceOf(IllegalArgumentException.class) + .hasMessageStartingWith("Cannot convert value Parquet: unknown"); + } } diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestParquetScan.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestParquetScan.java index c0dee43d6de1..8b567bcaf11e 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestParquetScan.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestParquetScan.java @@ -19,6 +19,7 @@ package org.apache.iceberg.spark.source; import static org.apache.iceberg.Files.localOutput; +import static 
org.assertj.core.api.Assertions.assertThatThrownBy; import static org.assertj.core.api.Assumptions.assumeThat; import java.io.File; @@ -37,6 +38,7 @@ import org.apache.iceberg.parquet.Parquet; import org.apache.iceberg.types.TypeUtil; import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.Test; public class TestParquetScan extends ScanTestBase { protected boolean vectorized() { @@ -83,4 +85,20 @@ protected void writeAndValidate(Schema writeSchema, Schema expectedSchema) throw super.writeAndValidate(writeSchema, expectedSchema); } + + @Test + @Override + public void testUnknownListType() { + assertThatThrownBy(super::testUnknownListType) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageStartingWith("Cannot convert element Parquet: unknown"); + } + + @Test + @Override + public void testUnknownMapType() { + assertThatThrownBy(super::testUnknownMapType) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageStartingWith("Cannot convert value Parquet: unknown"); + } } From 0841cdea98645d14eb2394cc731d01713b67ad2b Mon Sep 17 00:00:00 2001 From: Soumyajit Sahu Date: Tue, 5 May 2026 08:41:50 -0700 Subject: [PATCH 156/197] Sink connector crashes on timestamps with fractional seconds and colon-separated UTC offset (Fixes #15838) (#15839) * handle fractional seconds in timestamp --------- Co-authored-by: Som Sahu --- .../iceberg/connect/data/RecordConverter.java | 17 +++++++--- .../connect/data/TestRecordConverter.java | 33 +++++++++++++++++++ 2 files changed, 46 insertions(+), 4 deletions(-) diff --git a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/RecordConverter.java b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/RecordConverter.java index 51f64a9d4b05..ab3d5aa9bb43 100644 --- a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/RecordConverter.java +++ b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/RecordConverter.java @@ -768,10 
+768,19 @@ private String ensureTimestampFormat(String str) { if (result.charAt(10) == ' ') { result = result.substring(0, 10) + 'T' + result.substring(11); } - if (result.length() > 22 - && (result.charAt(19) == '+' || result.charAt(19) == '-') - && result.charAt(22) == ':') { - result = result.substring(0, 19) + result.substring(19).replace(":", ""); + // Search for the timezone offset sign starting after the seconds portion (index 19+). + // With fractional seconds (e.g. "...T03:17:37.260514+00:00") the sign appears later + // than index 19, so we must locate it dynamically rather than assuming a fixed position. + int signIdx = -1; + for (int i = 19; i < result.length(); i++) { + char ch = result.charAt(i); + if (ch == '+' || ch == '-') { + signIdx = i; + break; + } + } + if (signIdx != -1 && signIdx + 3 < result.length() && result.charAt(signIdx + 3) == ':') { + result = result.substring(0, signIdx + 3) + result.substring(signIdx + 4); } return result; } diff --git a/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/TestRecordConverter.java b/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/TestRecordConverter.java index 56a9b6e100ac..9b91ba61c167 100644 --- a/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/TestRecordConverter.java +++ b/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/TestRecordConverter.java @@ -578,6 +578,22 @@ public void testTimestampWithZoneConversion() { assertTimestampConvert(expected, additionalInput, TimestampType.withZone()); } + @Test + public void testTimestampWithZoneAndFractionalSecondsConversion() { + // Timestamps with sub-second precision and a colon-separated UTC offset (e.g. +00:00) + // were previously mis-parsed because ensureTimestampFormat only checked for the timezone + // sign at the fixed index 19, which is only valid when there are no fractional seconds. 
+ OffsetDateTime expected = OffsetDateTime.parse("2026-03-31T03:17:37.260514+00:00"); + List inputs = + ImmutableList.of( + "2026-03-31T03:17:37.260514+00:00", + "2026-03-31T03:17:37.260514+0000", + "2026-03-31T03:17:37.260514Z", + "2026-03-31 03:17:37.260514+00:00", + "2026-03-31 03:17:37.260514+0000"); + assertTimestampConvert(expected, inputs, TimestampType.withZone()); + } + @Test public void testTimestampWithoutZoneConversion() { LocalDateTime expected = LocalDateTime.parse("2023-05-18T11:22:33"); @@ -596,6 +612,23 @@ public void testTimestampWithoutZoneConversion() { assertTimestampConvert(expected, additionalInput, TimestampType.withoutZone()); } + @Test + public void testTimestampWithoutZoneAndFractionalSecondsConversion() { + // Fractional seconds with a colon-separated offset: timezone must be stripped and + // the colon in +HH:MM must be normalized before OFFSET_TIMESTAMP_FORMAT can parse it. + LocalDateTime expected = LocalDateTime.parse("2026-03-31T03:17:37.260514"); + List inputs = + ImmutableList.of( + "2026-03-31T03:17:37.260514", + "2026-03-31 03:17:37.260514", + "2026-03-31T03:17:37.260514+00:00", + "2026-03-31 03:17:37.260514+00:00", + "2026-03-31T03:17:37.260514+0000", + "2026-03-31 03:17:37.260514+0000", + "2026-03-31T03:17:37.260514Z"); + assertTimestampConvert(expected, inputs, TimestampType.withoutZone()); + } + private void assertTimestampConvert(Temporal expected, long expectedMillis, TimestampType type) { List inputList = Lists.newArrayList( From 2f6606a247e2b16be46ca6c02fc4cfc2e17691e6 Mon Sep 17 00:00:00 2001 From: Swapna Marru Date: Tue, 5 May 2026 09:38:22 -0700 Subject: [PATCH 157/197] Flink: Backport: Dynamic sink options to be configurable in SQL (#16209) backports #15780 --- .../apache/iceberg/flink/FlinkConfParser.java | 2 +- .../sink/dynamic/DynamicIcebergSink.java | 71 ++++++----- .../flink/sink/dynamic/DynamicRecord.java | 4 + .../sink/dynamic/DynamicRecordProcessor.java | 36 ++++-- 
.../sink/dynamic/DynamicRecordWithConfig.java | 94 ++++++++++++++ .../dynamic/DynamicTableUpdateOperator.java | 19 +-- .../sink/dynamic/FlinkDynamicSinkConf.java | 102 +++++++++++++++ .../sink/dynamic/FlinkDynamicSinkOptions.java | 71 +++++++++++ .../flink/sink/dynamic/HashKeyGenerator.java | 5 +- .../sink/dynamic/TestDynamicIcebergSink.java | 105 +++++++++++++-- .../dynamic/TestDynamicRecordWithConfig.java | 120 ++++++++++++++++++ .../TestDynamicTableUpdateOperator.java | 56 +++----- .../sink/dynamic/TestHashKeyGenerator.java | 63 ++++++++- .../apache/iceberg/flink/FlinkConfParser.java | 2 +- .../sink/dynamic/DynamicIcebergSink.java | 71 ++++++----- .../flink/sink/dynamic/DynamicRecord.java | 4 + .../sink/dynamic/DynamicRecordProcessor.java | 36 ++++-- .../sink/dynamic/DynamicRecordWithConfig.java | 94 ++++++++++++++ .../dynamic/DynamicTableUpdateOperator.java | 19 +-- .../sink/dynamic/FlinkDynamicSinkConf.java | 102 +++++++++++++++ .../sink/dynamic/FlinkDynamicSinkOptions.java | 71 +++++++++++ .../flink/sink/dynamic/HashKeyGenerator.java | 5 +- .../sink/dynamic/TestDynamicIcebergSink.java | 105 +++++++++++++-- .../dynamic/TestDynamicRecordWithConfig.java | 120 ++++++++++++++++++ .../TestDynamicTableUpdateOperator.java | 56 +++----- .../sink/dynamic/TestHashKeyGenerator.java | 63 ++++++++- 26 files changed, 1262 insertions(+), 234 deletions(-) create mode 100644 flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordWithConfig.java create mode 100644 flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/FlinkDynamicSinkConf.java create mode 100644 flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/FlinkDynamicSinkOptions.java create mode 100644 flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicRecordWithConfig.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordWithConfig.java create mode 100644 
flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/FlinkDynamicSinkConf.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/FlinkDynamicSinkOptions.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicRecordWithConfig.java diff --git a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/FlinkConfParser.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/FlinkConfParser.java index e0672811cf5f..7661372c88e8 100644 --- a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/FlinkConfParser.java +++ b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/FlinkConfParser.java @@ -44,7 +44,7 @@ public FlinkConfParser(Table table, Map options, ReadableConfig this.readableConfig = readableConfig; } - FlinkConfParser(Map options, ReadableConfig readableConfig) { + public FlinkConfParser(Map options, ReadableConfig readableConfig) { this.tableProperties = ImmutableMap.of(); this.options = options; this.readableConfig = readableConfig; diff --git a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java index ee56e39577e1..e7cd2c16459f 100644 --- a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java +++ b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java @@ -237,12 +237,6 @@ public static class Builder { private final Map snapshotSummary = Maps.newHashMap(); private ReadableConfig readableConfig = new Configuration(); private TableCreator tableCreator = TableCreator.DEFAULT; - private boolean immediateUpdate = false; - private boolean dropUnusedColumns = false; - private int cacheMaximumSize = 100; - private long cacheRefreshMs = 1_000; - private int inputSchemasPerTableCacheMaximumSize = 10; - private boolean caseSensitive = 
true; Builder() {} @@ -363,7 +357,9 @@ public Builder toBranch(String branch) { } public Builder immediateTableUpdate(boolean newImmediateUpdate) { - this.immediateUpdate = newImmediateUpdate; + writeOptions.put( + FlinkDynamicSinkOptions.IMMEDIATE_TABLE_UPDATE.key(), + Boolean.toString(newImmediateUpdate)); return this; } @@ -379,19 +375,21 @@ public Builder immediateTableUpdate(boolean newImmediateUpdate) { * will never return data of the old column. */ public Builder dropUnusedColumns(boolean newDropUnusedColumns) { - this.dropUnusedColumns = newDropUnusedColumns; + writeOptions.put( + FlinkDynamicSinkOptions.DROP_UNUSED_COLUMNS.key(), + Boolean.toString(newDropUnusedColumns)); return this; } /** Maximum size of the caches used in Dynamic Sink for table data and serializers. */ public Builder cacheMaxSize(int maxSize) { - this.cacheMaximumSize = maxSize; + writeOptions.put(FlinkDynamicSinkOptions.CACHE_MAX_SIZE.key(), Integer.toString(maxSize)); return this; } /** Maximum interval for cache items renewals. */ public Builder cacheRefreshMs(long refreshMs) { - this.cacheRefreshMs = refreshMs; + writeOptions.put(FlinkDynamicSinkOptions.CACHE_REFRESH_MS.key(), Long.toString(refreshMs)); return this; } @@ -401,7 +399,9 @@ public Builder cacheRefreshMs(long refreshMs) { * comparison results. */ public Builder inputSchemasPerTableCacheMaxSize(int inputSchemasPerTableCacheMaxSize) { - this.inputSchemasPerTableCacheMaximumSize = inputSchemasPerTableCacheMaxSize; + writeOptions.put( + FlinkDynamicSinkOptions.INPUT_SCHEMAS_PER_TABLE_CACHE_MAX_SIZE.key(), + Integer.toString(inputSchemasPerTableCacheMaxSize)); return this; } @@ -410,7 +410,8 @@ public Builder inputSchemasPerTableCacheMaxSize(int inputSchemasPerTableCache * field names case-sensitive. 
*/ public Builder caseSensitive(boolean newCaseSensitive) { - this.caseSensitive = newCaseSensitive; + writeOptions.put( + FlinkDynamicSinkOptions.CASE_SENSITIVE.key(), Boolean.toString(newCaseSensitive)); return this; } @@ -426,14 +427,14 @@ private DynamicIcebergSink build( generator != null, "Please use withGenerator() to convert the input DataStream."); Preconditions.checkNotNull(catalogLoader, "Catalog loader shouldn't be null"); - Configuration flinkConfig = - readableConfig instanceof Configuration - ? (Configuration) readableConfig - : Configuration.fromMap(readableConfig.toMap()); + Configuration flinkConfig = fromReadableConfig(); + FlinkDynamicSinkConf flinkDynamicSinkConf = + new FlinkDynamicSinkConf(writeOptions, flinkConfig); // Forward writer: chained with generator via forward edge, no data shuffle ForwardWriterSink forwardWriterSink = - new ForwardWriterSink(catalogLoader, writeOptions, flinkConfig, cacheMaximumSize); + new ForwardWriterSink( + catalogLoader, writeOptions, flinkConfig, flinkDynamicSinkConf.cacheMaxSize()); TypeInformation> writeResultTypeInfo = CommittableMessageTypeInfo.of(DynamicWriteResultSerializer::new); @@ -457,13 +458,15 @@ DynamicIcebergSink instantiateSink( Map writeProperties, Configuration flinkWriteConf, DataStream> forwardWriteResults) { + FlinkDynamicSinkConf flinkDynamicSinkConf = + new FlinkDynamicSinkConf(writeProperties, flinkWriteConf); return new DynamicIcebergSink( catalogLoader, snapshotSummary, uidPrefix, writeProperties, flinkWriteConf, - cacheMaximumSize, + flinkDynamicSinkConf.cacheMaxSize(), forwardWriteResults); } @@ -487,10 +490,14 @@ DynamicIcebergSink instantiateSink( public DataStreamSink append() { uidPrefix = Optional.ofNullable(uidPrefix).orElse(""); + FlinkDynamicSinkConf flinkDynamicSinkConf = + new FlinkDynamicSinkConf(writeOptions, readableConfig); + Configuration flinkConfig = fromReadableConfig(); + DynamicRecordInternalType type = - new DynamicRecordInternalType(catalogLoader, false, 
cacheMaximumSize); + new DynamicRecordInternalType(catalogLoader, false, flinkDynamicSinkConf.cacheMaxSize()); DynamicRecordInternalType sideOutputType = - new DynamicRecordInternalType(catalogLoader, true, cacheMaximumSize); + new DynamicRecordInternalType(catalogLoader, true, flinkDynamicSinkConf.cacheMaxSize()); SingleOutputStreamOperator converted = input @@ -498,13 +505,10 @@ public DataStreamSink append() { new DynamicRecordProcessor<>( generator, catalogLoader, - immediateUpdate, - cacheMaximumSize, - cacheRefreshMs, - inputSchemasPerTableCacheMaximumSize, tableCreator, - caseSensitive, - dropUnusedColumns)) + flinkDynamicSinkConf, + writeOptions, + flinkConfig)) .setParallelism(input.getParallelism()) .uid(prefixIfNotNull(uidPrefix, "-generator")) .name(operatorName("generator")) @@ -520,14 +524,7 @@ public DataStreamSink append() { DynamicRecordProcessor.DYNAMIC_TABLE_UPDATE_STREAM, sideOutputType)) .keyBy((KeySelector) DynamicRecordInternal::tableName) .map( - new DynamicTableUpdateOperator( - catalogLoader, - cacheMaximumSize, - cacheRefreshMs, - inputSchemasPerTableCacheMaximumSize, - tableCreator, - caseSensitive, - dropUnusedColumns)) + new DynamicTableUpdateOperator(catalogLoader, tableCreator, flinkDynamicSinkConf)) .uid(prefixIfNotNull(uidPrefix, "-updater")) .name(operatorName("Updater")) .returns(type) @@ -545,6 +542,12 @@ public DataStreamSink append() { return result; } + + private Configuration fromReadableConfig() { + return readableConfig instanceof Configuration + ? 
(Configuration) readableConfig + : Configuration.fromMap(readableConfig.toMap()); + } } DataStream distributeDataStream(DataStream input) { diff --git a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecord.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecord.java index 15b83a589382..6507a575c2af 100644 --- a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecord.java +++ b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecord.java @@ -20,6 +20,7 @@ import java.util.Set; import javax.annotation.Nullable; +import org.apache.flink.annotation.Internal; import org.apache.flink.table.data.RowData; import org.apache.iceberg.DistributionMode; import org.apache.iceberg.PartitionSpec; @@ -39,6 +40,9 @@ public class DynamicRecord { private boolean upsertMode; @Nullable private Set equalityFields; + @Internal + DynamicRecord() {} + /** * Constructs a new DynamicRecord with forward (no shuffle) writes. 
* diff --git a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordProcessor.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordProcessor.java index fc6892b2cd9e..c752b8e9b8d9 100644 --- a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordProcessor.java +++ b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordProcessor.java @@ -18,10 +18,12 @@ */ package org.apache.iceberg.flink.sink.dynamic; +import java.util.Map; import org.apache.flink.annotation.Internal; import org.apache.flink.annotation.VisibleForTesting; import org.apache.flink.api.common.functions.OpenContext; import org.apache.flink.api.java.tuple.Tuple2; +import org.apache.flink.configuration.Configuration; import org.apache.flink.streaming.api.functions.ProcessFunction; import org.apache.flink.table.data.RowData; import org.apache.flink.util.Collector; @@ -30,6 +32,7 @@ import org.apache.iceberg.Schema; import org.apache.iceberg.catalog.Catalog; import org.apache.iceberg.flink.CatalogLoader; +import org.apache.iceberg.flink.FlinkWriteConf; @Internal class DynamicRecordProcessor extends ProcessFunction @@ -41,6 +44,8 @@ class DynamicRecordProcessor extends ProcessFunction generator; private final CatalogLoader catalogLoader; + private final Map writeProperties; + private final Configuration flinkConfig; private final boolean immediateUpdate; private final boolean dropUnusedColumns; private final int cacheMaximumSize; @@ -55,27 +60,27 @@ class DynamicRecordProcessor extends ProcessFunction updateStream; private transient OutputTag forwardStream; private transient Collector collector; + private transient DynamicRecordWithConfig dynamicRecordWithConfig; private transient Context context; DynamicRecordProcessor( DynamicRecordGenerator generator, CatalogLoader catalogLoader, - boolean immediateUpdate, - int cacheMaximumSize, - long cacheRefreshMs, - int 
inputSchemasPerTableCacheMaximumSize, TableCreator tableCreator, - boolean caseSensitive, - boolean dropUnusedColumns) { + FlinkDynamicSinkConf sinkConfig, + Map writeProperties, + Configuration flinkConfig) { this.generator = generator; this.catalogLoader = catalogLoader; - this.immediateUpdate = immediateUpdate; - this.cacheMaximumSize = cacheMaximumSize; - this.cacheRefreshMs = cacheRefreshMs; - this.inputSchemasPerTableCacheMaximumSize = inputSchemasPerTableCacheMaximumSize; + this.flinkConfig = flinkConfig; + this.writeProperties = writeProperties; + this.immediateUpdate = sinkConfig.immediateTableUpdate(); + this.cacheMaximumSize = sinkConfig.cacheMaxSize(); + this.cacheRefreshMs = sinkConfig.cacheRefreshMs(); + this.inputSchemasPerTableCacheMaximumSize = sinkConfig.inputSchemasPerTableCacheMaxSize(); this.tableCreator = tableCreator; - this.caseSensitive = caseSensitive; - this.dropUnusedColumns = dropUnusedColumns; + this.caseSensitive = sinkConfig.caseSensitive(); + this.dropUnusedColumns = sinkConfig.dropUnusedColumns(); } @Override @@ -107,6 +112,8 @@ public void open(OpenContext openContext) throws Exception { new DynamicRecordInternalType(catalogLoader, true, cacheMaximumSize)) {}; } + this.dynamicRecordWithConfig = + new DynamicRecordWithConfig(new FlinkWriteConf(writeProperties, flinkConfig)); generator.open(openContext); } @@ -119,9 +126,10 @@ public void processElement(T element, Context ctx, Collector 0 || defaultWriteParallelism == null) { + return originalParallelism; + } + + return defaultWriteParallelism; + } + + @Override + public TableIdentifier tableIdentifier() { + return wrapped.tableIdentifier(); + } + + @Override + public Schema schema() { + return wrapped.schema(); + } + + @Override + public PartitionSpec spec() { + return wrapped.spec(); + } + + @Override + public RowData rowData() { + return wrapped.rowData(); + } + + @Override + public boolean upsertMode() { + return wrapped.upsertMode(); + } + + @Override + public Set 
equalityFields() { + return wrapped.equalityFields(); + } +} diff --git a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicTableUpdateOperator.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicTableUpdateOperator.java index 456f20adf59f..93c268ff86ad 100644 --- a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicTableUpdateOperator.java +++ b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicTableUpdateOperator.java @@ -48,20 +48,15 @@ class DynamicTableUpdateOperator private transient TableUpdater updater; DynamicTableUpdateOperator( - CatalogLoader catalogLoader, - int cacheMaximumSize, - long cacheRefreshMs, - int inputSchemasPerTableCacheMaximumSize, - TableCreator tableCreator, - boolean caseSensitive, - boolean dropUnusedColumns) { + CatalogLoader catalogLoader, TableCreator tableCreator, FlinkDynamicSinkConf configuration) { this.catalogLoader = catalogLoader; - this.cacheMaximumSize = cacheMaximumSize; - this.cacheRefreshMs = cacheRefreshMs; - this.inputSchemasPerTableCacheMaximumSize = inputSchemasPerTableCacheMaximumSize; this.tableCreator = tableCreator; - this.caseSensitive = caseSensitive; - this.dropUnusedColumns = dropUnusedColumns; + + this.cacheMaximumSize = configuration.cacheMaxSize(); + this.cacheRefreshMs = configuration.cacheRefreshMs(); + this.inputSchemasPerTableCacheMaximumSize = configuration.inputSchemasPerTableCacheMaxSize(); + this.caseSensitive = configuration.caseSensitive(); + this.dropUnusedColumns = configuration.dropUnusedColumns(); } @Override diff --git a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/FlinkDynamicSinkConf.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/FlinkDynamicSinkConf.java new file mode 100644 index 000000000000..75b169c4b533 --- /dev/null +++ b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/FlinkDynamicSinkConf.java @@ 
-0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.dynamic; + +import java.util.Map; +import org.apache.flink.configuration.ReadableConfig; +import org.apache.iceberg.flink.FlinkConfParser; + +/** + * A class for common Dynamic Iceberg sink configs for Flink writes. + * + *

      If a config is set at multiple levels, the following order of precedence is used (top to + * bottom): + * + *

        + *
      1. Write options + *
      2. Flink ReadableConfig + *
      3. Default values + *
      + * + * The most specific value is set in write options and takes precedence over all other configs. If + * no write option is provided, this class checks the flink configuration for any overrides. If no + * applicable value is found in the write options, this class uses the default values. + */ +class FlinkDynamicSinkConf { + + private final FlinkConfParser confParser; + + FlinkDynamicSinkConf(Map writeOptions, ReadableConfig readableConfig) { + this.confParser = new FlinkConfParser(writeOptions, readableConfig); + } + + int cacheMaxSize() { + return confParser + .intConf() + .option(FlinkDynamicSinkOptions.CACHE_MAX_SIZE.key()) + .flinkConfig(FlinkDynamicSinkOptions.CACHE_MAX_SIZE) + .defaultValue(FlinkDynamicSinkOptions.CACHE_MAX_SIZE.defaultValue()) + .parse(); + } + + boolean immediateTableUpdate() { + return confParser + .booleanConf() + .option(FlinkDynamicSinkOptions.IMMEDIATE_TABLE_UPDATE.key()) + .flinkConfig(FlinkDynamicSinkOptions.IMMEDIATE_TABLE_UPDATE) + .defaultValue(FlinkDynamicSinkOptions.IMMEDIATE_TABLE_UPDATE.defaultValue()) + .parse(); + } + + boolean dropUnusedColumns() { + return confParser + .booleanConf() + .option(FlinkDynamicSinkOptions.DROP_UNUSED_COLUMNS.key()) + .flinkConfig(FlinkDynamicSinkOptions.DROP_UNUSED_COLUMNS) + .defaultValue(FlinkDynamicSinkOptions.DROP_UNUSED_COLUMNS.defaultValue()) + .parse(); + } + + long cacheRefreshMs() { + return confParser + .longConf() + .option(FlinkDynamicSinkOptions.CACHE_REFRESH_MS.key()) + .flinkConfig(FlinkDynamicSinkOptions.CACHE_REFRESH_MS) + .defaultValue(FlinkDynamicSinkOptions.CACHE_REFRESH_MS.defaultValue()) + .parse(); + } + + int inputSchemasPerTableCacheMaxSize() { + return confParser + .intConf() + .option(FlinkDynamicSinkOptions.INPUT_SCHEMAS_PER_TABLE_CACHE_MAX_SIZE.key()) + .flinkConfig(FlinkDynamicSinkOptions.INPUT_SCHEMAS_PER_TABLE_CACHE_MAX_SIZE) + .defaultValue(FlinkDynamicSinkOptions.INPUT_SCHEMAS_PER_TABLE_CACHE_MAX_SIZE.defaultValue()) + .parse(); + } + + boolean 
caseSensitive() { + return confParser + .booleanConf() + .option(FlinkDynamicSinkOptions.CASE_SENSITIVE.key()) + .flinkConfig(FlinkDynamicSinkOptions.CASE_SENSITIVE) + .defaultValue(FlinkDynamicSinkOptions.CASE_SENSITIVE.defaultValue()) + .parse(); + } +} diff --git a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/FlinkDynamicSinkOptions.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/FlinkDynamicSinkOptions.java new file mode 100644 index 000000000000..7a4f038219d9 --- /dev/null +++ b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/FlinkDynamicSinkOptions.java @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.dynamic; + +import org.apache.flink.annotation.Experimental; +import org.apache.flink.configuration.ConfigOption; +import org.apache.flink.configuration.ConfigOptions; + +@Experimental +public class FlinkDynamicSinkOptions { + + private FlinkDynamicSinkOptions() {} + + public static final ConfigOption CACHE_MAX_SIZE = + ConfigOptions.key("dynamic-sink.cache-max-size") + .intType() + .defaultValue(100) + .withDescription( + "Maximum size of the caches used in Dynamic Sink for table data and serializers."); + + public static final ConfigOption IMMEDIATE_TABLE_UPDATE = + ConfigOptions.key("dynamic-sink.immediate-table-update") + .booleanType() + .defaultValue(false) + .withDescription( + "Controls whether table schema and partition updates should be applied immediately in Dynamic Sink."); + + public static final ConfigOption DROP_UNUSED_COLUMNS = + ConfigOptions.key("dynamic-sink.drop-unused-columns") + .booleanType() + .defaultValue(false) + .withDescription( + "Allows dropping unused columns during schema evolution in Dynamic Sink."); + + public static final ConfigOption CACHE_REFRESH_MS = + ConfigOptions.key("dynamic-sink.cache-refresh-ms") + .longType() + .defaultValue(1_000L) + .withDescription( + "Cache refresh interval for dynamic table metadata in Dynamic Sink in milliseconds."); + + public static final ConfigOption INPUT_SCHEMAS_PER_TABLE_CACHE_MAX_SIZE = + ConfigOptions.key("dynamic-sink.input-schemas-per-table-cache-max-size") + .intType() + .defaultValue(10) + .withDescription( + "Maximum input schema objects to cache per each table in Dynamic Sink for performance."); + + public static final ConfigOption CASE_SENSITIVE = + ConfigOptions.key("dynamic-sink.case-sensitive") + .booleanType() + .defaultValue(true) + .withDescription( + "Controls whether schema field name matching should be case-sensitive in Dynamic Sink."); +} diff --git 
a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/HashKeyGenerator.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/HashKeyGenerator.java index fca45bf882e0..61a850212bf4 100644 --- a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/HashKeyGenerator.java +++ b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/HashKeyGenerator.java @@ -88,7 +88,7 @@ int generateKey( dynamicRecord.schema(), dynamicRecord.spec(), dynamicRecord.equalityFields(), - MoreObjects.firstNonNull(dynamicRecord.distributionMode(), DistributionMode.NONE), + dynamicRecord.distributionMode(), Math.min(dynamicRecord.writeParallelism(), maxWriteParallelism)); KeySelector keySelector = keySelectorCache.computeIfAbsent( @@ -98,8 +98,7 @@ int generateKey( tableIdent, MoreObjects.firstNonNull(tableSchema, dynamicRecord.schema()), MoreObjects.firstNonNull(tableSpec, dynamicRecord.spec()), - MoreObjects.firstNonNull( - dynamicRecord.distributionMode(), DistributionMode.NONE), + dynamicRecord.distributionMode(), MoreObjects.firstNonNull( dynamicRecord.equalityFields(), Collections.emptySet()), Math.min(dynamicRecord.writeParallelism(), maxWriteParallelism))); diff --git a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSink.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSink.java index ecdbc3128525..bafd0276b7ce 100644 --- a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSink.java +++ b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSink.java @@ -77,6 +77,7 @@ import org.apache.iceberg.flink.CatalogLoader; import org.apache.iceberg.flink.FlinkSchemaUtil; import org.apache.iceberg.flink.FlinkWriteConf; +import org.apache.iceberg.flink.FlinkWriteOptions; import org.apache.iceberg.flink.MiniFlinkClusterExtension; import 
org.apache.iceberg.flink.SimpleDataUtil; import org.apache.iceberg.flink.TestHelpers; @@ -85,6 +86,7 @@ import org.apache.iceberg.flink.sink.dynamic.TestDynamicCommitter.FailBeforeAndAfterCommit; import org.apache.iceberg.inmemory.InMemoryInputFile; import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Iterables; @@ -121,6 +123,7 @@ private static class DynamicIcebergDataImpl implements Serializable { PartitionSpec partitionSpec; boolean upsertMode; Set equalityFields; + int writeParallelism; private DynamicIcebergDataImpl( Schema schemaProvided, String tableName, String branch, PartitionSpec partitionSpec) { @@ -132,7 +135,8 @@ private DynamicIcebergDataImpl( partitionSpec, false, Collections.emptySet(), - false); + false, + 10); } private DynamicIcebergDataImpl( @@ -149,7 +153,8 @@ private DynamicIcebergDataImpl( partitionSpec, false, Collections.emptySet(), - false); + false, + 10); } private DynamicIcebergDataImpl( @@ -168,7 +173,8 @@ private DynamicIcebergDataImpl( partitionSpec, upsertMode, equalityFields, - isDuplicate); + isDuplicate, + 10); } private DynamicIcebergDataImpl( @@ -179,7 +185,8 @@ private DynamicIcebergDataImpl( PartitionSpec partitionSpec, boolean upsertMode, Set equalityFields, - boolean isDuplicate) { + boolean isDuplicate, + int writeParallelism) { this.rowProvided = randomRow(schemaProvided, isDuplicate ? seed : ++seed); this.rowExpected = isDuplicate ? 
null : rowProvided; this.schemaProvided = schemaProvided; @@ -189,6 +196,7 @@ private DynamicIcebergDataImpl( this.partitionSpec = partitionSpec; this.upsertMode = upsertMode; this.equalityFields = equalityFields; + this.writeParallelism = writeParallelism; } } @@ -208,7 +216,7 @@ public void generate(DynamicIcebergDataImpl row, Collector out) { converter(schema).toInternal(row.rowProvided), spec, spec.isPartitioned() ? DistributionMode.HASH : DistributionMode.NONE, - 10); + row.writeParallelism); dynamicRecord.setUpsertMode(row.upsertMode); dynamicRecord.setEqualityFields(row.equalityFields); out.collect(dynamicRecord); @@ -369,6 +377,19 @@ private void runForwardWriteTest(DynamicRecordGenerator verifyResults(rows); } + @Test + void testWriteWithNullBranch() throws Exception { + List rows = + Lists.newArrayList( + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, "t1", null, PartitionSpec.unpartitioned()), + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, "t1", null, PartitionSpec.unpartitioned())); + + runTest( + rows, this.env, false, 1, ImmutableMap.of(FlinkWriteOptions.BRANCH.key(), "test-branch")); + } + @Test void testWritePartitioned() throws Exception { PartitionSpec spec = PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).bucket("id", 10).build(); @@ -1357,6 +1378,35 @@ void testGeneratorDefaultParallelism() { assertThat(generatorParallelism).isEqualTo(source.getParallelism()); } + @Test + void testFallBackParallelismFromConfig() throws Exception { + List rows = + Lists.newArrayList( + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, + SimpleDataUtil.SCHEMA, + "t1", + SnapshotRef.MAIN_BRANCH, + PartitionSpec.unpartitioned(), + false, + Collections.emptySet(), + false, + -1), + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, + SimpleDataUtil.SCHEMA, + "t1", + SnapshotRef.MAIN_BRANCH, + PartitionSpec.unpartitioned(), + false, + Collections.emptySet(), + false, + 0)); + + runTest( + rows, this.env, true, 2, 
ImmutableMap.of(FlinkWriteOptions.WRITE_PARALLELISM.key(), "1")); + } + private Set createSinkAndReturnUIds(String uidPrefix) { StreamExecutionEnvironment streamEnv = StreamExecutionEnvironment.getExecutionEnvironment(); @@ -1465,6 +1515,18 @@ private void runTest( verifyResults(dynamicData); } + private void runTest( + List dynamicData, + StreamExecutionEnvironment env, + boolean immediateUpdate, + int parallelism, + Map writeProperties) + throws Exception { + executeDynamicSink( + dynamicData, env, immediateUpdate, parallelism, null, false, writeProperties); + verifyResults(dynamicData, writeProperties); + } + private void executeDynamicSink( List dynamicData, StreamExecutionEnvironment env, @@ -1472,7 +1534,8 @@ private void executeDynamicSink( int parallelism, @Nullable CommitHook commitHook) throws Exception { - executeDynamicSink(dynamicData, env, immediateUpdate, parallelism, commitHook, false); + executeDynamicSink( + dynamicData, env, immediateUpdate, parallelism, commitHook, false, Maps.newHashMap()); } private void executeDynamicSink( @@ -1483,6 +1546,19 @@ private void executeDynamicSink( @Nullable CommitHook commitHook, boolean overwrite) throws Exception { + executeDynamicSink( + dynamicData, env, immediateUpdate, parallelism, commitHook, overwrite, Maps.newHashMap()); + } + + private void executeDynamicSink( + List dynamicData, + StreamExecutionEnvironment env, + boolean immediateUpdate, + int parallelism, + @Nullable CommitHook commitHook, + boolean overwrite, + Map writeProperties) + throws Exception { DataStream dataStream = env.fromData(dynamicData, TypeInformation.of(new TypeHint<>() {})); env.setParallelism(parallelism); @@ -1496,6 +1572,7 @@ private void executeDynamicSink( .immediateTableUpdate(immediateUpdate) .setSnapshotProperty("commit.retry.num-retries", "0") .overwrite(overwrite) + .setAll(writeProperties) .append(); } else { DynamicIcebergSink.forInput(dataStream) @@ -1504,6 +1581,7 @@ private void executeDynamicSink( 
.writeParallelism(parallelism) .immediateTableUpdate(immediateUpdate) .overwrite(overwrite) + .setAll(writeProperties) .append(); } @@ -1530,7 +1608,6 @@ DynamicIcebergSink instantiateSink( "uidPrefix", writeProperties, flinkConfig, - 100, forwardWriteResults); } } @@ -1547,7 +1624,6 @@ static class CommitHookDynamicIcebergSink extends DynamicIcebergSink { String uidPrefix, Map writeProperties, Configuration flinkConfig, - int cacheMaximumSize, DataStream> forwardWritten) { super( catalogLoader, @@ -1555,7 +1631,7 @@ static class CommitHookDynamicIcebergSink extends DynamicIcebergSink { uidPrefix, writeProperties, flinkConfig, - cacheMaximumSize, + 100, forwardWritten); this.commitHook = commitHook; this.overwriteMode = new FlinkWriteConf(writeProperties, flinkConfig).overwriteMode(); @@ -1575,6 +1651,12 @@ public Committer createCommitter(CommitterInitContext contex } private void verifyResults(List dynamicData) throws IOException { + verifyResults(dynamicData, Maps.newHashMap()); + } + + private void verifyResults( + List dynamicData, Map writeProperties) + throws IOException { // Calculate the expected result Map, List> expectedData = Maps.newHashMap(); Map expectedSchema = Maps.newHashMap(); @@ -1588,9 +1670,12 @@ private void verifyResults(List dynamicData) throws IOEx dynamicData.forEach( r -> { + String branch = + MoreObjects.firstNonNull( + r.branch, writeProperties.get(FlinkWriteOptions.BRANCH.key())); List data = expectedData.computeIfAbsent( - Tuple2.of(r.tableName, r.branch), unused -> Lists.newArrayList()); + Tuple2.of(r.tableName, branch), unused -> Lists.newArrayList()); data.addAll( convertToRowData(expectedSchema.get(r.tableName), ImmutableList.of(r.rowExpected))); }); diff --git a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicRecordWithConfig.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicRecordWithConfig.java new file mode 100644 index 000000000000..de55621475ed --- 
/dev/null +++ b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicRecordWithConfig.java @@ -0,0 +1,120 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.dynamic; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.Collections; +import java.util.Set; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.iceberg.DistributionMode; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SnapshotRef; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.flink.FlinkWriteConf; +import org.apache.iceberg.flink.FlinkWriteOptions; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.Test; + +class TestDynamicRecordWithConfig { + + private static final Schema SCHEMA = + new Schema( + Types.NestedField.required(1, "id", 
Types.IntegerType.get()), + Types.NestedField.required(2, "data", Types.StringType.get())); + + private static final TableIdentifier TABLE_IDENTIFIER = TableIdentifier.of("db", "table"); + private static final PartitionSpec UNPARTITIONED = PartitionSpec.unpartitioned(); + private static final RowData ROW_DATA = GenericRowData.of(1, StringData.fromString("test")); + + @Test + void testBranchFallBack() { + String defaultBranch = "default-branch"; + FlinkWriteConf conf = + new FlinkWriteConf( + ImmutableMap.of(FlinkWriteOptions.BRANCH.key(), defaultBranch), new Configuration()); + DynamicRecordWithConfig dynamicRecordWithConfig = new DynamicRecordWithConfig(conf); + + DynamicRecord dynamicRecord = + new DynamicRecord(TABLE_IDENTIFIER, null, SCHEMA, ROW_DATA, UNPARTITIONED); + assertThat(dynamicRecordWithConfig.wrap(dynamicRecord).branch()).isEqualTo(defaultBranch); + + String customBranch = "custom-branch"; + dynamicRecord.setBranch(customBranch); + assertThat(dynamicRecordWithConfig.wrap(dynamicRecord).branch()).isEqualTo(customBranch); + } + + @Test + void testWriteParallelismFallBack() { + int defaultParallelism = 4; + FlinkWriteConf conf = + new FlinkWriteConf( + ImmutableMap.of( + FlinkWriteOptions.WRITE_PARALLELISM.key(), String.valueOf(defaultParallelism)), + new Configuration()); + DynamicRecordWithConfig dynamicRecordWithConfig = new DynamicRecordWithConfig(conf); + + DynamicRecord dynamicRecord = + new DynamicRecord(TABLE_IDENTIFIER, null, SCHEMA, ROW_DATA, UNPARTITIONED, null, -1); + assertThat(dynamicRecordWithConfig.wrap(dynamicRecord).writeParallelism()) + .isEqualTo(defaultParallelism); + + dynamicRecord.writeParallelism(0); + assertThat(dynamicRecordWithConfig.wrap(dynamicRecord).writeParallelism()) + .isEqualTo(defaultParallelism); + + dynamicRecord.writeParallelism(8); + assertThat(dynamicRecordWithConfig.wrap(dynamicRecord).writeParallelism()).isEqualTo(8); + } + + @Test + void testDelegatesToWrappedRecord() { + FlinkWriteConf conf = new 
FlinkWriteConf(Collections.emptyMap(), new Configuration()); + PartitionSpec partitioned = PartitionSpec.builderFor(SCHEMA).identity("id").build(); + Set equalityFields = ImmutableSet.of("id", "data"); + + DynamicRecord dynamicRecord = + new DynamicRecord( + TABLE_IDENTIFIER, + SnapshotRef.MAIN_BRANCH, + SCHEMA, + ROW_DATA, + partitioned, + DistributionMode.HASH, + 2); + dynamicRecord.setUpsertMode(true); + dynamicRecord.setEqualityFields(equalityFields); + + DynamicRecordWithConfig record = new DynamicRecordWithConfig(conf).wrap(dynamicRecord); + + assertThat(record.tableIdentifier()).isEqualTo(TABLE_IDENTIFIER); + assertThat(record.schema()).isEqualTo(SCHEMA); + assertThat(record.spec()).isEqualTo(partitioned); + assertThat(record.rowData()).isSameAs(ROW_DATA); + assertThat(record.distributionMode()).isEqualTo(DistributionMode.HASH); + assertThat(record.upsertMode()).isTrue(); + assertThat(record.equalityFields()).isEqualTo(equalityFields); + } +} diff --git a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicTableUpdateOperator.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicTableUpdateOperator.java index 5d5a12418037..fdc12951264e 100644 --- a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicTableUpdateOperator.java +++ b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicTableUpdateOperator.java @@ -24,12 +24,14 @@ import java.util.Collections; import org.apache.flink.api.common.functions.OpenContext; +import org.apache.flink.configuration.Configuration; import org.apache.flink.table.data.GenericRowData; import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Schema; import org.apache.iceberg.catalog.Catalog; import org.apache.iceberg.catalog.TableIdentifier; import org.apache.iceberg.flink.HadoopCatalogExtension; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import 
org.apache.iceberg.types.Types; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.extension.RegisterExtension; @@ -58,9 +60,6 @@ class TestDynamicTableUpdateOperator { @Test void testDynamicTableUpdateOperatorNewTable() throws Exception { - int cacheMaximumSize = 10; - int cacheRefreshMs = 1000; - int inputSchemaCacheMaximumSize = 10; Catalog catalog = CATALOG_EXTENSION.catalog(); TableIdentifier table = TableIdentifier.of(TABLE); @@ -68,12 +67,8 @@ void testDynamicTableUpdateOperatorNewTable() throws Exception { DynamicTableUpdateOperator operator = new DynamicTableUpdateOperator( CATALOG_EXTENSION.catalogLoader(), - cacheMaximumSize, - cacheRefreshMs, - inputSchemaCacheMaximumSize, TableCreator.DEFAULT, - CASE_SENSITIVE, - PRESERVE_COLUMNS); + flinkDynamicSinkConfiguration(CASE_SENSITIVE, PRESERVE_COLUMNS)); operator.open((OpenContext) null); DynamicRecordInternal input = @@ -94,21 +89,14 @@ void testDynamicTableUpdateOperatorNewTable() throws Exception { @Test void testDynamicTableUpdateOperatorSchemaChange() throws Exception { - int cacheMaximumSize = 10; - int cacheRefreshMs = 1000; - int inputSchemaCacheMaximumSize = 10; Catalog catalog = CATALOG_EXTENSION.catalog(); TableIdentifier table = TableIdentifier.of(TABLE); DynamicTableUpdateOperator operator = new DynamicTableUpdateOperator( CATALOG_EXTENSION.catalogLoader(), - cacheMaximumSize, - cacheRefreshMs, - inputSchemaCacheMaximumSize, TableCreator.DEFAULT, - CASE_SENSITIVE, - PRESERVE_COLUMNS); + flinkDynamicSinkConfiguration(CASE_SENSITIVE, PRESERVE_COLUMNS)); operator.open((OpenContext) null); catalog.createTable(table, SCHEMA1); @@ -136,9 +124,6 @@ void testDynamicTableUpdateOperatorSchemaChange() throws Exception { @ParameterizedTest @ValueSource(booleans = {true, false}) void testCaseInSensitivity(boolean caseSensitive) throws Exception { - int cacheMaximumSize = 10; - int cacheRefreshMs = 1000; - int inputSchemaCacheMaximumSize = 10; Catalog catalog = CATALOG_EXTENSION.catalog(); 
TableIdentifier table = TableIdentifier.of(TABLE); @@ -149,12 +134,8 @@ void testCaseInSensitivity(boolean caseSensitive) throws Exception { DynamicTableUpdateOperator operator = new DynamicTableUpdateOperator( CATALOG_EXTENSION.catalogLoader(), - cacheMaximumSize, - cacheRefreshMs, - inputSchemaCacheMaximumSize, TableCreator.DEFAULT, - caseSensitive, - PRESERVE_COLUMNS); + flinkDynamicSinkConfiguration(caseSensitive, PRESERVE_COLUMNS)); operator.open((OpenContext) null); catalog.createTable(table, initialSchema); @@ -188,21 +169,14 @@ void testCaseInSensitivity(boolean caseSensitive) throws Exception { @Test void testDynamicTableUpdateOperatorPreserveUnusedColumns() throws Exception { - int cacheMaximumSize = 10; - int cacheRefreshMs = 1000; - int inputSchemaCacheMaximumSize = 10; Catalog catalog = CATALOG_EXTENSION.catalog(); TableIdentifier table = TableIdentifier.of(TABLE); DynamicTableUpdateOperator operator = new DynamicTableUpdateOperator( CATALOG_EXTENSION.catalogLoader(), - cacheMaximumSize, - cacheRefreshMs, - inputSchemaCacheMaximumSize, TableCreator.DEFAULT, - CASE_SENSITIVE, - PRESERVE_COLUMNS); + flinkDynamicSinkConfiguration(CASE_SENSITIVE, PRESERVE_COLUMNS)); operator.open((OpenContext) null); catalog.createTable(table, SCHEMA2); @@ -229,21 +203,14 @@ void testDynamicTableUpdateOperatorPreserveUnusedColumns() throws Exception { @Test void testDynamicTableUpdateOperatorDropUnusedColumns() throws Exception { - int cacheMaximumSize = 10; - int cacheRefreshMs = 1000; - int inputSchemaCacheMaximumSize = 10; Catalog catalog = CATALOG_EXTENSION.catalog(); TableIdentifier table = TableIdentifier.of(TABLE); DynamicTableUpdateOperator operator = new DynamicTableUpdateOperator( CATALOG_EXTENSION.catalogLoader(), - cacheMaximumSize, - cacheRefreshMs, - inputSchemaCacheMaximumSize, TableCreator.DEFAULT, - CASE_INSENSITIVE, - DROP_COLUMNS); + flinkDynamicSinkConfiguration(CASE_INSENSITIVE, DROP_COLUMNS)); operator.open((OpenContext) null); 
catalog.createTable(table, SCHEMA2); @@ -266,4 +233,13 @@ void testDynamicTableUpdateOperatorDropUnusedColumns() throws Exception { assertThat(tableSchema.findField("data")).isNull(); assertThat(input).isEqualTo(output); } + + private static FlinkDynamicSinkConf flinkDynamicSinkConfiguration( + boolean caseSensitive, boolean dropUnusedColumns) { + return new FlinkDynamicSinkConf( + ImmutableMap.of( + FlinkDynamicSinkOptions.CASE_SENSITIVE.key(), String.valueOf(caseSensitive), + FlinkDynamicSinkOptions.DROP_UNUSED_COLUMNS.key(), String.valueOf(dropUnusedColumns)), + new Configuration()); + } } diff --git a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestHashKeyGenerator.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestHashKeyGenerator.java index c65f96b12cbb..9a485fafaf47 100644 --- a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestHashKeyGenerator.java +++ b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestHashKeyGenerator.java @@ -25,6 +25,7 @@ import java.util.Map; import java.util.Set; import org.apache.flink.api.java.functions.KeySelector; +import org.apache.flink.configuration.Configuration; import org.apache.flink.runtime.state.KeyGroupRangeAssignment; import org.apache.flink.table.data.GenericRowData; import org.apache.flink.table.data.RowData; @@ -34,6 +35,9 @@ import org.apache.iceberg.Schema; import org.apache.iceberg.SnapshotRef; import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.flink.FlinkWriteConf; +import org.apache.iceberg.flink.FlinkWriteOptions; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Sets; import org.apache.iceberg.types.Types; import org.junit.jupiter.api.Test; @@ -229,6 +233,38 @@ void testFailOnNonPositiveWriteParallelism() { }); } + @Test + void testNonPositiveWriteParallelismConfigFallback() throws Exception { 
+ int maxWriteParallelism = 5; + HashKeyGenerator generator = new HashKeyGenerator(16, maxWriteParallelism); + PartitionSpec unpartitioned = PartitionSpec.unpartitioned(); + FlinkWriteConf flinkWriteConf = + new FlinkWriteConf( + ImmutableMap.of(FlinkWriteOptions.WRITE_PARALLELISM.key(), "2"), new Configuration()); + + Set writeKeys = Sets.newHashSet(); + for (int i = 0; i < 20; i++) { + GenericRowData row = GenericRowData.of(i, StringData.fromString("z")); + writeKeys.add( + getWriteKey( + generator, + unpartitioned, + DistributionMode.NONE, + i % 2 == 0 ? 0 : -1, + Collections.emptySet(), + row, + flinkWriteConf)); + } + + assertThat(writeKeys).hasSize(2); + assertThat( + writeKeys.stream() + .map(key -> getSubTaskId(key, 2, maxWriteParallelism)) + .distinct() + .count()) + .isEqualTo(2); + } + @Test void testCapAtMaxWriteParallelism() throws Exception { int writeParallelism = 10; @@ -477,10 +513,31 @@ private static int getWriteKey( Set equalityFields, GenericRowData row) throws Exception { - DynamicRecord record = + return getWriteKey( + generator, + spec, + mode, + writeParallelism, + equalityFields, + row, + new FlinkWriteConf(Collections.emptyMap(), new Configuration())); + } + + private static int getWriteKey( + HashKeyGenerator generator, + PartitionSpec spec, + DistributionMode mode, + int writeParallelism, + Set equalityFields, + GenericRowData row, + FlinkWriteConf flinkWriteConf) + throws Exception { + DynamicRecord inputRecord = new DynamicRecord(TABLE_IDENTIFIER, BRANCH, SCHEMA, row, spec, mode, writeParallelism); - record.setEqualityFields(equalityFields); - return generator.generateKey(record); + inputRecord.setEqualityFields(equalityFields); + + DynamicRecordWithConfig dynamicRecordWithConfig = new DynamicRecordWithConfig(flinkWriteConf); + return generator.generateKey(dynamicRecordWithConfig.wrap(inputRecord)); } private static int getSubTaskId(int writeKey1, int writeParallelism, int maxWriteParallelism) { diff --git 
a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkConfParser.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkConfParser.java index e0672811cf5f..7661372c88e8 100644 --- a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkConfParser.java +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkConfParser.java @@ -44,7 +44,7 @@ public FlinkConfParser(Table table, Map options, ReadableConfig this.readableConfig = readableConfig; } - FlinkConfParser(Map options, ReadableConfig readableConfig) { + public FlinkConfParser(Map options, ReadableConfig readableConfig) { this.tableProperties = ImmutableMap.of(); this.options = options; this.readableConfig = readableConfig; diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java index 7b0de6fbe9e3..ad430cbf13f8 100644 --- a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicIcebergSink.java @@ -235,12 +235,6 @@ public static class Builder { private final Map snapshotSummary = Maps.newHashMap(); private ReadableConfig readableConfig = new Configuration(); private TableCreator tableCreator = TableCreator.DEFAULT; - private boolean immediateUpdate = false; - private boolean dropUnusedColumns = false; - private int cacheMaximumSize = 100; - private long cacheRefreshMs = 1_000; - private int inputSchemasPerTableCacheMaximumSize = 10; - private boolean caseSensitive = true; Builder() {} @@ -361,7 +355,9 @@ public Builder toBranch(String branch) { } public Builder immediateTableUpdate(boolean newImmediateUpdate) { - this.immediateUpdate = newImmediateUpdate; + writeOptions.put( + FlinkDynamicSinkOptions.IMMEDIATE_TABLE_UPDATE.key(), + Boolean.toString(newImmediateUpdate)); return this; } @@ -377,19 +373,21 @@ public 
Builder immediateTableUpdate(boolean newImmediateUpdate) { * will never return data of the old column. */ public Builder dropUnusedColumns(boolean newDropUnusedColumns) { - this.dropUnusedColumns = newDropUnusedColumns; + writeOptions.put( + FlinkDynamicSinkOptions.DROP_UNUSED_COLUMNS.key(), + Boolean.toString(newDropUnusedColumns)); return this; } /** Maximum size of the caches used in Dynamic Sink for table data and serializers. */ public Builder cacheMaxSize(int maxSize) { - this.cacheMaximumSize = maxSize; + writeOptions.put(FlinkDynamicSinkOptions.CACHE_MAX_SIZE.key(), Integer.toString(maxSize)); return this; } /** Maximum interval for cache items renewals. */ public Builder cacheRefreshMs(long refreshMs) { - this.cacheRefreshMs = refreshMs; + writeOptions.put(FlinkDynamicSinkOptions.CACHE_REFRESH_MS.key(), Long.toString(refreshMs)); return this; } @@ -399,7 +397,9 @@ public Builder cacheRefreshMs(long refreshMs) { * comparison results. */ public Builder inputSchemasPerTableCacheMaxSize(int inputSchemasPerTableCacheMaxSize) { - this.inputSchemasPerTableCacheMaximumSize = inputSchemasPerTableCacheMaxSize; + writeOptions.put( + FlinkDynamicSinkOptions.INPUT_SCHEMAS_PER_TABLE_CACHE_MAX_SIZE.key(), + Integer.toString(inputSchemasPerTableCacheMaxSize)); return this; } @@ -408,7 +408,8 @@ public Builder inputSchemasPerTableCacheMaxSize(int inputSchemasPerTableCache * field names case-sensitive. */ public Builder caseSensitive(boolean newCaseSensitive) { - this.caseSensitive = newCaseSensitive; + writeOptions.put( + FlinkDynamicSinkOptions.CASE_SENSITIVE.key(), Boolean.toString(newCaseSensitive)); return this; } @@ -424,14 +425,14 @@ private DynamicIcebergSink build( generator != null, "Please use withGenerator() to convert the input DataStream."); Preconditions.checkNotNull(catalogLoader, "Catalog loader shouldn't be null"); - Configuration flinkConfig = - readableConfig instanceof Configuration - ? 
(Configuration) readableConfig - : Configuration.fromMap(readableConfig.toMap()); + Configuration flinkConfig = fromReadableConfig(); + FlinkDynamicSinkConf flinkDynamicSinkConf = + new FlinkDynamicSinkConf(writeOptions, flinkConfig); // Forward writer: chained with generator via forward edge, no data shuffle ForwardWriterSink forwardWriterSink = - new ForwardWriterSink(catalogLoader, writeOptions, flinkConfig, cacheMaximumSize); + new ForwardWriterSink( + catalogLoader, writeOptions, flinkConfig, flinkDynamicSinkConf.cacheMaxSize()); TypeInformation> writeResultTypeInfo = CommittableMessageTypeInfo.of(DynamicWriteResultSerializer::new); @@ -455,13 +456,15 @@ DynamicIcebergSink instantiateSink( Map writeProperties, Configuration flinkWriteConf, DataStream> forwardWriteResults) { + FlinkDynamicSinkConf flinkDynamicSinkConf = + new FlinkDynamicSinkConf(writeProperties, flinkWriteConf); return new DynamicIcebergSink( catalogLoader, snapshotSummary, uidPrefix, writeProperties, flinkWriteConf, - cacheMaximumSize, + flinkDynamicSinkConf.cacheMaxSize(), forwardWriteResults); } @@ -485,10 +488,14 @@ DynamicIcebergSink instantiateSink( public DataStreamSink append() { uidPrefix = Optional.ofNullable(uidPrefix).orElse(""); + FlinkDynamicSinkConf flinkDynamicSinkConf = + new FlinkDynamicSinkConf(writeOptions, readableConfig); + Configuration flinkConfig = fromReadableConfig(); + DynamicRecordInternalType type = - new DynamicRecordInternalType(catalogLoader, false, cacheMaximumSize); + new DynamicRecordInternalType(catalogLoader, false, flinkDynamicSinkConf.cacheMaxSize()); DynamicRecordInternalType sideOutputType = - new DynamicRecordInternalType(catalogLoader, true, cacheMaximumSize); + new DynamicRecordInternalType(catalogLoader, true, flinkDynamicSinkConf.cacheMaxSize()); SingleOutputStreamOperator converted = input @@ -496,13 +503,10 @@ public DataStreamSink append() { new DynamicRecordProcessor<>( generator, catalogLoader, - immediateUpdate, - cacheMaximumSize, - 
cacheRefreshMs, - inputSchemasPerTableCacheMaximumSize, tableCreator, - caseSensitive, - dropUnusedColumns)) + flinkDynamicSinkConf, + writeOptions, + flinkConfig)) .setParallelism(input.getParallelism()) .uid(prefixIfNotNull(uidPrefix, "-generator")) .name(operatorName("generator")) @@ -518,14 +522,7 @@ public DataStreamSink append() { DynamicRecordProcessor.DYNAMIC_TABLE_UPDATE_STREAM, sideOutputType)) .keyBy((KeySelector) DynamicRecordInternal::tableName) .map( - new DynamicTableUpdateOperator( - catalogLoader, - cacheMaximumSize, - cacheRefreshMs, - inputSchemasPerTableCacheMaximumSize, - tableCreator, - caseSensitive, - dropUnusedColumns)) + new DynamicTableUpdateOperator(catalogLoader, tableCreator, flinkDynamicSinkConf)) .uid(prefixIfNotNull(uidPrefix, "-updater")) .name(operatorName("Updater")) .returns(type) @@ -543,6 +540,12 @@ public DataStreamSink append() { return result; } + + private Configuration fromReadableConfig() { + return readableConfig instanceof Configuration + ? 
(Configuration) readableConfig + : Configuration.fromMap(readableConfig.toMap()); + } } DataStream distributeDataStream(DataStream input) { diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecord.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecord.java index 15b83a589382..6507a575c2af 100644 --- a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecord.java +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecord.java @@ -20,6 +20,7 @@ import java.util.Set; import javax.annotation.Nullable; +import org.apache.flink.annotation.Internal; import org.apache.flink.table.data.RowData; import org.apache.iceberg.DistributionMode; import org.apache.iceberg.PartitionSpec; @@ -39,6 +40,9 @@ public class DynamicRecord { private boolean upsertMode; @Nullable private Set equalityFields; + @Internal + DynamicRecord() {} + /** * Constructs a new DynamicRecord with forward (no shuffle) writes. 
* diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordProcessor.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordProcessor.java index fc6892b2cd9e..c752b8e9b8d9 100644 --- a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordProcessor.java +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicRecordProcessor.java @@ -18,10 +18,12 @@ */ package org.apache.iceberg.flink.sink.dynamic; +import java.util.Map; import org.apache.flink.annotation.Internal; import org.apache.flink.annotation.VisibleForTesting; import org.apache.flink.api.common.functions.OpenContext; import org.apache.flink.api.java.tuple.Tuple2; +import org.apache.flink.configuration.Configuration; import org.apache.flink.streaming.api.functions.ProcessFunction; import org.apache.flink.table.data.RowData; import org.apache.flink.util.Collector; @@ -30,6 +32,7 @@ import org.apache.iceberg.Schema; import org.apache.iceberg.catalog.Catalog; import org.apache.iceberg.flink.CatalogLoader; +import org.apache.iceberg.flink.FlinkWriteConf; @Internal class DynamicRecordProcessor extends ProcessFunction @@ -41,6 +44,8 @@ class DynamicRecordProcessor extends ProcessFunction generator; private final CatalogLoader catalogLoader; + private final Map writeProperties; + private final Configuration flinkConfig; private final boolean immediateUpdate; private final boolean dropUnusedColumns; private final int cacheMaximumSize; @@ -55,27 +60,27 @@ class DynamicRecordProcessor extends ProcessFunction updateStream; private transient OutputTag forwardStream; private transient Collector collector; + private transient DynamicRecordWithConfig dynamicRecordWithConfig; private transient Context context; DynamicRecordProcessor( DynamicRecordGenerator generator, CatalogLoader catalogLoader, - boolean immediateUpdate, - int cacheMaximumSize, - long cacheRefreshMs, - int 
inputSchemasPerTableCacheMaximumSize, TableCreator tableCreator, - boolean caseSensitive, - boolean dropUnusedColumns) { + FlinkDynamicSinkConf sinkConfig, + Map writeProperties, + Configuration flinkConfig) { this.generator = generator; this.catalogLoader = catalogLoader; - this.immediateUpdate = immediateUpdate; - this.cacheMaximumSize = cacheMaximumSize; - this.cacheRefreshMs = cacheRefreshMs; - this.inputSchemasPerTableCacheMaximumSize = inputSchemasPerTableCacheMaximumSize; + this.flinkConfig = flinkConfig; + this.writeProperties = writeProperties; + this.immediateUpdate = sinkConfig.immediateTableUpdate(); + this.cacheMaximumSize = sinkConfig.cacheMaxSize(); + this.cacheRefreshMs = sinkConfig.cacheRefreshMs(); + this.inputSchemasPerTableCacheMaximumSize = sinkConfig.inputSchemasPerTableCacheMaxSize(); this.tableCreator = tableCreator; - this.caseSensitive = caseSensitive; - this.dropUnusedColumns = dropUnusedColumns; + this.caseSensitive = sinkConfig.caseSensitive(); + this.dropUnusedColumns = sinkConfig.dropUnusedColumns(); } @Override @@ -107,6 +112,8 @@ public void open(OpenContext openContext) throws Exception { new DynamicRecordInternalType(catalogLoader, true, cacheMaximumSize)) {}; } + this.dynamicRecordWithConfig = + new DynamicRecordWithConfig(new FlinkWriteConf(writeProperties, flinkConfig)); generator.open(openContext); } @@ -119,9 +126,10 @@ public void processElement(T element, Context ctx, Collector 0 || defaultWriteParallelism == null) { + return originalParallelism; + } + + return defaultWriteParallelism; + } + + @Override + public TableIdentifier tableIdentifier() { + return wrapped.tableIdentifier(); + } + + @Override + public Schema schema() { + return wrapped.schema(); + } + + @Override + public PartitionSpec spec() { + return wrapped.spec(); + } + + @Override + public RowData rowData() { + return wrapped.rowData(); + } + + @Override + public boolean upsertMode() { + return wrapped.upsertMode(); + } + + @Override + public Set 
equalityFields() { + return wrapped.equalityFields(); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicTableUpdateOperator.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicTableUpdateOperator.java index 456f20adf59f..93c268ff86ad 100644 --- a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicTableUpdateOperator.java +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/DynamicTableUpdateOperator.java @@ -48,20 +48,15 @@ class DynamicTableUpdateOperator private transient TableUpdater updater; DynamicTableUpdateOperator( - CatalogLoader catalogLoader, - int cacheMaximumSize, - long cacheRefreshMs, - int inputSchemasPerTableCacheMaximumSize, - TableCreator tableCreator, - boolean caseSensitive, - boolean dropUnusedColumns) { + CatalogLoader catalogLoader, TableCreator tableCreator, FlinkDynamicSinkConf configuration) { this.catalogLoader = catalogLoader; - this.cacheMaximumSize = cacheMaximumSize; - this.cacheRefreshMs = cacheRefreshMs; - this.inputSchemasPerTableCacheMaximumSize = inputSchemasPerTableCacheMaximumSize; this.tableCreator = tableCreator; - this.caseSensitive = caseSensitive; - this.dropUnusedColumns = dropUnusedColumns; + + this.cacheMaximumSize = configuration.cacheMaxSize(); + this.cacheRefreshMs = configuration.cacheRefreshMs(); + this.inputSchemasPerTableCacheMaximumSize = configuration.inputSchemasPerTableCacheMaxSize(); + this.caseSensitive = configuration.caseSensitive(); + this.dropUnusedColumns = configuration.dropUnusedColumns(); } @Override diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/FlinkDynamicSinkConf.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/FlinkDynamicSinkConf.java new file mode 100644 index 000000000000..75b169c4b533 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/FlinkDynamicSinkConf.java @@ -0,0 
+1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.dynamic; + +import java.util.Map; +import org.apache.flink.configuration.ReadableConfig; +import org.apache.iceberg.flink.FlinkConfParser; + +/** + * A class for common Dynamic Iceberg sink configs for Flink writes. + * + *

      If a config is set at multiple levels, the following order of precedence is used (top to + * bottom): + * + *

        + *
      1. Write options + *
      2. Flink ReadableConfig + *
      3. Default values + *
      + * + * The most specific value is set in write options and takes precedence over all other configs. If + * no write option is provided, this class checks the flink configuration for any overrides. If no + * applicable value is found in the write options, this class uses the default values. + */ +class FlinkDynamicSinkConf { + + private final FlinkConfParser confParser; + + FlinkDynamicSinkConf(Map writeOptions, ReadableConfig readableConfig) { + this.confParser = new FlinkConfParser(writeOptions, readableConfig); + } + + int cacheMaxSize() { + return confParser + .intConf() + .option(FlinkDynamicSinkOptions.CACHE_MAX_SIZE.key()) + .flinkConfig(FlinkDynamicSinkOptions.CACHE_MAX_SIZE) + .defaultValue(FlinkDynamicSinkOptions.CACHE_MAX_SIZE.defaultValue()) + .parse(); + } + + boolean immediateTableUpdate() { + return confParser + .booleanConf() + .option(FlinkDynamicSinkOptions.IMMEDIATE_TABLE_UPDATE.key()) + .flinkConfig(FlinkDynamicSinkOptions.IMMEDIATE_TABLE_UPDATE) + .defaultValue(FlinkDynamicSinkOptions.IMMEDIATE_TABLE_UPDATE.defaultValue()) + .parse(); + } + + boolean dropUnusedColumns() { + return confParser + .booleanConf() + .option(FlinkDynamicSinkOptions.DROP_UNUSED_COLUMNS.key()) + .flinkConfig(FlinkDynamicSinkOptions.DROP_UNUSED_COLUMNS) + .defaultValue(FlinkDynamicSinkOptions.DROP_UNUSED_COLUMNS.defaultValue()) + .parse(); + } + + long cacheRefreshMs() { + return confParser + .longConf() + .option(FlinkDynamicSinkOptions.CACHE_REFRESH_MS.key()) + .flinkConfig(FlinkDynamicSinkOptions.CACHE_REFRESH_MS) + .defaultValue(FlinkDynamicSinkOptions.CACHE_REFRESH_MS.defaultValue()) + .parse(); + } + + int inputSchemasPerTableCacheMaxSize() { + return confParser + .intConf() + .option(FlinkDynamicSinkOptions.INPUT_SCHEMAS_PER_TABLE_CACHE_MAX_SIZE.key()) + .flinkConfig(FlinkDynamicSinkOptions.INPUT_SCHEMAS_PER_TABLE_CACHE_MAX_SIZE) + .defaultValue(FlinkDynamicSinkOptions.INPUT_SCHEMAS_PER_TABLE_CACHE_MAX_SIZE.defaultValue()) + .parse(); + } + + boolean 
caseSensitive() { + return confParser + .booleanConf() + .option(FlinkDynamicSinkOptions.CASE_SENSITIVE.key()) + .flinkConfig(FlinkDynamicSinkOptions.CASE_SENSITIVE) + .defaultValue(FlinkDynamicSinkOptions.CASE_SENSITIVE.defaultValue()) + .parse(); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/FlinkDynamicSinkOptions.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/FlinkDynamicSinkOptions.java new file mode 100644 index 000000000000..7a4f038219d9 --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/FlinkDynamicSinkOptions.java @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.flink.sink.dynamic; + +import org.apache.flink.annotation.Experimental; +import org.apache.flink.configuration.ConfigOption; +import org.apache.flink.configuration.ConfigOptions; + +@Experimental +public class FlinkDynamicSinkOptions { + + private FlinkDynamicSinkOptions() {} + + public static final ConfigOption CACHE_MAX_SIZE = + ConfigOptions.key("dynamic-sink.cache-max-size") + .intType() + .defaultValue(100) + .withDescription( + "Maximum size of the caches used in Dynamic Sink for table data and serializers."); + + public static final ConfigOption IMMEDIATE_TABLE_UPDATE = + ConfigOptions.key("dynamic-sink.immediate-table-update") + .booleanType() + .defaultValue(false) + .withDescription( + "Controls whether table schema and partition updates should be applied immediately in Dynamic Sink."); + + public static final ConfigOption DROP_UNUSED_COLUMNS = + ConfigOptions.key("dynamic-sink.drop-unused-columns") + .booleanType() + .defaultValue(false) + .withDescription( + "Allows dropping unused columns during schema evolution in Dynamic Sink."); + + public static final ConfigOption CACHE_REFRESH_MS = + ConfigOptions.key("dynamic-sink.cache-refresh-ms") + .longType() + .defaultValue(1_000L) + .withDescription( + "Cache refresh interval for dynamic table metadata in Dynamic Sink in milliseconds."); + + public static final ConfigOption INPUT_SCHEMAS_PER_TABLE_CACHE_MAX_SIZE = + ConfigOptions.key("dynamic-sink.input-schemas-per-table-cache-max-size") + .intType() + .defaultValue(10) + .withDescription( + "Maximum input schema objects to cache per each table in Dynamic Sink for performance."); + + public static final ConfigOption CASE_SENSITIVE = + ConfigOptions.key("dynamic-sink.case-sensitive") + .booleanType() + .defaultValue(true) + .withDescription( + "Controls whether schema field name matching should be case-sensitive in Dynamic Sink."); +} diff --git 
a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/HashKeyGenerator.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/HashKeyGenerator.java index fca45bf882e0..61a850212bf4 100644 --- a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/HashKeyGenerator.java +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/dynamic/HashKeyGenerator.java @@ -88,7 +88,7 @@ int generateKey( dynamicRecord.schema(), dynamicRecord.spec(), dynamicRecord.equalityFields(), - MoreObjects.firstNonNull(dynamicRecord.distributionMode(), DistributionMode.NONE), + dynamicRecord.distributionMode(), Math.min(dynamicRecord.writeParallelism(), maxWriteParallelism)); KeySelector keySelector = keySelectorCache.computeIfAbsent( @@ -98,8 +98,7 @@ int generateKey( tableIdent, MoreObjects.firstNonNull(tableSchema, dynamicRecord.schema()), MoreObjects.firstNonNull(tableSpec, dynamicRecord.spec()), - MoreObjects.firstNonNull( - dynamicRecord.distributionMode(), DistributionMode.NONE), + dynamicRecord.distributionMode(), MoreObjects.firstNonNull( dynamicRecord.equalityFields(), Collections.emptySet()), Math.min(dynamicRecord.writeParallelism(), maxWriteParallelism))); diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSink.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSink.java index 4e7511501014..89befb9e8ea2 100644 --- a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSink.java +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicIcebergSink.java @@ -78,6 +78,7 @@ import org.apache.iceberg.flink.CatalogLoader; import org.apache.iceberg.flink.FlinkSchemaUtil; import org.apache.iceberg.flink.FlinkWriteConf; +import org.apache.iceberg.flink.FlinkWriteOptions; import org.apache.iceberg.flink.MiniFlinkClusterExtension; import org.apache.iceberg.flink.SimpleDataUtil; 
import org.apache.iceberg.flink.TestHelpers; @@ -86,6 +87,7 @@ import org.apache.iceberg.flink.sink.dynamic.TestDynamicCommitter.FailBeforeAndAfterCommit; import org.apache.iceberg.inmemory.InMemoryInputFile; import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.relocated.com.google.common.base.MoreObjects; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Iterables; @@ -122,6 +124,7 @@ private static class DynamicIcebergDataImpl implements Serializable { PartitionSpec partitionSpec; boolean upsertMode; Set equalityFields; + int writeParallelism; private DynamicIcebergDataImpl( Schema schemaProvided, String tableName, String branch, PartitionSpec partitionSpec) { @@ -133,7 +136,8 @@ private DynamicIcebergDataImpl( partitionSpec, false, Collections.emptySet(), - false); + false, + 10); } private DynamicIcebergDataImpl( @@ -150,7 +154,8 @@ private DynamicIcebergDataImpl( partitionSpec, false, Collections.emptySet(), - false); + false, + 10); } private DynamicIcebergDataImpl( @@ -169,7 +174,8 @@ private DynamicIcebergDataImpl( partitionSpec, upsertMode, equalityFields, - isDuplicate); + isDuplicate, + 10); } private DynamicIcebergDataImpl( @@ -180,7 +186,8 @@ private DynamicIcebergDataImpl( PartitionSpec partitionSpec, boolean upsertMode, Set equalityFields, - boolean isDuplicate) { + boolean isDuplicate, + int writeParallelism) { this.rowProvided = randomRow(schemaProvided, isDuplicate ? seed : ++seed); this.rowExpected = isDuplicate ? 
null : rowProvided; this.schemaProvided = schemaProvided; @@ -190,6 +197,7 @@ private DynamicIcebergDataImpl( this.partitionSpec = partitionSpec; this.upsertMode = upsertMode; this.equalityFields = equalityFields; + this.writeParallelism = writeParallelism; } } @@ -209,7 +217,7 @@ public void generate(DynamicIcebergDataImpl row, Collector out) { converter(schema).toInternal(row.rowProvided), spec, spec.isPartitioned() ? DistributionMode.HASH : DistributionMode.NONE, - 10); + row.writeParallelism); dynamicRecord.setUpsertMode(row.upsertMode); dynamicRecord.setEqualityFields(row.equalityFields); out.collect(dynamicRecord); @@ -381,6 +389,19 @@ private void runForwardWriteTest(DynamicRecordGenerator verifyResults(rows); } + @Test + void testWriteWithNullBranch() throws Exception { + List rows = + Lists.newArrayList( + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, "t1", null, PartitionSpec.unpartitioned()), + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, "t1", null, PartitionSpec.unpartitioned())); + + runTest( + rows, this.env, false, 1, ImmutableMap.of(FlinkWriteOptions.BRANCH.key(), "test-branch")); + } + @Test void testWritePartitioned() throws Exception { PartitionSpec spec = PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).bucket("id", 10).build(); @@ -1369,6 +1390,35 @@ void testGeneratorDefaultParallelism() { assertThat(generatorParallelism).isEqualTo(source.getParallelism()); } + @Test + void testFallBackParallelismFromConfig() throws Exception { + List rows = + Lists.newArrayList( + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, + SimpleDataUtil.SCHEMA, + "t1", + SnapshotRef.MAIN_BRANCH, + PartitionSpec.unpartitioned(), + false, + Collections.emptySet(), + false, + -1), + new DynamicIcebergDataImpl( + SimpleDataUtil.SCHEMA, + SimpleDataUtil.SCHEMA, + "t1", + SnapshotRef.MAIN_BRANCH, + PartitionSpec.unpartitioned(), + false, + Collections.emptySet(), + false, + 0)); + + runTest( + rows, this.env, true, 2, 
ImmutableMap.of(FlinkWriteOptions.WRITE_PARALLELISM.key(), "1")); + } + private Set createSinkAndReturnUIds(String uidPrefix) { StreamExecutionEnvironment streamEnv = StreamExecutionEnvironment.getExecutionEnvironment(); @@ -1477,6 +1527,18 @@ private void runTest( verifyResults(dynamicData); } + private void runTest( + List dynamicData, + StreamExecutionEnvironment env, + boolean immediateUpdate, + int parallelism, + Map writeProperties) + throws Exception { + executeDynamicSink( + dynamicData, env, immediateUpdate, parallelism, null, false, writeProperties); + verifyResults(dynamicData, writeProperties); + } + private void executeDynamicSink( List dynamicData, StreamExecutionEnvironment env, @@ -1484,7 +1546,8 @@ private void executeDynamicSink( int parallelism, @Nullable CommitHook commitHook) throws Exception { - executeDynamicSink(dynamicData, env, immediateUpdate, parallelism, commitHook, false); + executeDynamicSink( + dynamicData, env, immediateUpdate, parallelism, commitHook, false, Maps.newHashMap()); } private void executeDynamicSink( @@ -1495,6 +1558,19 @@ private void executeDynamicSink( @Nullable CommitHook commitHook, boolean overwrite) throws Exception { + executeDynamicSink( + dynamicData, env, immediateUpdate, parallelism, commitHook, overwrite, Maps.newHashMap()); + } + + private void executeDynamicSink( + List dynamicData, + StreamExecutionEnvironment env, + boolean immediateUpdate, + int parallelism, + @Nullable CommitHook commitHook, + boolean overwrite, + Map writeProperties) + throws Exception { DataStream dataStream = env.fromData(dynamicData, TypeInformation.of(new TypeHint<>() {})); env.setParallelism(parallelism); @@ -1508,6 +1584,7 @@ private void executeDynamicSink( .immediateTableUpdate(immediateUpdate) .setSnapshotProperty("commit.retry.num-retries", "0") .overwrite(overwrite) + .setAll(writeProperties) .append(); } else { DynamicIcebergSink.forInput(dataStream) @@ -1516,6 +1593,7 @@ private void executeDynamicSink( 
.writeParallelism(parallelism) .immediateTableUpdate(immediateUpdate) .overwrite(overwrite) + .setAll(writeProperties) .append(); } @@ -1542,7 +1620,6 @@ DynamicIcebergSink instantiateSink( "uidPrefix", writeProperties, flinkConfig, - 100, forwardWriteResults); } } @@ -1559,7 +1636,6 @@ static class CommitHookDynamicIcebergSink extends DynamicIcebergSink { String uidPrefix, Map writeProperties, Configuration flinkConfig, - int cacheMaximumSize, DataStream> forwardWritten) { super( catalogLoader, @@ -1567,7 +1643,7 @@ static class CommitHookDynamicIcebergSink extends DynamicIcebergSink { uidPrefix, writeProperties, flinkConfig, - cacheMaximumSize, + 100, forwardWritten); this.commitHook = commitHook; this.overwriteMode = new FlinkWriteConf(writeProperties, flinkConfig).overwriteMode(); @@ -1587,6 +1663,12 @@ public Committer createCommitter(CommitterInitContext contex } private void verifyResults(List dynamicData) throws IOException { + verifyResults(dynamicData, Maps.newHashMap()); + } + + private void verifyResults( + List dynamicData, Map writeProperties) + throws IOException { // Calculate the expected result Map, List> expectedData = Maps.newHashMap(); Map expectedSchema = Maps.newHashMap(); @@ -1600,9 +1682,12 @@ private void verifyResults(List dynamicData) throws IOEx dynamicData.forEach( r -> { + String branch = + MoreObjects.firstNonNull( + r.branch, writeProperties.get(FlinkWriteOptions.BRANCH.key())); List data = expectedData.computeIfAbsent( - Tuple2.of(r.tableName, r.branch), unused -> Lists.newArrayList()); + Tuple2.of(r.tableName, branch), unused -> Lists.newArrayList()); data.addAll( convertToRowData(expectedSchema.get(r.tableName), ImmutableList.of(r.rowExpected))); }); diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicRecordWithConfig.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicRecordWithConfig.java new file mode 100644 index 000000000000..de55621475ed --- 
/dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicRecordWithConfig.java @@ -0,0 +1,120 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink.dynamic; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.Collections; +import java.util.Set; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.iceberg.DistributionMode; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SnapshotRef; +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.flink.FlinkWriteConf; +import org.apache.iceberg.flink.FlinkWriteOptions; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.Test; + +class TestDynamicRecordWithConfig { + + private static final Schema SCHEMA = + new Schema( + Types.NestedField.required(1, "id", 
Types.IntegerType.get()), + Types.NestedField.required(2, "data", Types.StringType.get())); + + private static final TableIdentifier TABLE_IDENTIFIER = TableIdentifier.of("db", "table"); + private static final PartitionSpec UNPARTITIONED = PartitionSpec.unpartitioned(); + private static final RowData ROW_DATA = GenericRowData.of(1, StringData.fromString("test")); + + @Test + void testBranchFallBack() { + String defaultBranch = "default-branch"; + FlinkWriteConf conf = + new FlinkWriteConf( + ImmutableMap.of(FlinkWriteOptions.BRANCH.key(), defaultBranch), new Configuration()); + DynamicRecordWithConfig dynamicRecordWithConfig = new DynamicRecordWithConfig(conf); + + DynamicRecord dynamicRecord = + new DynamicRecord(TABLE_IDENTIFIER, null, SCHEMA, ROW_DATA, UNPARTITIONED); + assertThat(dynamicRecordWithConfig.wrap(dynamicRecord).branch()).isEqualTo(defaultBranch); + + String customBranch = "custom-branch"; + dynamicRecord.setBranch(customBranch); + assertThat(dynamicRecordWithConfig.wrap(dynamicRecord).branch()).isEqualTo(customBranch); + } + + @Test + void testWriteParallelismFallBack() { + int defaultParallelism = 4; + FlinkWriteConf conf = + new FlinkWriteConf( + ImmutableMap.of( + FlinkWriteOptions.WRITE_PARALLELISM.key(), String.valueOf(defaultParallelism)), + new Configuration()); + DynamicRecordWithConfig dynamicRecordWithConfig = new DynamicRecordWithConfig(conf); + + DynamicRecord dynamicRecord = + new DynamicRecord(TABLE_IDENTIFIER, null, SCHEMA, ROW_DATA, UNPARTITIONED, null, -1); + assertThat(dynamicRecordWithConfig.wrap(dynamicRecord).writeParallelism()) + .isEqualTo(defaultParallelism); + + dynamicRecord.writeParallelism(0); + assertThat(dynamicRecordWithConfig.wrap(dynamicRecord).writeParallelism()) + .isEqualTo(defaultParallelism); + + dynamicRecord.writeParallelism(8); + assertThat(dynamicRecordWithConfig.wrap(dynamicRecord).writeParallelism()).isEqualTo(8); + } + + @Test + void testDelegatesToWrappedRecord() { + FlinkWriteConf conf = new 
FlinkWriteConf(Collections.emptyMap(), new Configuration()); + PartitionSpec partitioned = PartitionSpec.builderFor(SCHEMA).identity("id").build(); + Set equalityFields = ImmutableSet.of("id", "data"); + + DynamicRecord dynamicRecord = + new DynamicRecord( + TABLE_IDENTIFIER, + SnapshotRef.MAIN_BRANCH, + SCHEMA, + ROW_DATA, + partitioned, + DistributionMode.HASH, + 2); + dynamicRecord.setUpsertMode(true); + dynamicRecord.setEqualityFields(equalityFields); + + DynamicRecordWithConfig record = new DynamicRecordWithConfig(conf).wrap(dynamicRecord); + + assertThat(record.tableIdentifier()).isEqualTo(TABLE_IDENTIFIER); + assertThat(record.schema()).isEqualTo(SCHEMA); + assertThat(record.spec()).isEqualTo(partitioned); + assertThat(record.rowData()).isSameAs(ROW_DATA); + assertThat(record.distributionMode()).isEqualTo(DistributionMode.HASH); + assertThat(record.upsertMode()).isTrue(); + assertThat(record.equalityFields()).isEqualTo(equalityFields); + } +} diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicTableUpdateOperator.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicTableUpdateOperator.java index 1c8e6df8591d..f6b2b368c2be 100644 --- a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicTableUpdateOperator.java +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicTableUpdateOperator.java @@ -23,12 +23,14 @@ import static org.assertj.core.api.Assertions.assertThat; import java.util.Collections; +import org.apache.flink.configuration.Configuration; import org.apache.flink.table.data.GenericRowData; import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Schema; import org.apache.iceberg.catalog.Catalog; import org.apache.iceberg.catalog.TableIdentifier; import org.apache.iceberg.flink.HadoopCatalogExtension; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import 
org.apache.iceberg.types.Types; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.extension.RegisterExtension; @@ -57,9 +59,6 @@ class TestDynamicTableUpdateOperator { @Test void testDynamicTableUpdateOperatorNewTable() throws Exception { - int cacheMaximumSize = 10; - int cacheRefreshMs = 1000; - int inputSchemaCacheMaximumSize = 10; Catalog catalog = CATALOG_EXTENSION.catalog(); TableIdentifier table = TableIdentifier.of(TABLE); @@ -67,12 +66,8 @@ void testDynamicTableUpdateOperatorNewTable() throws Exception { DynamicTableUpdateOperator operator = new DynamicTableUpdateOperator( CATALOG_EXTENSION.catalogLoader(), - cacheMaximumSize, - cacheRefreshMs, - inputSchemaCacheMaximumSize, TableCreator.DEFAULT, - CASE_SENSITIVE, - PRESERVE_COLUMNS); + flinkDynamicSinkConfiguration(CASE_SENSITIVE, PRESERVE_COLUMNS)); operator.open(null); DynamicRecordInternal input = @@ -93,21 +88,14 @@ void testDynamicTableUpdateOperatorNewTable() throws Exception { @Test void testDynamicTableUpdateOperatorSchemaChange() throws Exception { - int cacheMaximumSize = 10; - int cacheRefreshMs = 1000; - int inputSchemaCacheMaximumSize = 10; Catalog catalog = CATALOG_EXTENSION.catalog(); TableIdentifier table = TableIdentifier.of(TABLE); DynamicTableUpdateOperator operator = new DynamicTableUpdateOperator( CATALOG_EXTENSION.catalogLoader(), - cacheMaximumSize, - cacheRefreshMs, - inputSchemaCacheMaximumSize, TableCreator.DEFAULT, - CASE_SENSITIVE, - PRESERVE_COLUMNS); + flinkDynamicSinkConfiguration(CASE_SENSITIVE, PRESERVE_COLUMNS)); operator.open(null); catalog.createTable(table, SCHEMA1); @@ -135,9 +123,6 @@ void testDynamicTableUpdateOperatorSchemaChange() throws Exception { @ParameterizedTest @ValueSource(booleans = {true, false}) void testCaseInSensitivity(boolean caseSensitive) throws Exception { - int cacheMaximumSize = 10; - int cacheRefreshMs = 1000; - int inputSchemaCacheMaximumSize = 10; Catalog catalog = CATALOG_EXTENSION.catalog(); TableIdentifier table = 
TableIdentifier.of(TABLE); @@ -148,12 +133,8 @@ void testCaseInSensitivity(boolean caseSensitive) throws Exception { DynamicTableUpdateOperator operator = new DynamicTableUpdateOperator( CATALOG_EXTENSION.catalogLoader(), - cacheMaximumSize, - cacheRefreshMs, - inputSchemaCacheMaximumSize, TableCreator.DEFAULT, - caseSensitive, - PRESERVE_COLUMNS); + flinkDynamicSinkConfiguration(caseSensitive, PRESERVE_COLUMNS)); operator.open(null); catalog.createTable(table, initialSchema); @@ -187,21 +168,14 @@ void testCaseInSensitivity(boolean caseSensitive) throws Exception { @Test void testDynamicTableUpdateOperatorPreserveUnusedColumns() throws Exception { - int cacheMaximumSize = 10; - int cacheRefreshMs = 1000; - int inputSchemaCacheMaximumSize = 10; Catalog catalog = CATALOG_EXTENSION.catalog(); TableIdentifier table = TableIdentifier.of(TABLE); DynamicTableUpdateOperator operator = new DynamicTableUpdateOperator( CATALOG_EXTENSION.catalogLoader(), - cacheMaximumSize, - cacheRefreshMs, - inputSchemaCacheMaximumSize, TableCreator.DEFAULT, - CASE_SENSITIVE, - PRESERVE_COLUMNS); + flinkDynamicSinkConfiguration(CASE_SENSITIVE, PRESERVE_COLUMNS)); operator.open(null); catalog.createTable(table, SCHEMA2); @@ -228,21 +202,14 @@ void testDynamicTableUpdateOperatorPreserveUnusedColumns() throws Exception { @Test void testDynamicTableUpdateOperatorDropUnusedColumns() throws Exception { - int cacheMaximumSize = 10; - int cacheRefreshMs = 1000; - int inputSchemaCacheMaximumSize = 10; Catalog catalog = CATALOG_EXTENSION.catalog(); TableIdentifier table = TableIdentifier.of(TABLE); DynamicTableUpdateOperator operator = new DynamicTableUpdateOperator( CATALOG_EXTENSION.catalogLoader(), - cacheMaximumSize, - cacheRefreshMs, - inputSchemaCacheMaximumSize, TableCreator.DEFAULT, - CASE_INSENSITIVE, - DROP_COLUMNS); + flinkDynamicSinkConfiguration(CASE_INSENSITIVE, DROP_COLUMNS)); operator.open(null); catalog.createTable(table, SCHEMA2); @@ -265,4 +232,13 @@ void 
testDynamicTableUpdateOperatorDropUnusedColumns() throws Exception { assertThat(tableSchema.findField("data")).isNull(); assertThat(input).isEqualTo(output); } + + private static FlinkDynamicSinkConf flinkDynamicSinkConfiguration( + boolean caseSensitive, boolean dropUnusedColumns) { + return new FlinkDynamicSinkConf( + ImmutableMap.of( + FlinkDynamicSinkOptions.CASE_SENSITIVE.key(), String.valueOf(caseSensitive), + FlinkDynamicSinkOptions.DROP_UNUSED_COLUMNS.key(), String.valueOf(dropUnusedColumns)), + new Configuration()); + } } diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestHashKeyGenerator.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestHashKeyGenerator.java index c65f96b12cbb..9a485fafaf47 100644 --- a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestHashKeyGenerator.java +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestHashKeyGenerator.java @@ -25,6 +25,7 @@ import java.util.Map; import java.util.Set; import org.apache.flink.api.java.functions.KeySelector; +import org.apache.flink.configuration.Configuration; import org.apache.flink.runtime.state.KeyGroupRangeAssignment; import org.apache.flink.table.data.GenericRowData; import org.apache.flink.table.data.RowData; @@ -34,6 +35,9 @@ import org.apache.iceberg.Schema; import org.apache.iceberg.SnapshotRef; import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.flink.FlinkWriteConf; +import org.apache.iceberg.flink.FlinkWriteOptions; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Sets; import org.apache.iceberg.types.Types; import org.junit.jupiter.api.Test; @@ -229,6 +233,38 @@ void testFailOnNonPositiveWriteParallelism() { }); } + @Test + void testNonPositiveWriteParallelismConfigFallback() throws Exception { + int maxWriteParallelism = 5; + HashKeyGenerator generator = new 
HashKeyGenerator(16, maxWriteParallelism); + PartitionSpec unpartitioned = PartitionSpec.unpartitioned(); + FlinkWriteConf flinkWriteConf = + new FlinkWriteConf( + ImmutableMap.of(FlinkWriteOptions.WRITE_PARALLELISM.key(), "2"), new Configuration()); + + Set writeKeys = Sets.newHashSet(); + for (int i = 0; i < 20; i++) { + GenericRowData row = GenericRowData.of(i, StringData.fromString("z")); + writeKeys.add( + getWriteKey( + generator, + unpartitioned, + DistributionMode.NONE, + i % 2 == 0 ? 0 : -1, + Collections.emptySet(), + row, + flinkWriteConf)); + } + + assertThat(writeKeys).hasSize(2); + assertThat( + writeKeys.stream() + .map(key -> getSubTaskId(key, 2, maxWriteParallelism)) + .distinct() + .count()) + .isEqualTo(2); + } + @Test void testCapAtMaxWriteParallelism() throws Exception { int writeParallelism = 10; @@ -477,10 +513,31 @@ private static int getWriteKey( Set equalityFields, GenericRowData row) throws Exception { - DynamicRecord record = + return getWriteKey( + generator, + spec, + mode, + writeParallelism, + equalityFields, + row, + new FlinkWriteConf(Collections.emptyMap(), new Configuration())); + } + + private static int getWriteKey( + HashKeyGenerator generator, + PartitionSpec spec, + DistributionMode mode, + int writeParallelism, + Set equalityFields, + GenericRowData row, + FlinkWriteConf flinkWriteConf) + throws Exception { + DynamicRecord inputRecord = new DynamicRecord(TABLE_IDENTIFIER, BRANCH, SCHEMA, row, spec, mode, writeParallelism); - record.setEqualityFields(equalityFields); - return generator.generateKey(record); + inputRecord.setEqualityFields(equalityFields); + + DynamicRecordWithConfig dynamicRecordWithConfig = new DynamicRecordWithConfig(flinkWriteConf); + return generator.generateKey(dynamicRecordWithConfig.wrap(inputRecord)); } private static int getSubTaskId(int writeKey1, int writeParallelism, int maxWriteParallelism) { From 0011a85e4d28b7bc8ff670a3a0d32a8331d87d0f Mon Sep 17 00:00:00 2001 From: drexler-sky Date: Tue, 5 May 
2026 18:06:35 -0700 Subject: [PATCH 158/197] Spark: Migrate RollBackStageTable to use SupportsDeleteV2 (#16211) --- .../org/apache/iceberg/spark/RollbackStagedTable.java | 10 +++++----- .../org/apache/iceberg/spark/RollbackStagedTable.java | 10 +++++----- .../org/apache/iceberg/spark/RollbackStagedTable.java | 10 +++++----- .../org/apache/iceberg/spark/RollbackStagedTable.java | 10 +++++----- 4 files changed, 20 insertions(+), 20 deletions(-) diff --git a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/RollbackStagedTable.java b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/RollbackStagedTable.java index bc8a966488ee..f1709277525a 100644 --- a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/RollbackStagedTable.java +++ b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/RollbackStagedTable.java @@ -24,17 +24,17 @@ import java.util.function.Function; import org.apache.spark.sql.connector.catalog.Identifier; import org.apache.spark.sql.connector.catalog.StagedTable; -import org.apache.spark.sql.connector.catalog.SupportsDelete; +import org.apache.spark.sql.connector.catalog.SupportsDeleteV2; import org.apache.spark.sql.connector.catalog.SupportsRead; import org.apache.spark.sql.connector.catalog.SupportsWrite; import org.apache.spark.sql.connector.catalog.Table; import org.apache.spark.sql.connector.catalog.TableCapability; import org.apache.spark.sql.connector.catalog.TableCatalog; import org.apache.spark.sql.connector.expressions.Transform; +import org.apache.spark.sql.connector.expressions.filter.Predicate; import org.apache.spark.sql.connector.read.ScanBuilder; import org.apache.spark.sql.connector.write.LogicalWriteInfo; import org.apache.spark.sql.connector.write.WriteBuilder; -import org.apache.spark.sql.sources.Filter; import org.apache.spark.sql.types.StructType; import org.apache.spark.sql.util.CaseInsensitiveStringMap; @@ -58,7 +58,7 @@ * #capabilities()}. 
*/ public class RollbackStagedTable - implements StagedTable, SupportsRead, SupportsWrite, SupportsDelete { + implements StagedTable, SupportsRead, SupportsWrite, SupportsDeleteV2 { private final TableCatalog catalog; private final Identifier ident; private final Table table; @@ -106,8 +106,8 @@ public Set capabilities() { } @Override - public void deleteWhere(Filter[] filters) { - call(SupportsDelete.class, t -> t.deleteWhere(filters)); + public void deleteWhere(Predicate[] predicates) { + call(SupportsDeleteV2.class, t -> t.deleteWhere(predicates)); } @Override diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/RollbackStagedTable.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/RollbackStagedTable.java index bc8a966488ee..f1709277525a 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/RollbackStagedTable.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/RollbackStagedTable.java @@ -24,17 +24,17 @@ import java.util.function.Function; import org.apache.spark.sql.connector.catalog.Identifier; import org.apache.spark.sql.connector.catalog.StagedTable; -import org.apache.spark.sql.connector.catalog.SupportsDelete; +import org.apache.spark.sql.connector.catalog.SupportsDeleteV2; import org.apache.spark.sql.connector.catalog.SupportsRead; import org.apache.spark.sql.connector.catalog.SupportsWrite; import org.apache.spark.sql.connector.catalog.Table; import org.apache.spark.sql.connector.catalog.TableCapability; import org.apache.spark.sql.connector.catalog.TableCatalog; import org.apache.spark.sql.connector.expressions.Transform; +import org.apache.spark.sql.connector.expressions.filter.Predicate; import org.apache.spark.sql.connector.read.ScanBuilder; import org.apache.spark.sql.connector.write.LogicalWriteInfo; import org.apache.spark.sql.connector.write.WriteBuilder; -import org.apache.spark.sql.sources.Filter; import org.apache.spark.sql.types.StructType; import 
org.apache.spark.sql.util.CaseInsensitiveStringMap; @@ -58,7 +58,7 @@ * #capabilities()}. */ public class RollbackStagedTable - implements StagedTable, SupportsRead, SupportsWrite, SupportsDelete { + implements StagedTable, SupportsRead, SupportsWrite, SupportsDeleteV2 { private final TableCatalog catalog; private final Identifier ident; private final Table table; @@ -106,8 +106,8 @@ public Set capabilities() { } @Override - public void deleteWhere(Filter[] filters) { - call(SupportsDelete.class, t -> t.deleteWhere(filters)); + public void deleteWhere(Predicate[] predicates) { + call(SupportsDeleteV2.class, t -> t.deleteWhere(predicates)); } @Override diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/RollbackStagedTable.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/RollbackStagedTable.java index bc8a966488ee..f1709277525a 100644 --- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/RollbackStagedTable.java +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/RollbackStagedTable.java @@ -24,17 +24,17 @@ import java.util.function.Function; import org.apache.spark.sql.connector.catalog.Identifier; import org.apache.spark.sql.connector.catalog.StagedTable; -import org.apache.spark.sql.connector.catalog.SupportsDelete; +import org.apache.spark.sql.connector.catalog.SupportsDeleteV2; import org.apache.spark.sql.connector.catalog.SupportsRead; import org.apache.spark.sql.connector.catalog.SupportsWrite; import org.apache.spark.sql.connector.catalog.Table; import org.apache.spark.sql.connector.catalog.TableCapability; import org.apache.spark.sql.connector.catalog.TableCatalog; import org.apache.spark.sql.connector.expressions.Transform; +import org.apache.spark.sql.connector.expressions.filter.Predicate; import org.apache.spark.sql.connector.read.ScanBuilder; import org.apache.spark.sql.connector.write.LogicalWriteInfo; import org.apache.spark.sql.connector.write.WriteBuilder; -import org.apache.spark.sql.sources.Filter; 
import org.apache.spark.sql.types.StructType; import org.apache.spark.sql.util.CaseInsensitiveStringMap; @@ -58,7 +58,7 @@ * #capabilities()}. */ public class RollbackStagedTable - implements StagedTable, SupportsRead, SupportsWrite, SupportsDelete { + implements StagedTable, SupportsRead, SupportsWrite, SupportsDeleteV2 { private final TableCatalog catalog; private final Identifier ident; private final Table table; @@ -106,8 +106,8 @@ public Set capabilities() { } @Override - public void deleteWhere(Filter[] filters) { - call(SupportsDelete.class, t -> t.deleteWhere(filters)); + public void deleteWhere(Predicate[] predicates) { + call(SupportsDeleteV2.class, t -> t.deleteWhere(predicates)); } @Override diff --git a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/RollbackStagedTable.java b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/RollbackStagedTable.java index bc8a966488ee..f1709277525a 100644 --- a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/RollbackStagedTable.java +++ b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/RollbackStagedTable.java @@ -24,17 +24,17 @@ import java.util.function.Function; import org.apache.spark.sql.connector.catalog.Identifier; import org.apache.spark.sql.connector.catalog.StagedTable; -import org.apache.spark.sql.connector.catalog.SupportsDelete; +import org.apache.spark.sql.connector.catalog.SupportsDeleteV2; import org.apache.spark.sql.connector.catalog.SupportsRead; import org.apache.spark.sql.connector.catalog.SupportsWrite; import org.apache.spark.sql.connector.catalog.Table; import org.apache.spark.sql.connector.catalog.TableCapability; import org.apache.spark.sql.connector.catalog.TableCatalog; import org.apache.spark.sql.connector.expressions.Transform; +import org.apache.spark.sql.connector.expressions.filter.Predicate; import org.apache.spark.sql.connector.read.ScanBuilder; import org.apache.spark.sql.connector.write.LogicalWriteInfo; import 
org.apache.spark.sql.connector.write.WriteBuilder; -import org.apache.spark.sql.sources.Filter; import org.apache.spark.sql.types.StructType; import org.apache.spark.sql.util.CaseInsensitiveStringMap; @@ -58,7 +58,7 @@ * #capabilities()}. */ public class RollbackStagedTable - implements StagedTable, SupportsRead, SupportsWrite, SupportsDelete { + implements StagedTable, SupportsRead, SupportsWrite, SupportsDeleteV2 { private final TableCatalog catalog; private final Identifier ident; private final Table table; @@ -106,8 +106,8 @@ public Set capabilities() { } @Override - public void deleteWhere(Filter[] filters) { - call(SupportsDelete.class, t -> t.deleteWhere(filters)); + public void deleteWhere(Predicate[] predicates) { + call(SupportsDeleteV2.class, t -> t.deleteWhere(predicates)); } @Override From da5ffce9a957130b562676680fefa2b8e2ac9d0a Mon Sep 17 00:00:00 2001 From: Neelesh Salian Date: Tue, 5 May 2026 19:55:43 -0700 Subject: [PATCH 159/197] Fix for vectorized builder variant handling (#16087) * Fix for vectorized builder variant handling * Simplify test query and add reg test * PR comment: add describedAs for keys * Add merge into test for spark 4.0 * PR comment: Add test for variant not in projection --- .../vectorized/VectorizedReaderBuilder.java | 10 ++ .../TestVectorizedReaderBuilder.java | 92 +++++++++++++++++++ .../spark/sql/TestSparkVariantRead.java | 49 ++++++++++ .../spark/sql/TestSparkVariantRead.java | 49 ++++++++++ 4 files changed, 200 insertions(+) create mode 100644 arrow/src/test/java/org/apache/iceberg/arrow/vectorized/TestVectorizedReaderBuilder.java diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedReaderBuilder.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedReaderBuilder.java index 15b55fb48d4a..3fbd797c26fb 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedReaderBuilder.java +++ 
b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorizedReaderBuilder.java @@ -154,6 +154,16 @@ public VectorizedReader struct( return null; } + @Override + public VectorizedReader variant( + Types.VariantType iVariant, GroupType variant, VectorizedReader result) { + if (iVariant != null) { + throw new UnsupportedOperationException( + "Vectorized reads are not supported yet for variant fields"); + } + return null; + } + @Override public VectorizedReader primitive( org.apache.iceberg.types.Type.PrimitiveType expected, PrimitiveType primitive) { diff --git a/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/TestVectorizedReaderBuilder.java b/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/TestVectorizedReaderBuilder.java new file mode 100644 index 000000000000..e3d76515bcc7 --- /dev/null +++ b/arrow/src/test/java/org/apache/iceberg/arrow/vectorized/TestVectorizedReaderBuilder.java @@ -0,0 +1,92 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.arrow.vectorized; + +import static org.assertj.core.api.Assertions.assertThatNoException; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import org.apache.iceberg.Schema; +import org.apache.iceberg.parquet.TypeWithSchemaVisitor; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.types.Types.IntegerType; +import org.apache.iceberg.types.Types.NestedField; +import org.apache.iceberg.types.Types.VariantType; +import org.apache.iceberg.variants.Variant; +import org.apache.parquet.schema.LogicalTypeAnnotation; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName; +import org.apache.parquet.schema.Type; +import org.apache.parquet.schema.Types; +import org.junit.jupiter.api.Test; + +public class TestVectorizedReaderBuilder { + + @Test + public void testVariantNotSupportedInVectorizedReads() { + Schema icebergSchema = + new Schema( + NestedField.required(1, "id", IntegerType.get()), + NestedField.optional(2, "data", VariantType.get())); + + MessageType parquetSchema = parquetSchemaWithVariant(); + + VectorizedReaderBuilder builder = + new VectorizedReaderBuilder( + icebergSchema, parquetSchema, false, ImmutableMap.of(), readers -> null); + + assertThatThrownBy( + () -> TypeWithSchemaVisitor.visit(icebergSchema.asStruct(), parquetSchema, builder)) + .isInstanceOf(UnsupportedOperationException.class) + .hasMessageContaining("Vectorized reads are not supported yet for variant fields"); + } + + @Test + public void testVariantSkippedWhenNotInProjection() { + Schema icebergSchema = new Schema(NestedField.required(1, "id", IntegerType.get())); + + MessageType parquetSchema = parquetSchemaWithVariant(); + + VectorizedReaderBuilder builder = + new VectorizedReaderBuilder( + icebergSchema, parquetSchema, false, ImmutableMap.of(), readers -> null); + + assertThatNoException() + .describedAs("Variant not in projection 
should not throw") + .isThrownBy( + () -> TypeWithSchemaVisitor.visit(icebergSchema.asStruct(), parquetSchema, builder)); + } + + private static MessageType parquetSchemaWithVariant() { + return Types.buildMessage() + .addField( + Types.primitive(PrimitiveTypeName.INT32, Type.Repetition.REQUIRED).id(1).named("id")) + .addField( + Types.buildGroup(Type.Repetition.OPTIONAL) + .as(LogicalTypeAnnotation.variantType(Variant.VARIANT_SPEC_VERSION)) + .addField( + Types.primitive(PrimitiveTypeName.BINARY, Type.Repetition.REQUIRED) + .named("metadata")) + .addField( + Types.primitive(PrimitiveTypeName.BINARY, Type.Repetition.REQUIRED) + .named("value")) + .id(2) + .named("data")) + .named("table"); + } +} diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestSparkVariantRead.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestSparkVariantRead.java index 599bf591e9a4..2d6e919a91ee 100644 --- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestSparkVariantRead.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/sql/TestSparkVariantRead.java @@ -302,6 +302,55 @@ public void testNestedMapVariant(boolean vectorized) { sql("DROP TABLE IF EXISTS %s", mapTable); } + @ParameterizedTest + @ValueSource(booleans = {false, true}) + public void testMergeIntoWithVariant(boolean vectorized) { + // Variant columns are not vectorized yet, but MERGE INTO should not crash regardless of the + // vectorization setting. The reader falls back to non-vectorized for variant columns. 
+ String mergeTable = CATALOG + ".default.var_merge"; + sql("DROP TABLE IF EXISTS %s", mergeTable); + sql( + "CREATE TABLE %s (id BIGINT, data VARIANT) USING iceberg " + + "TBLPROPERTIES ('format-version'='3')", + mergeTable); + setVectorization(mergeTable, vectorized); + + sql( + "INSERT INTO %s VALUES " + + "(1, parse_json('{\"name\":\"alice\",\"age\":30}')), " + + "(2, parse_json('{\"name\":\"bob\",\"age\":25}'))", + mergeTable); + + sql( + "MERGE INTO %s AS target " + + "USING (SELECT 1 AS id, parse_json('{\"name\":\"alice\",\"age\":31}') AS data) AS source " + + "ON target.id = source.id " + + "WHEN MATCHED THEN UPDATE SET target.data = source.data " + + "WHEN NOT MATCHED THEN INSERT *", + mergeTable); + + List rows = spark.table(mergeTable).select("id", "data").orderBy("id").collectAsList(); + + assertThat(rows).hasSize(2); + assertThat(rows.get(0).getLong(0)).isEqualTo(1L); + Variant v1 = + new Variant( + ((VariantVal) rows.get(0).get(1)).getValue(), + ((VariantVal) rows.get(0).get(1)).getMetadata()); + assertThat(v1.getFieldByKey("name").getString()).describedAs("v1.name").isEqualTo("alice"); + assertThat(v1.getFieldByKey("age").getLong()).describedAs("v1.age").isEqualTo(31L); + + assertThat(rows.get(1).getLong(0)).isEqualTo(2L); + Variant v2 = + new Variant( + ((VariantVal) rows.get(1).get(1)).getValue(), + ((VariantVal) rows.get(1).get(1)).getMetadata()); + assertThat(v2.getFieldByKey("name").getString()).describedAs("v2.name").isEqualTo("bob"); + assertThat(v2.getFieldByKey("age").getLong()).describedAs("v2.age").isEqualTo(25L); + + sql("DROP TABLE IF EXISTS %s", mergeTable); + } + private void setVectorization(boolean on) { sql( "ALTER TABLE %s SET TBLPROPERTIES ('read.parquet.vectorization.enabled'='%s')", diff --git a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestSparkVariantRead.java b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestSparkVariantRead.java index 599bf591e9a4..2d6e919a91ee 100644 --- 
a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestSparkVariantRead.java +++ b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/sql/TestSparkVariantRead.java @@ -302,6 +302,55 @@ public void testNestedMapVariant(boolean vectorized) { sql("DROP TABLE IF EXISTS %s", mapTable); } + @ParameterizedTest + @ValueSource(booleans = {false, true}) + public void testMergeIntoWithVariant(boolean vectorized) { + // Variant columns are not vectorized yet, but MERGE INTO should not crash regardless of the + // vectorization setting. The reader falls back to non-vectorized for variant columns. + String mergeTable = CATALOG + ".default.var_merge"; + sql("DROP TABLE IF EXISTS %s", mergeTable); + sql( + "CREATE TABLE %s (id BIGINT, data VARIANT) USING iceberg " + + "TBLPROPERTIES ('format-version'='3')", + mergeTable); + setVectorization(mergeTable, vectorized); + + sql( + "INSERT INTO %s VALUES " + + "(1, parse_json('{\"name\":\"alice\",\"age\":30}')), " + + "(2, parse_json('{\"name\":\"bob\",\"age\":25}'))", + mergeTable); + + sql( + "MERGE INTO %s AS target " + + "USING (SELECT 1 AS id, parse_json('{\"name\":\"alice\",\"age\":31}') AS data) AS source " + + "ON target.id = source.id " + + "WHEN MATCHED THEN UPDATE SET target.data = source.data " + + "WHEN NOT MATCHED THEN INSERT *", + mergeTable); + + List rows = spark.table(mergeTable).select("id", "data").orderBy("id").collectAsList(); + + assertThat(rows).hasSize(2); + assertThat(rows.get(0).getLong(0)).isEqualTo(1L); + Variant v1 = + new Variant( + ((VariantVal) rows.get(0).get(1)).getValue(), + ((VariantVal) rows.get(0).get(1)).getMetadata()); + assertThat(v1.getFieldByKey("name").getString()).describedAs("v1.name").isEqualTo("alice"); + assertThat(v1.getFieldByKey("age").getLong()).describedAs("v1.age").isEqualTo(31L); + + assertThat(rows.get(1).getLong(0)).isEqualTo(2L); + Variant v2 = + new Variant( + ((VariantVal) rows.get(1).get(1)).getValue(), + ((VariantVal) rows.get(1).get(1)).getMetadata()); + 
assertThat(v2.getFieldByKey("name").getString()).describedAs("v2.name").isEqualTo("bob"); + assertThat(v2.getFieldByKey("age").getLong()).describedAs("v2.age").isEqualTo(25L); + + sql("DROP TABLE IF EXISTS %s", mergeTable); + } + private void setVectorization(boolean on) { sql( "ALTER TABLE %s SET TBLPROPERTIES ('read.parquet.vectorization.enabled'='%s')", From dcdeb27edcfaedfccd1e96d3a7c1f1b4e10e898b Mon Sep 17 00:00:00 2001 From: Talat UYARER Date: Tue, 5 May 2026 22:57:19 -0700 Subject: [PATCH 160/197] Flink: Define Joda Time in libs.versions.toml file (#16191) --- flink/v2.1/build.gradle | 2 +- gradle/libs.versions.toml | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/flink/v2.1/build.gradle b/flink/v2.1/build.gradle index 9eb09cf021f9..9437189305a2 100644 --- a/flink/v2.1/build.gradle +++ b/flink/v2.1/build.gradle @@ -33,7 +33,7 @@ project(":iceberg-flink:iceberg-flink-${flinkMajorVersion}") { implementation project(':iceberg-hive-metastore') compileOnly libs.flink21.avro - compileOnly 'joda-time:joda-time:2.8.1' + compileOnly libs.joda.time // dropwizard histogram metrics (optional in Flink) compileOnly libs.flink21.metrics.dropwizard compileOnly libs.flink21.streaming.java diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index c666281e6eae..c43f805fd1d7 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -67,6 +67,7 @@ jakarta-servlet-api = "6.1.0" jaxb-api = "2.3.1" jaxb-runtime = "2.3.9" jetty = "12.1.8" +joda = "2.5" junit = "5.14.3" junit-platform = "1.14.3" junit-pioneer = "2.3.0" @@ -209,6 +210,7 @@ jakarta-servlet = {module = "jakarta.servlet:jakarta.servlet-api", version.ref = jetty-compression-server = { module = "org.eclipse.jetty.compression:jetty-compression-server", version.ref = "jetty" } jetty-compression-gzip = { module = "org.eclipse.jetty.compression:jetty-compression-gzip", version.ref = "jetty" } jetty-servlet = { module = "org.eclipse.jetty.ee10:jetty-ee10-servlet", version.ref = 
"jetty" } +joda-time = { module = "joda-time:joda-time", version.ref = "joda" } junit-jupiter = { module = "org.junit.jupiter:junit-jupiter", version.ref = "junit" } junit-jupiter-engine = { module = "org.junit.jupiter:junit-jupiter-engine", version.ref = "junit" } junit-pioneer = { module = "org.junit-pioneer:junit-pioneer", version.ref = "junit-pioneer" } From 680d850e9e2069919b6ed1207772b53dddcad963 Mon Sep 17 00:00:00 2001 From: Maximilian Michels Date: Wed, 6 May 2026 11:17:27 +0200 Subject: [PATCH 161/197] Flink: Do not ship optional flink-metrics-dropwizard dependency (#16155) --- docs/docs/flink-writes.md | 6 +- flink/v2.1/build.gradle | 3 - flink/v2.1/flink-runtime/LICENSE | 16 --- flink/v2.1/flink-runtime/runtime-deps.txt | 2 - .../sink/IcebergStreamWriterMetrics.java | 109 ++++++++++++++---- .../sink/TestIcebergStreamWriterMetrics.java | 42 +++++++ 6 files changed, 130 insertions(+), 48 deletions(-) create mode 100644 flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergStreamWriterMetrics.java diff --git a/docs/docs/flink-writes.md b/docs/docs/flink-writes.md index c904635d0dbb..03795b5beed0 100644 --- a/docs/docs/flink-writes.md +++ b/docs/docs/flink-writes.md @@ -207,9 +207,9 @@ They should have the following key-value tags. | dataFilesSizeHistogram | Histogram | Histogram distribution of data file sizes (in bytes). | | deleteFilesSizeHistogram | Histogram | Histogram distribution of delete file sizes (in bytes). | -The `Histogram` metrics above require `flink-metrics-dropwizard` on the classpath, which is not shipped -by Flink by default. When using `iceberg-flink-runtime`, this dependency is already bundled. When using -the `iceberg-flink` artifact directly, add `org.apache.flink:flink-metrics-dropwizard` as a dependency. +The `Histogram` metrics above require `org.apache.flink:flink-metrics-dropwizard` on the classpath, +which is not shipped by Flink by default. Please add this artifact to your classpath to see histogram metrics. 
+If not present, histogram metrics will be missing. All other metric types will continue to get published. Committer metrics are added under the sub group of `IcebergFilesCommitter`. They should have the following key-value tags. diff --git a/flink/v2.1/build.gradle b/flink/v2.1/build.gradle index 9437189305a2..451f14414772 100644 --- a/flink/v2.1/build.gradle +++ b/flink/v2.1/build.gradle @@ -170,9 +170,6 @@ project(":iceberg-flink:iceberg-flink-runtime-${flinkMajorVersion}") { exclude group: 'com.google.code.findbugs', module: 'jsr305' } - // To support dropwizard histogram metrics (not shipped by Flink by default) - implementation libs.flink21.metrics.dropwizard - // for integration testing with the flink-runtime-jar // all of those dependencies are required because the integration test extends FlinkTestBase integrationCompileOnly project(':iceberg-api') diff --git a/flink/v2.1/flink-runtime/LICENSE b/flink/v2.1/flink-runtime/LICENSE index 11460c3307c8..364652a5aca2 100644 --- a/flink/v2.1/flink-runtime/LICENSE +++ b/flink/v2.1/flink-runtime/LICENSE @@ -460,22 +460,6 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Dropwizard Metrics. - -Copyright: (c) 2010-2013 Coda Hale, Yammer.com, 2014-2021 Dropwizard Team -Project URL: https://github.com/dropwizard/metrics -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Apache Flink's optional support for Dropwizard Metrics. - -Copyright: 2014-2026 The Apache Software Foundation -Project URL: https://flink.apache.org/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - This product bundles RoaringBitmap. Copyright: (c) 2013-... 
the RoaringBitmap authors diff --git a/flink/v2.1/flink-runtime/runtime-deps.txt b/flink/v2.1/flink-runtime/runtime-deps.txt index 3dfc56f15ea9..00c53ed388d0 100644 --- a/flink/v2.1/flink-runtime/runtime-deps.txt +++ b/flink/v2.1/flink-runtime/runtime-deps.txt @@ -6,11 +6,9 @@ com.github.luben:zstd-jni:1.5.7-3 com.google.errorprone:error_prone_annotations:2.10.0 dev.failsafe:failsafe:3.3.2 io.airlift:aircompressor:2.0.3 -io.dropwizard.metrics:metrics-core:3.2.6 org.apache.avro:avro:1.12.1 org.apache.datasketches:datasketches-java:6.2.0 org.apache.datasketches:datasketches-memory:3.0.2 -org.apache.flink:flink-metrics-dropwizard:2.1.0 org.apache.httpcomponents.client5:httpclient5:5.6 org.apache.httpcomponents.core5:httpcore5-h2:5.4 org.apache.httpcomponents.core5:httpcore5:5.4 diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriterMetrics.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriterMetrics.java index 434f3969577f..6cf15ff713fb 100644 --- a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriterMetrics.java +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriterMetrics.java @@ -18,23 +18,33 @@ */ package org.apache.iceberg.flink.sink; -import com.codahale.metrics.SlidingWindowReservoir; import java.util.Arrays; import java.util.concurrent.atomic.AtomicLong; import org.apache.flink.annotation.Internal; -import org.apache.flink.dropwizard.metrics.DropwizardHistogramWrapper; +import org.apache.flink.annotation.VisibleForTesting; import org.apache.flink.metrics.Counter; import org.apache.flink.metrics.Histogram; import org.apache.flink.metrics.MetricGroup; +import org.apache.iceberg.common.DynClasses; +import org.apache.iceberg.common.DynConstructors; import org.apache.iceberg.io.WriteResult; import org.apache.iceberg.util.ScanTaskUtil; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; @Internal public class 
IcebergStreamWriterMetrics { + + private static final Logger LOG = LoggerFactory.getLogger(IcebergStreamWriterMetrics.class); + // 1,024 reservoir size should cost about 8KB, which is quite small. // It should also produce good accuracy for histogram distribution (like percentiles). private static final int HISTOGRAM_RESERVOIR_SIZE = 1024; + // Histogram metrics loaded through Flink's optional flink-metrics-dropwizard dependency. + // Will be null if not available. + private static final DropwizardCtors DROPWIZARD = loadDropwizardCtors(); + private final Counter flushedDataFiles; private final Counter flushedDeleteFiles; private final Counter flushedReferencedDataFiles; @@ -51,18 +61,8 @@ public IcebergStreamWriterMetrics(MetricGroup metrics, String fullTableName) { this.lastFlushDurationMs = new AtomicLong(); writerMetrics.gauge("lastFlushDurationMs", lastFlushDurationMs::get); - com.codahale.metrics.Histogram dropwizardDataFilesSizeHistogram = - new com.codahale.metrics.Histogram(new SlidingWindowReservoir(HISTOGRAM_RESERVOIR_SIZE)); - this.dataFilesSizeHistogram = - writerMetrics.histogram( - "dataFilesSizeHistogram", - new DropwizardHistogramWrapper(dropwizardDataFilesSizeHistogram)); - com.codahale.metrics.Histogram dropwizardDeleteFilesSizeHistogram = - new com.codahale.metrics.Histogram(new SlidingWindowReservoir(HISTOGRAM_RESERVOIR_SIZE)); - this.deleteFilesSizeHistogram = - writerMetrics.histogram( - "deleteFilesSizeHistogram", - new DropwizardHistogramWrapper(dropwizardDeleteFilesSizeHistogram)); + this.dataFilesSizeHistogram = registerHistogram(writerMetrics, "dataFilesSizeHistogram"); + this.deleteFilesSizeHistogram = registerHistogram(writerMetrics, "deleteFilesSizeHistogram"); } public void updateFlushResult(WriteResult result) { @@ -74,16 +74,21 @@ public void updateFlushResult(WriteResult result) { // This should works equally well and we avoided the overhead of tracking the list of file sizes // in the {@link CommitSummary}, which currently stores 
simple stats for counters and gauges // metrics. - Arrays.stream(result.dataFiles()) - .forEach( - dataFile -> { - dataFilesSizeHistogram.update(dataFile.fileSizeInBytes()); - }); - Arrays.stream(result.deleteFiles()) - .forEach( - deleteFile -> { - deleteFilesSizeHistogram.update(ScanTaskUtil.contentSizeInBytes(deleteFile)); - }); + if (dataFilesSizeHistogram != null) { + Arrays.stream(result.dataFiles()) + .forEach( + dataFile -> { + dataFilesSizeHistogram.update(dataFile.fileSizeInBytes()); + }); + } + + if (deleteFilesSizeHistogram != null) { + Arrays.stream(result.deleteFiles()) + .forEach( + deleteFile -> { + deleteFilesSizeHistogram.update(ScanTaskUtil.contentSizeInBytes(deleteFile)); + }); + } } public void flushDuration(long flushDurationMs) { @@ -97,4 +102,60 @@ public Counter getFlushedDataFiles() { public Counter getFlushedDeleteFiles() { return flushedDeleteFiles; } + + @VisibleForTesting + Histogram dataFilesSizeHistogram() { + return dataFilesSizeHistogram; + } + + @VisibleForTesting + Histogram deleteFilesSizeHistogram() { + return deleteFilesSizeHistogram; + } + + private static Histogram registerHistogram(MetricGroup group, String name) { + Histogram histogram = newDropwizardHistogram(); + return histogram != null ? 
group.histogram(name, histogram) : null; + } + + private static Histogram newDropwizardHistogram() { + if (DROPWIZARD == null) { + return null; + } + + Object reservoir = DROPWIZARD.reservoirCtor.newInstance(HISTOGRAM_RESERVOIR_SIZE); + Object codahaleHistogram = DROPWIZARD.histogramCtor.newInstance(reservoir); + return DROPWIZARD.wrapperCtor.newInstance(codahaleHistogram); + } + + private static DropwizardCtors loadDropwizardCtors() { + try { + Class reservoirInterface = + DynClasses.builder().impl("com.codahale.metrics.Reservoir").buildChecked(); + Class codahaleHistogramClass = + DynClasses.builder().impl("com.codahale.metrics.Histogram").buildChecked(); + return new DropwizardCtors( + DynConstructors.builder() + .impl("com.codahale.metrics.SlidingWindowReservoir", int.class) + .buildChecked(), + DynConstructors.builder() + .impl("com.codahale.metrics.Histogram", reservoirInterface) + .buildChecked(), + DynConstructors.builder(Histogram.class) + .impl( + "org.apache.flink.dropwizard.metrics.DropwizardHistogramWrapper", + codahaleHistogramClass) + .buildChecked()); + } catch (ClassNotFoundException | NoSuchMethodException e) { + LOG.warn( + "Cannot load Dropwizard metrics; is org.apache.flink:flink-metrics-dropwizard on the classpath?", + e); + return null; + } + } + + private record DropwizardCtors( + DynConstructors.Ctor reservoirCtor, + DynConstructors.Ctor histogramCtor, + DynConstructors.Ctor wrapperCtor) {} } diff --git a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergStreamWriterMetrics.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergStreamWriterMetrics.java new file mode 100644 index 000000000000..42bbfc0d3628 --- /dev/null +++ b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergStreamWriterMetrics.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatNoException; + +import org.apache.flink.metrics.groups.UnregisteredMetricsGroup; +import org.apache.iceberg.io.WriteResult; +import org.junit.jupiter.api.Test; + +public class TestIcebergStreamWriterMetrics { + + @Test + void histogramsCreatedWhenDropwizardAvailable() { + IcebergStreamWriterMetrics metrics = + new IcebergStreamWriterMetrics( + UnregisteredMetricsGroup.createSinkWriterMetricGroup(), "db.table"); + + assertThat(metrics.dataFilesSizeHistogram()).isNotNull(); + assertThat(metrics.deleteFilesSizeHistogram()).isNotNull(); + + assertThatNoException() + .isThrownBy(() -> metrics.updateFlushResult(WriteResult.builder().build())); + } +} From 0bae0503bcec99e5b725da18430458f38749888c Mon Sep 17 00:00:00 2001 From: Maximilian Michels Date: Wed, 6 May 2026 11:22:23 +0200 Subject: [PATCH 162/197] Build: Correct actions/labeler version comment to v6.0.1 (#16225) --- .github/workflows/labeler.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml index 16aac23a5683..3735367053ce 100644 --- a/.github/workflows/labeler.yml +++ 
b/.github/workflows/labeler.yml @@ -28,6 +28,6 @@ jobs: triage: runs-on: ubuntu-slim steps: - - uses: actions/labeler@634933edcd8ababfe52f92936142cc22ac488b1b # v6 + - uses: actions/labeler@634933edcd8ababfe52f92936142cc22ac488b1b # v6.0.1 with: sync-labels: true From ef077f45881d7700160eedfd4c129e64bd3aa93c Mon Sep 17 00:00:00 2001 From: sagib-sqream <135829055+sagib1@users.noreply.github.com> Date: Wed, 6 May 2026 17:49:17 +0300 Subject: [PATCH 163/197] Core: Fix JdbcCatalog & InMemoryCatalog to prevent dropping parent namespaces with children (#16061) * Fix for issue #16060 * formatting * formatting * CR fix * Enforce child namespaces scan also on InMemoryCatalog * empty commit for triggering failed CI again (failed on zizmor job) * CR requirements --- .../iceberg/inmemory/InMemoryCatalog.java | 7 ++++ .../org/apache/iceberg/jdbc/JdbcCatalog.java | 7 ++++ .../apache/iceberg/catalog/CatalogTests.java | 38 +++++++++++++++++++ .../iceberg/inmemory/TestInMemoryCatalog.java | 5 +++ .../apache/iceberg/jdbc/TestJdbcCatalog.java | 4 +- 5 files changed, 59 insertions(+), 2 deletions(-) diff --git a/core/src/main/java/org/apache/iceberg/inmemory/InMemoryCatalog.java b/core/src/main/java/org/apache/iceberg/inmemory/InMemoryCatalog.java index 55c982f3d625..2234d418de40 100644 --- a/core/src/main/java/org/apache/iceberg/inmemory/InMemoryCatalog.java +++ b/core/src/main/java/org/apache/iceberg/inmemory/InMemoryCatalog.java @@ -219,6 +219,13 @@ public boolean dropNamespace(Namespace namespace) throws NamespaceNotEmptyExcept return false; } + List childNamespaces = listNamespaces(namespace); + if (!childNamespaces.isEmpty()) { + throw new NamespaceNotEmptyException( + "Namespace %s is not empty. 
Contains %d child namespace(s).", + namespace, childNamespaces.size()); + } + List tableIdentifiers = listTables(namespace); if (!tableIdentifiers.isEmpty()) { throw new NamespaceNotEmptyException( diff --git a/core/src/main/java/org/apache/iceberg/jdbc/JdbcCatalog.java b/core/src/main/java/org/apache/iceberg/jdbc/JdbcCatalog.java index 007821da39fe..2d24e5598ac7 100644 --- a/core/src/main/java/org/apache/iceberg/jdbc/JdbcCatalog.java +++ b/core/src/main/java/org/apache/iceberg/jdbc/JdbcCatalog.java @@ -543,6 +543,13 @@ public boolean dropNamespace(Namespace namespace) throws NamespaceNotEmptyExcept return false; } + List childNamespaces = listNamespaces(namespace); + if (childNamespaces != null && !childNamespaces.isEmpty()) { + throw new NamespaceNotEmptyException( + "Namespace %s is not empty. Contains %d child namespace(s).", + namespace, childNamespaces.size()); + } + List tableIdentifiers = listTables(namespace); if (tableIdentifiers != null && !tableIdentifiers.isEmpty()) { throw new NamespaceNotEmptyException( diff --git a/core/src/test/java/org/apache/iceberg/catalog/CatalogTests.java b/core/src/test/java/org/apache/iceberg/catalog/CatalogTests.java index 9053f21ea112..8997cf15a08c 100644 --- a/core/src/test/java/org/apache/iceberg/catalog/CatalogTests.java +++ b/core/src/test/java/org/apache/iceberg/catalog/CatalogTests.java @@ -431,6 +431,44 @@ public void testDropNonEmptyNamespace() { assertThat(catalog.namespaceExists(NS)).as("Namespace should not exist").isFalse(); } + @Test + public void testDropNamespaceWithNestedNamespace() { + assumeThat(supportsNestedNamespaces()) + .as("Only valid when the catalog supports nested namespaces") + .isTrue(); + + C catalog = catalog(); + + Namespace parent = Namespace.of("parent"); + Namespace nested = Namespace.of("parent", "child"); + + assertThat(catalog.namespaceExists(parent)).as("Parent namespace should not exist").isFalse(); + assertThat(catalog.namespaceExists(nested)).as("Nested namespace should not 
exist").isFalse(); + + catalog.createNamespace(parent); + catalog.createNamespace(nested); + + assertThat(catalog.namespaceExists(parent)).as("Parent namespace should exist").isTrue(); + assertThat(catalog.namespaceExists(nested)).as("Nested namespace should exist").isTrue(); + + assertThatThrownBy(() -> catalog.dropNamespace(parent)) + .isInstanceOf(NamespaceNotEmptyException.class) + .hasMessageContaining("is not empty"); + + assertThat(catalog.namespaceExists(parent)).as("Parent namespace should still exist").isTrue(); + assertThat(catalog.namespaceExists(nested)).as("Nested namespace should still exist").isTrue(); + + assertThat(catalog.dropNamespace(nested)) + .as("Dropping an existing nested namespace should return true") + .isTrue(); + assertThat(catalog.namespaceExists(nested)).as("Nested namespace should not exist").isFalse(); + + assertThat(catalog.dropNamespace(parent)) + .as("Dropping an existing namespace should return true") + .isTrue(); + assertThat(catalog.namespaceExists(parent)).as("Parent namespace should not exist").isFalse(); + } + @Test public void testListNamespaces() { C catalog = catalog(); diff --git a/core/src/test/java/org/apache/iceberg/inmemory/TestInMemoryCatalog.java b/core/src/test/java/org/apache/iceberg/inmemory/TestInMemoryCatalog.java index c2c683e7d882..827450d4a398 100644 --- a/core/src/test/java/org/apache/iceberg/inmemory/TestInMemoryCatalog.java +++ b/core/src/test/java/org/apache/iceberg/inmemory/TestInMemoryCatalog.java @@ -82,6 +82,11 @@ protected boolean supportsEmptyNamespace() { return true; } + @Override + protected boolean supportsNestedNamespaces() { + return true; + } + @Test @Override public void testLoadTableWithMissingMetadataFile(@TempDir Path tempDir) throws IOException { diff --git a/core/src/test/java/org/apache/iceberg/jdbc/TestJdbcCatalog.java b/core/src/test/java/org/apache/iceberg/jdbc/TestJdbcCatalog.java index 310d918849f3..ff0af5c56306 100644 --- 
a/core/src/test/java/org/apache/iceberg/jdbc/TestJdbcCatalog.java +++ b/core/src/test/java/org/apache/iceberg/jdbc/TestJdbcCatalog.java @@ -852,11 +852,11 @@ public void testDropNamespace() { assertThatThrownBy(() -> catalog.dropNamespace(tbl2.namespace())) .isInstanceOf(NamespaceNotEmptyException.class) - .hasMessage("Namespace db.ns1 is not empty. Contains 1 table(s)."); + .hasMessage("Namespace db.ns1 is not empty. Contains 1 child namespace(s)."); assertThatThrownBy(() -> catalog.dropNamespace(tbl4.namespace())) .isInstanceOf(NamespaceNotEmptyException.class) - .hasMessage("Namespace db is not empty. Contains 1 table(s)."); + .hasMessage("Namespace db is not empty. Contains 2 child namespace(s)."); } @Test From b84b37f430b6c6e6b80e0a11d16c4d842b9da2a0 Mon Sep 17 00:00:00 2001 From: Hongyue/Steve Zhang Date: Wed, 6 May 2026 09:09:39 -0700 Subject: [PATCH 164/197] Core: Replace string-based schema projection with selection on field-id (#16184) --- .../apache/iceberg/FileCleanupStrategy.java | 19 +-- .../org/apache/iceberg/ManifestReader.java | 10 +- .../org/apache/iceberg/PartitionsTable.java | 127 +++++++++++------- 3 files changed, 95 insertions(+), 61 deletions(-) diff --git a/core/src/main/java/org/apache/iceberg/FileCleanupStrategy.java b/core/src/main/java/org/apache/iceberg/FileCleanupStrategy.java index dd92d33cda79..573aef057ff6 100644 --- a/core/src/main/java/org/apache/iceberg/FileCleanupStrategy.java +++ b/core/src/main/java/org/apache/iceberg/FileCleanupStrategy.java @@ -26,7 +26,9 @@ import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.io.FileIO; import org.apache.iceberg.io.SupportsBulkOperations; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; import org.apache.iceberg.relocated.com.google.common.collect.Sets; +import org.apache.iceberg.types.TypeUtil; import org.apache.iceberg.util.Tasks; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -78,14 +80,15 @@ public abstract void cleanFiles( 
ExpireSnapshots.CleanupLevel cleanupLevel); private static final Schema MANIFEST_PROJECTION = - ManifestFile.schema() - .select( - "manifest_path", - "manifest_length", - "partition_spec_id", - "added_snapshot_id", - "added_files_count", - "deleted_files_count"); + TypeUtil.select( + ManifestFile.schema(), + ImmutableSet.of( + ManifestFile.PATH.fieldId(), + ManifestFile.LENGTH.fieldId(), + ManifestFile.SPEC_ID.fieldId(), + ManifestFile.SNAPSHOT_ID.fieldId(), + ManifestFile.ADDED_FILES_COUNT.fieldId(), + ManifestFile.DELETED_FILES_COUNT.fieldId())); protected CloseableIterable readManifests(Snapshot snapshot) { if (snapshot.manifestListLocation() != null) { diff --git a/core/src/main/java/org/apache/iceberg/ManifestReader.java b/core/src/main/java/org/apache/iceberg/ManifestReader.java index 668a3764de1d..09bbe8b0cc6b 100644 --- a/core/src/main/java/org/apache/iceberg/ManifestReader.java +++ b/core/src/main/java/org/apache/iceberg/ManifestReader.java @@ -43,6 +43,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.relocated.com.google.common.collect.Sets; +import org.apache.iceberg.types.TypeUtil; import org.apache.iceberg.types.Types; import org.apache.iceberg.util.PartitionSet; import org.slf4j.Logger; @@ -68,6 +69,11 @@ public class ManifestReader> extends CloseableGroup "upper_bounds", "record_count"); + private static final Schema STATUS_ONLY_PROJECTION = + TypeUtil.select( + ManifestEntry.getSchema(Types.StructType.of()), + ImmutableSet.of(ManifestEntry.STATUS.fieldId())); + protected enum FileType { DATA_FILES(GenericDataFile.class), DELETE_FILES(GenericDeleteFile.class); @@ -157,9 +163,7 @@ private static > Map readMetadata(Input Map metadata; try { try (CloseableIterable> headerReader = - InternalData.read(FileFormat.AVRO, inputFile) - .project(ManifestEntry.getSchema(Types.StructType.of()).select("status")) - .build()) { + 
InternalData.read(FileFormat.AVRO, inputFile).project(STATUS_ONLY_PROJECTION).build()) { if (headerReader instanceof AvroIterable) { metadata = ((AvroIterable>) headerReader).getMetadata(); diff --git a/core/src/main/java/org/apache/iceberg/PartitionsTable.java b/core/src/main/java/org/apache/iceberg/PartitionsTable.java index 09c6e7893b7e..10366db5a55d 100644 --- a/core/src/main/java/org/apache/iceberg/PartitionsTable.java +++ b/core/src/main/java/org/apache/iceberg/PartitionsTable.java @@ -27,7 +27,9 @@ import org.apache.iceberg.expressions.ManifestEvaluator; import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; import org.apache.iceberg.types.Comparators; +import org.apache.iceberg.types.TypeUtil; import org.apache.iceberg.types.Types; import org.apache.iceberg.util.ParallelIterable; import org.apache.iceberg.util.PartitionUtil; @@ -37,6 +39,58 @@ /** A {@link Table} implementation that exposes a table's partitions as rows. 
*/ public class PartitionsTable extends BaseMetadataTable { + private static final int PARTITION_FIELD_ID = 1; + + private static final Types.NestedField SPEC_ID = + Types.NestedField.required(4, "spec_id", Types.IntegerType.get()); + private static final Types.NestedField RECORD_COUNT = + Types.NestedField.required( + 2, "record_count", Types.LongType.get(), "Count of records in data files"); + private static final Types.NestedField FILE_COUNT = + Types.NestedField.required(3, "file_count", Types.IntegerType.get(), "Count of data files"); + private static final Types.NestedField TOTAL_DATA_FILE_SIZE_IN_BYTES = + Types.NestedField.required( + 11, + "total_data_file_size_in_bytes", + Types.LongType.get(), + "Total size in bytes of data files"); + private static final Types.NestedField POSITION_DELETE_RECORD_COUNT = + Types.NestedField.required( + 5, + "position_delete_record_count", + Types.LongType.get(), + "Count of records in position delete files"); + private static final Types.NestedField POSITION_DELETE_FILE_COUNT = + Types.NestedField.required( + 6, + "position_delete_file_count", + Types.IntegerType.get(), + "Count of position delete files"); + private static final Types.NestedField EQUALITY_DELETE_RECORD_COUNT = + Types.NestedField.required( + 7, + "equality_delete_record_count", + Types.LongType.get(), + "Count of records in equality delete files"); + private static final Types.NestedField EQUALITY_DELETE_FILE_COUNT = + Types.NestedField.required( + 8, + "equality_delete_file_count", + Types.IntegerType.get(), + "Count of equality delete files"); + private static final Types.NestedField LAST_UPDATED_AT = + Types.NestedField.optional( + 9, + "last_updated_at", + Types.TimestampType.withZone(), + "Commit time of snapshot that last updated this partition"); + private static final Types.NestedField LAST_UPDATED_SNAPSHOT_ID = + Types.NestedField.optional( + 10, + "last_updated_snapshot_id", + Types.LongType.get(), + "Id of snapshot that last updated this 
partition"); + private final Schema schema; private final boolean unpartitionedTable; @@ -50,47 +104,18 @@ public class PartitionsTable extends BaseMetadataTable { this.schema = new Schema( - Types.NestedField.required(1, "partition", Partitioning.partitionType(table)), - Types.NestedField.required(4, "spec_id", Types.IntegerType.get()), - Types.NestedField.required( - 2, "record_count", Types.LongType.get(), "Count of records in data files"), - Types.NestedField.required( - 3, "file_count", Types.IntegerType.get(), "Count of data files"), - Types.NestedField.required( - 11, - "total_data_file_size_in_bytes", - Types.LongType.get(), - "Total size in bytes of data files"), Types.NestedField.required( - 5, - "position_delete_record_count", - Types.LongType.get(), - "Count of records in position delete files"), - Types.NestedField.required( - 6, - "position_delete_file_count", - Types.IntegerType.get(), - "Count of position delete files"), - Types.NestedField.required( - 7, - "equality_delete_record_count", - Types.LongType.get(), - "Count of records in equality delete files"), - Types.NestedField.required( - 8, - "equality_delete_file_count", - Types.IntegerType.get(), - "Count of equality delete files"), - Types.NestedField.optional( - 9, - "last_updated_at", - Types.TimestampType.withZone(), - "Commit time of snapshot that last updated this partition"), - Types.NestedField.optional( - 10, - "last_updated_snapshot_id", - Types.LongType.get(), - "Id of snapshot that last updated this partition")); + PARTITION_FIELD_ID, "partition", Partitioning.partitionType(table)), + SPEC_ID, + RECORD_COUNT, + FILE_COUNT, + TOTAL_DATA_FILE_SIZE_IN_BYTES, + POSITION_DELETE_RECORD_COUNT, + POSITION_DELETE_FILE_COUNT, + EQUALITY_DELETE_RECORD_COUNT, + EQUALITY_DELETE_FILE_COUNT, + LAST_UPDATED_AT, + LAST_UPDATED_SNAPSHOT_ID); this.unpartitionedTable = Partitioning.partitionType(table).fields().isEmpty(); } @@ -102,16 +127,18 @@ public TableScan newScan() { @Override public Schema 
schema() { if (unpartitionedTable) { - return schema.select( - "record_count", - "file_count", - "total_data_file_size_in_bytes", - "position_delete_record_count", - "position_delete_file_count", - "equality_delete_record_count", - "equality_delete_file_count", - "last_updated_at", - "last_updated_snapshot_id"); + return TypeUtil.select( + schema, + ImmutableSet.of( + RECORD_COUNT.fieldId(), + FILE_COUNT.fieldId(), + TOTAL_DATA_FILE_SIZE_IN_BYTES.fieldId(), + POSITION_DELETE_RECORD_COUNT.fieldId(), + POSITION_DELETE_FILE_COUNT.fieldId(), + EQUALITY_DELETE_RECORD_COUNT.fieldId(), + EQUALITY_DELETE_FILE_COUNT.fieldId(), + LAST_UPDATED_AT.fieldId(), + LAST_UPDATED_SNAPSHOT_ID.fieldId())); } return schema; } From d7cb7994510c08d2b55352195773763d93243d9c Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Wed, 6 May 2026 12:32:43 -0400 Subject: [PATCH 165/197] Flink: Backport removal of optional flink-metrics-dropwizard dependency to v2.0 and v1.20 (#16230) --- flink/v1.20/build.gradle | 3 - flink/v1.20/flink-runtime/LICENSE | 16 --- flink/v1.20/flink-runtime/runtime-deps.txt | 2 - .../sink/IcebergStreamWriterMetrics.java | 109 ++++++++++++++---- .../sink/TestIcebergStreamWriterMetrics.java | 42 +++++++ flink/v2.0/build.gradle | 3 - flink/v2.0/flink-runtime/LICENSE | 16 --- flink/v2.0/flink-runtime/runtime-deps.txt | 2 - .../sink/IcebergStreamWriterMetrics.java | 109 ++++++++++++++---- .../sink/TestIcebergStreamWriterMetrics.java | 42 +++++++ 10 files changed, 254 insertions(+), 90 deletions(-) create mode 100644 flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergStreamWriterMetrics.java create mode 100644 flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergStreamWriterMetrics.java diff --git a/flink/v1.20/build.gradle b/flink/v1.20/build.gradle index 2bbad1891c81..c7ca24817bc9 100644 --- a/flink/v1.20/build.gradle +++ b/flink/v1.20/build.gradle @@ -169,9 +169,6 @@ project(":iceberg-flink:iceberg-flink-runtime-${flinkMajorVersion}") 
{ exclude group: 'com.google.code.findbugs', module: 'jsr305' } - // To support dropwizard histogram metrics (not shipped by Flink by default) - implementation libs.flink120.metrics.dropwizard - // for integration testing with the flink-runtime-jar // all of those dependencies are required because the integration test extends FlinkTestBase integrationCompileOnly project(':iceberg-api') diff --git a/flink/v1.20/flink-runtime/LICENSE b/flink/v1.20/flink-runtime/LICENSE index 11460c3307c8..364652a5aca2 100644 --- a/flink/v1.20/flink-runtime/LICENSE +++ b/flink/v1.20/flink-runtime/LICENSE @@ -460,22 +460,6 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Dropwizard Metrics. - -Copyright: (c) 2010-2013 Coda Hale, Yammer.com, 2014-2021 Dropwizard Team -Project URL: https://github.com/dropwizard/metrics -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Apache Flink's optional support for Dropwizard Metrics. - -Copyright: 2014-2026 The Apache Software Foundation -Project URL: https://flink.apache.org/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - This product bundles RoaringBitmap. Copyright: (c) 2013-... 
the RoaringBitmap authors diff --git a/flink/v1.20/flink-runtime/runtime-deps.txt b/flink/v1.20/flink-runtime/runtime-deps.txt index 7c7aed1e4357..00c53ed388d0 100644 --- a/flink/v1.20/flink-runtime/runtime-deps.txt +++ b/flink/v1.20/flink-runtime/runtime-deps.txt @@ -6,11 +6,9 @@ com.github.luben:zstd-jni:1.5.7-3 com.google.errorprone:error_prone_annotations:2.10.0 dev.failsafe:failsafe:3.3.2 io.airlift:aircompressor:2.0.3 -io.dropwizard.metrics:metrics-core:3.2.6 org.apache.avro:avro:1.12.1 org.apache.datasketches:datasketches-java:6.2.0 org.apache.datasketches:datasketches-memory:3.0.2 -org.apache.flink:flink-metrics-dropwizard:1.20.1 org.apache.httpcomponents.client5:httpclient5:5.6 org.apache.httpcomponents.core5:httpcore5-h2:5.4 org.apache.httpcomponents.core5:httpcore5:5.4 diff --git a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriterMetrics.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriterMetrics.java index 434f3969577f..6cf15ff713fb 100644 --- a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriterMetrics.java +++ b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriterMetrics.java @@ -18,23 +18,33 @@ */ package org.apache.iceberg.flink.sink; -import com.codahale.metrics.SlidingWindowReservoir; import java.util.Arrays; import java.util.concurrent.atomic.AtomicLong; import org.apache.flink.annotation.Internal; -import org.apache.flink.dropwizard.metrics.DropwizardHistogramWrapper; +import org.apache.flink.annotation.VisibleForTesting; import org.apache.flink.metrics.Counter; import org.apache.flink.metrics.Histogram; import org.apache.flink.metrics.MetricGroup; +import org.apache.iceberg.common.DynClasses; +import org.apache.iceberg.common.DynConstructors; import org.apache.iceberg.io.WriteResult; import org.apache.iceberg.util.ScanTaskUtil; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; @Internal public class 
IcebergStreamWriterMetrics { + + private static final Logger LOG = LoggerFactory.getLogger(IcebergStreamWriterMetrics.class); + // 1,024 reservoir size should cost about 8KB, which is quite small. // It should also produce good accuracy for histogram distribution (like percentiles). private static final int HISTOGRAM_RESERVOIR_SIZE = 1024; + // Histogram metrics loaded through Flink's optional flink-metrics-dropwizard dependency. + // Will be null if not available. + private static final DropwizardCtors DROPWIZARD = loadDropwizardCtors(); + private final Counter flushedDataFiles; private final Counter flushedDeleteFiles; private final Counter flushedReferencedDataFiles; @@ -51,18 +61,8 @@ public IcebergStreamWriterMetrics(MetricGroup metrics, String fullTableName) { this.lastFlushDurationMs = new AtomicLong(); writerMetrics.gauge("lastFlushDurationMs", lastFlushDurationMs::get); - com.codahale.metrics.Histogram dropwizardDataFilesSizeHistogram = - new com.codahale.metrics.Histogram(new SlidingWindowReservoir(HISTOGRAM_RESERVOIR_SIZE)); - this.dataFilesSizeHistogram = - writerMetrics.histogram( - "dataFilesSizeHistogram", - new DropwizardHistogramWrapper(dropwizardDataFilesSizeHistogram)); - com.codahale.metrics.Histogram dropwizardDeleteFilesSizeHistogram = - new com.codahale.metrics.Histogram(new SlidingWindowReservoir(HISTOGRAM_RESERVOIR_SIZE)); - this.deleteFilesSizeHistogram = - writerMetrics.histogram( - "deleteFilesSizeHistogram", - new DropwizardHistogramWrapper(dropwizardDeleteFilesSizeHistogram)); + this.dataFilesSizeHistogram = registerHistogram(writerMetrics, "dataFilesSizeHistogram"); + this.deleteFilesSizeHistogram = registerHistogram(writerMetrics, "deleteFilesSizeHistogram"); } public void updateFlushResult(WriteResult result) { @@ -74,16 +74,21 @@ public void updateFlushResult(WriteResult result) { // This should works equally well and we avoided the overhead of tracking the list of file sizes // in the {@link CommitSummary}, which currently stores 
simple stats for counters and gauges // metrics. - Arrays.stream(result.dataFiles()) - .forEach( - dataFile -> { - dataFilesSizeHistogram.update(dataFile.fileSizeInBytes()); - }); - Arrays.stream(result.deleteFiles()) - .forEach( - deleteFile -> { - deleteFilesSizeHistogram.update(ScanTaskUtil.contentSizeInBytes(deleteFile)); - }); + if (dataFilesSizeHistogram != null) { + Arrays.stream(result.dataFiles()) + .forEach( + dataFile -> { + dataFilesSizeHistogram.update(dataFile.fileSizeInBytes()); + }); + } + + if (deleteFilesSizeHistogram != null) { + Arrays.stream(result.deleteFiles()) + .forEach( + deleteFile -> { + deleteFilesSizeHistogram.update(ScanTaskUtil.contentSizeInBytes(deleteFile)); + }); + } } public void flushDuration(long flushDurationMs) { @@ -97,4 +102,60 @@ public Counter getFlushedDataFiles() { public Counter getFlushedDeleteFiles() { return flushedDeleteFiles; } + + @VisibleForTesting + Histogram dataFilesSizeHistogram() { + return dataFilesSizeHistogram; + } + + @VisibleForTesting + Histogram deleteFilesSizeHistogram() { + return deleteFilesSizeHistogram; + } + + private static Histogram registerHistogram(MetricGroup group, String name) { + Histogram histogram = newDropwizardHistogram(); + return histogram != null ? 
group.histogram(name, histogram) : null; + } + + private static Histogram newDropwizardHistogram() { + if (DROPWIZARD == null) { + return null; + } + + Object reservoir = DROPWIZARD.reservoirCtor.newInstance(HISTOGRAM_RESERVOIR_SIZE); + Object codahaleHistogram = DROPWIZARD.histogramCtor.newInstance(reservoir); + return DROPWIZARD.wrapperCtor.newInstance(codahaleHistogram); + } + + private static DropwizardCtors loadDropwizardCtors() { + try { + Class reservoirInterface = + DynClasses.builder().impl("com.codahale.metrics.Reservoir").buildChecked(); + Class codahaleHistogramClass = + DynClasses.builder().impl("com.codahale.metrics.Histogram").buildChecked(); + return new DropwizardCtors( + DynConstructors.builder() + .impl("com.codahale.metrics.SlidingWindowReservoir", int.class) + .buildChecked(), + DynConstructors.builder() + .impl("com.codahale.metrics.Histogram", reservoirInterface) + .buildChecked(), + DynConstructors.builder(Histogram.class) + .impl( + "org.apache.flink.dropwizard.metrics.DropwizardHistogramWrapper", + codahaleHistogramClass) + .buildChecked()); + } catch (ClassNotFoundException | NoSuchMethodException e) { + LOG.warn( + "Cannot load Dropwizard metrics; is org.apache.flink:flink-metrics-dropwizard on the classpath?", + e); + return null; + } + } + + private record DropwizardCtors( + DynConstructors.Ctor reservoirCtor, + DynConstructors.Ctor histogramCtor, + DynConstructors.Ctor wrapperCtor) {} } diff --git a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergStreamWriterMetrics.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergStreamWriterMetrics.java new file mode 100644 index 000000000000..42bbfc0d3628 --- /dev/null +++ b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergStreamWriterMetrics.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatNoException; + +import org.apache.flink.metrics.groups.UnregisteredMetricsGroup; +import org.apache.iceberg.io.WriteResult; +import org.junit.jupiter.api.Test; + +public class TestIcebergStreamWriterMetrics { + + @Test + void histogramsCreatedWhenDropwizardAvailable() { + IcebergStreamWriterMetrics metrics = + new IcebergStreamWriterMetrics( + UnregisteredMetricsGroup.createSinkWriterMetricGroup(), "db.table"); + + assertThat(metrics.dataFilesSizeHistogram()).isNotNull(); + assertThat(metrics.deleteFilesSizeHistogram()).isNotNull(); + + assertThatNoException() + .isThrownBy(() -> metrics.updateFlushResult(WriteResult.builder().build())); + } +} diff --git a/flink/v2.0/build.gradle b/flink/v2.0/build.gradle index 626cc01b28e3..94f851e03221 100644 --- a/flink/v2.0/build.gradle +++ b/flink/v2.0/build.gradle @@ -169,9 +169,6 @@ project(":iceberg-flink:iceberg-flink-runtime-${flinkMajorVersion}") { exclude group: 'com.google.code.findbugs', module: 'jsr305' } - // To support dropwizard histogram metrics (not shipped by Flink by default) - implementation libs.flink20.metrics.dropwizard - // for integration testing with 
the flink-runtime-jar // all of those dependencies are required because the integration test extends FlinkTestBase integrationCompileOnly project(':iceberg-api') diff --git a/flink/v2.0/flink-runtime/LICENSE b/flink/v2.0/flink-runtime/LICENSE index 11460c3307c8..364652a5aca2 100644 --- a/flink/v2.0/flink-runtime/LICENSE +++ b/flink/v2.0/flink-runtime/LICENSE @@ -460,22 +460,6 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Dropwizard Metrics. - -Copyright: (c) 2010-2013 Coda Hale, Yammer.com, 2014-2021 Dropwizard Team -Project URL: https://github.com/dropwizard/metrics -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - -This product bundles Apache Flink's optional support for Dropwizard Metrics. - -Copyright: 2014-2026 The Apache Software Foundation -Project URL: https://flink.apache.org/ -License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - --------------------------------------------------------------------------------- - This product bundles RoaringBitmap. Copyright: (c) 2013-... 
the RoaringBitmap authors diff --git a/flink/v2.0/flink-runtime/runtime-deps.txt b/flink/v2.0/flink-runtime/runtime-deps.txt index c70e3fbba92c..00c53ed388d0 100644 --- a/flink/v2.0/flink-runtime/runtime-deps.txt +++ b/flink/v2.0/flink-runtime/runtime-deps.txt @@ -6,11 +6,9 @@ com.github.luben:zstd-jni:1.5.7-3 com.google.errorprone:error_prone_annotations:2.10.0 dev.failsafe:failsafe:3.3.2 io.airlift:aircompressor:2.0.3 -io.dropwizard.metrics:metrics-core:3.2.6 org.apache.avro:avro:1.12.1 org.apache.datasketches:datasketches-java:6.2.0 org.apache.datasketches:datasketches-memory:3.0.2 -org.apache.flink:flink-metrics-dropwizard:2.0.0 org.apache.httpcomponents.client5:httpclient5:5.6 org.apache.httpcomponents.core5:httpcore5-h2:5.4 org.apache.httpcomponents.core5:httpcore5:5.4 diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriterMetrics.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriterMetrics.java index 434f3969577f..6cf15ff713fb 100644 --- a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriterMetrics.java +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/IcebergStreamWriterMetrics.java @@ -18,23 +18,33 @@ */ package org.apache.iceberg.flink.sink; -import com.codahale.metrics.SlidingWindowReservoir; import java.util.Arrays; import java.util.concurrent.atomic.AtomicLong; import org.apache.flink.annotation.Internal; -import org.apache.flink.dropwizard.metrics.DropwizardHistogramWrapper; +import org.apache.flink.annotation.VisibleForTesting; import org.apache.flink.metrics.Counter; import org.apache.flink.metrics.Histogram; import org.apache.flink.metrics.MetricGroup; +import org.apache.iceberg.common.DynClasses; +import org.apache.iceberg.common.DynConstructors; import org.apache.iceberg.io.WriteResult; import org.apache.iceberg.util.ScanTaskUtil; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; @Internal public class 
IcebergStreamWriterMetrics { + + private static final Logger LOG = LoggerFactory.getLogger(IcebergStreamWriterMetrics.class); + // 1,024 reservoir size should cost about 8KB, which is quite small. // It should also produce good accuracy for histogram distribution (like percentiles). private static final int HISTOGRAM_RESERVOIR_SIZE = 1024; + // Histogram metrics loaded through Flink's optional flink-metrics-dropwizard dependency. + // Will be null if not available. + private static final DropwizardCtors DROPWIZARD = loadDropwizardCtors(); + private final Counter flushedDataFiles; private final Counter flushedDeleteFiles; private final Counter flushedReferencedDataFiles; @@ -51,18 +61,8 @@ public IcebergStreamWriterMetrics(MetricGroup metrics, String fullTableName) { this.lastFlushDurationMs = new AtomicLong(); writerMetrics.gauge("lastFlushDurationMs", lastFlushDurationMs::get); - com.codahale.metrics.Histogram dropwizardDataFilesSizeHistogram = - new com.codahale.metrics.Histogram(new SlidingWindowReservoir(HISTOGRAM_RESERVOIR_SIZE)); - this.dataFilesSizeHistogram = - writerMetrics.histogram( - "dataFilesSizeHistogram", - new DropwizardHistogramWrapper(dropwizardDataFilesSizeHistogram)); - com.codahale.metrics.Histogram dropwizardDeleteFilesSizeHistogram = - new com.codahale.metrics.Histogram(new SlidingWindowReservoir(HISTOGRAM_RESERVOIR_SIZE)); - this.deleteFilesSizeHistogram = - writerMetrics.histogram( - "deleteFilesSizeHistogram", - new DropwizardHistogramWrapper(dropwizardDeleteFilesSizeHistogram)); + this.dataFilesSizeHistogram = registerHistogram(writerMetrics, "dataFilesSizeHistogram"); + this.deleteFilesSizeHistogram = registerHistogram(writerMetrics, "deleteFilesSizeHistogram"); } public void updateFlushResult(WriteResult result) { @@ -74,16 +74,21 @@ public void updateFlushResult(WriteResult result) { // This should works equally well and we avoided the overhead of tracking the list of file sizes // in the {@link CommitSummary}, which currently stores 
simple stats for counters and gauges // metrics. - Arrays.stream(result.dataFiles()) - .forEach( - dataFile -> { - dataFilesSizeHistogram.update(dataFile.fileSizeInBytes()); - }); - Arrays.stream(result.deleteFiles()) - .forEach( - deleteFile -> { - deleteFilesSizeHistogram.update(ScanTaskUtil.contentSizeInBytes(deleteFile)); - }); + if (dataFilesSizeHistogram != null) { + Arrays.stream(result.dataFiles()) + .forEach( + dataFile -> { + dataFilesSizeHistogram.update(dataFile.fileSizeInBytes()); + }); + } + + if (deleteFilesSizeHistogram != null) { + Arrays.stream(result.deleteFiles()) + .forEach( + deleteFile -> { + deleteFilesSizeHistogram.update(ScanTaskUtil.contentSizeInBytes(deleteFile)); + }); + } } public void flushDuration(long flushDurationMs) { @@ -97,4 +102,60 @@ public Counter getFlushedDataFiles() { public Counter getFlushedDeleteFiles() { return flushedDeleteFiles; } + + @VisibleForTesting + Histogram dataFilesSizeHistogram() { + return dataFilesSizeHistogram; + } + + @VisibleForTesting + Histogram deleteFilesSizeHistogram() { + return deleteFilesSizeHistogram; + } + + private static Histogram registerHistogram(MetricGroup group, String name) { + Histogram histogram = newDropwizardHistogram(); + return histogram != null ? 
group.histogram(name, histogram) : null; + } + + private static Histogram newDropwizardHistogram() { + if (DROPWIZARD == null) { + return null; + } + + Object reservoir = DROPWIZARD.reservoirCtor.newInstance(HISTOGRAM_RESERVOIR_SIZE); + Object codahaleHistogram = DROPWIZARD.histogramCtor.newInstance(reservoir); + return DROPWIZARD.wrapperCtor.newInstance(codahaleHistogram); + } + + private static DropwizardCtors loadDropwizardCtors() { + try { + Class reservoirInterface = + DynClasses.builder().impl("com.codahale.metrics.Reservoir").buildChecked(); + Class codahaleHistogramClass = + DynClasses.builder().impl("com.codahale.metrics.Histogram").buildChecked(); + return new DropwizardCtors( + DynConstructors.builder() + .impl("com.codahale.metrics.SlidingWindowReservoir", int.class) + .buildChecked(), + DynConstructors.builder() + .impl("com.codahale.metrics.Histogram", reservoirInterface) + .buildChecked(), + DynConstructors.builder(Histogram.class) + .impl( + "org.apache.flink.dropwizard.metrics.DropwizardHistogramWrapper", + codahaleHistogramClass) + .buildChecked()); + } catch (ClassNotFoundException | NoSuchMethodException e) { + LOG.warn( + "Cannot load Dropwizard metrics; is org.apache.flink:flink-metrics-dropwizard on the classpath?", + e); + return null; + } + } + + private record DropwizardCtors( + DynConstructors.Ctor reservoirCtor, + DynConstructors.Ctor histogramCtor, + DynConstructors.Ctor wrapperCtor) {} } diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergStreamWriterMetrics.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergStreamWriterMetrics.java new file mode 100644 index 000000000000..42bbfc0d3628 --- /dev/null +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergStreamWriterMetrics.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.sink; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatNoException; + +import org.apache.flink.metrics.groups.UnregisteredMetricsGroup; +import org.apache.iceberg.io.WriteResult; +import org.junit.jupiter.api.Test; + +public class TestIcebergStreamWriterMetrics { + + @Test + void histogramsCreatedWhenDropwizardAvailable() { + IcebergStreamWriterMetrics metrics = + new IcebergStreamWriterMetrics( + UnregisteredMetricsGroup.createSinkWriterMetricGroup(), "db.table"); + + assertThat(metrics.dataFilesSizeHistogram()).isNotNull(); + assertThat(metrics.deleteFilesSizeHistogram()).isNotNull(); + + assertThatNoException() + .isThrownBy(() -> metrics.updateFlushResult(WriteResult.builder().build())); + } +} From 0d2707eaab3f1629bdcb55fdb5100fa256c83d19 Mon Sep 17 00:00:00 2001 From: Manu Zhang Date: Thu, 7 May 2026 00:41:29 +0800 Subject: [PATCH 166/197] Docs: Add missing v3 data types to status page (#16228) --- site/docs/status.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/site/docs/status.md b/site/docs/status.md index a7d2cfb38567..51d2f7271561 100644 --- a/site/docs/status.md +++ b/site/docs/status.md @@ -49,11 +49,14 @@ This section lists the libraries that 
implement the Apache Iceberg specification | timestamptz | Y | Y | Y | Y | Y | | timestamp_ns | Y | Y | Y | Y | N | | timestamptz_ns | Y | Y | Y | Y | N | +| unknown | Y | Y | N | Y | N | | string | Y | Y | Y | Y | Y | | uuid | Y | Y | Y | Y | N | | fixed | Y | Y | Y | Y | Y | | binary | Y | Y | Y | Y | Y | | variant | Y | Y | Y | Y | N | +| geometry | Y | N | N | N | N | +| geography | Y | N | N | N | N | | list | Y | Y | Y | Y | Y | | map | Y | Y | Y | Y | Y | | struct | Y | Y | Y | Y | Y | From b7ef9f1fa82ac1bcf57359e746128a706b2d232e Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Wed, 6 May 2026 13:02:43 -0400 Subject: [PATCH 167/197] CI: Use specific patch versions in workflow action comments (#16229) --- .../workflows/api-binary-compatibility.yml | 6 ++--- .github/workflows/asf-allowlist-check.yml | 2 +- .github/workflows/codeql.yml | 2 +- .github/workflows/delta-conversion-ci.yml | 12 +++++----- .github/workflows/docs-ci.yml | 4 ++-- .github/workflows/flink-ci.yml | 6 ++--- .github/workflows/hive-ci.yml | 6 ++--- .github/workflows/java-ci.yml | 24 +++++++++---------- .github/workflows/jmh-benchmarks.yml | 8 +++---- .github/workflows/kafka-connect-ci.yml | 6 ++--- .github/workflows/license-check.yml | 2 +- .github/workflows/open-api.yml | 2 +- .../publish-iceberg-rest-fixture-docker.yml | 6 ++--- .github/workflows/publish-snapshot.yml | 6 ++--- .../workflows/recurring-jmh-benchmarks.yml | 6 ++--- .github/workflows/site-ci.yml | 4 ++-- .github/workflows/spark-ci.yml | 6 ++--- 17 files changed, 54 insertions(+), 54 deletions(-) diff --git a/.github/workflows/api-binary-compatibility.yml b/.github/workflows/api-binary-compatibility.yml index 8ad0ebd26f0e..58a04c9427e0 100644 --- a/.github/workflows/api-binary-compatibility.yml +++ b/.github/workflows/api-binary-compatibility.yml @@ -46,7 +46,7 @@ jobs: revapi: runs-on: ubuntu-24.04 steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: 
actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: # fetch-depth of zero ensures that the tags are pulled in and we're not in a detached HEAD state # revapi depends on the tags, specifically the tag from git describe, to find the relevant override @@ -55,11 +55,11 @@ jobs: # See https://github.com/actions/checkout/issues/124 fetch-depth: 0 persist-credentials: false - - uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5 + - uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5.2.0 with: distribution: zulu java-version: 17 - - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 + - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5.0.2 - run: | echo "Using the old version tag, as per git describe, of $(git describe)"; - run: ./gradlew revapi --rerun-tasks diff --git a/.github/workflows/asf-allowlist-check.yml b/.github/workflows/asf-allowlist-check.yml index 65dbe8bcbee9..8d7952a9d29b 100644 --- a/.github/workflows/asf-allowlist-check.yml +++ b/.github/workflows/asf-allowlist-check.yml @@ -40,7 +40,7 @@ jobs: asf-allowlist-check: runs-on: ubuntu-24.04 steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: persist-credentials: false - uses: apache/infrastructure-actions/allowlist-check@4e9c961f587f72b170874b6f5cd4ac15f7f26eb8 # main diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 7e9c8208c888..49212916a3f2 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -41,7 +41,7 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: persist-credentials: false diff --git a/.github/workflows/delta-conversion-ci.yml b/.github/workflows/delta-conversion-ci.yml 
index 82442ac041f4..a1fb7fea9fdf 100644 --- a/.github/workflows/delta-conversion-ci.yml +++ b/.github/workflows/delta-conversion-ci.yml @@ -80,14 +80,14 @@ jobs: env: SPARK_LOCAL_IP: localhost steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: persist-credentials: false - - uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5 + - uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5.2.0 with: distribution: zulu java-version: ${{ matrix.jvm }} - - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 + - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5.0.2 - run: echo -e "$(ip addr show eth0 | grep "inet\b" | awk '{print $2}' | cut -d/ -f1)\t$(hostname -f) $(hostname -s)" | sudo tee -a /etc/hosts - run: ./gradlew -DsparkVersions=3.5 -DscalaVersion=2.12 -DkafkaVersions= -DflinkVersions= :iceberg-delta-lake:check -Pquick=true -x javadoc - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 @@ -106,14 +106,14 @@ jobs: env: SPARK_LOCAL_IP: localhost steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: persist-credentials: false - - uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5 + - uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5.2.0 with: distribution: zulu java-version: ${{ matrix.jvm }} - - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 + - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5.0.2 - run: echo -e "$(ip addr show eth0 | grep "inet\b" | awk '{print $2}' | cut -d/ -f1)\t$(hostname -f) $(hostname -s)" | sudo tee -a /etc/hosts - run: ./gradlew -DsparkVersions=3.5 -DscalaVersion=2.13 -DkafkaVersions= -DflinkVersions= 
:iceberg-delta-lake:check -Pquick=true -x javadoc - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 diff --git a/.github/workflows/docs-ci.yml b/.github/workflows/docs-ci.yml index ff6c6bdbd8cf..2bcda0bbc090 100644 --- a/.github/workflows/docs-ci.yml +++ b/.github/workflows/docs-ci.yml @@ -36,10 +36,10 @@ jobs: matrix: os: [ubuntu-latest, macos-latest] steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: persist-credentials: false - - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6 + - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 with: python-version: 3.x - name: Build Iceberg documentation diff --git a/.github/workflows/flink-ci.yml b/.github/workflows/flink-ci.yml index 5479503179fc..a515a71fa3be 100644 --- a/.github/workflows/flink-ci.yml +++ b/.github/workflows/flink-ci.yml @@ -84,14 +84,14 @@ jobs: env: SPARK_LOCAL_IP: localhost steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: persist-credentials: false - - uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5 + - uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5.2.0 with: distribution: zulu java-version: ${{ matrix.jvm }} - - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 + - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5.0.2 - run: echo -e "$(ip addr show eth0 | grep "inet\b" | awk '{print $2}' | cut -d/ -f1)\t$(hostname -f) $(hostname -s)" | sudo tee -a /etc/hosts - run: ./gradlew -DsparkVersions= -DkafkaVersions= -DflinkVersions=${{ matrix.flink }} :iceberg-flink:iceberg-flink-${{ matrix.flink }}:check :iceberg-flink:iceberg-flink-runtime-${{ matrix.flink }}:check -Pquick=true -x javadoc 
-DtestParallelism=auto - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 diff --git a/.github/workflows/hive-ci.yml b/.github/workflows/hive-ci.yml index 084ce9f4fcf4..4853508b854b 100644 --- a/.github/workflows/hive-ci.yml +++ b/.github/workflows/hive-ci.yml @@ -81,14 +81,14 @@ jobs: env: SPARK_LOCAL_IP: localhost steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: persist-credentials: false - - uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5 + - uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5.2.0 with: distribution: zulu java-version: ${{ matrix.jvm }} - - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 + - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5.0.2 - run: echo -e "$(ip addr show eth0 | grep "inet\b" | awk '{print $2}' | cut -d/ -f1)\t$(hostname -f) $(hostname -s)" | sudo tee -a /etc/hosts - run: ./gradlew -DsparkVersions= -DflinkVersions= -DkafkaVersions= -Pquick=true :iceberg-mr:check -x javadoc - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 diff --git a/.github/workflows/java-ci.yml b/.github/workflows/java-ci.yml index 3d489c574ff7..670fd78fb64e 100644 --- a/.github/workflows/java-ci.yml +++ b/.github/workflows/java-ci.yml @@ -76,14 +76,14 @@ jobs: env: SPARK_LOCAL_IP: localhost steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: persist-credentials: false - - uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5 + - uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5.2.0 with: distribution: zulu java-version: ${{ matrix.jvm }} - - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 + - uses: 
gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5.0.2 - run: echo -e "$(ip addr show eth0 | grep "inet\b" | awk '{print $2}' | cut -d/ -f1)\t$(hostname -f) $(hostname -s)" | sudo tee -a /etc/hosts - run: ./gradlew check -DsparkVersions= -DflinkVersions= -DkafkaVersions= -Pquick=true -x javadoc - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 @@ -100,14 +100,14 @@ jobs: matrix: jvm: [17, 21] steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: persist-credentials: false - - uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5 + - uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5.2.0 with: distribution: zulu java-version: ${{ matrix.jvm }} - - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 + - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5.0.2 - run: ./gradlew -DallModules build -x test -x javadoc -x integrationTest build-javadoc: @@ -117,25 +117,25 @@ jobs: matrix: jvm: [17, 21] steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: persist-credentials: false - - uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5 + - uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5.2.0 with: distribution: zulu java-version: ${{ matrix.jvm }} - - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 + - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5.0.2 - run: ./gradlew -Pquick=true javadoc check-runtime-deps: runs-on: ubuntu-24.04 steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: persist-credentials: 
false - - uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5 + - uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5.2.0 with: distribution: zulu java-version: 17 - - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 + - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5.0.2 - run: ./gradlew checkAllRuntimeDeps -q -DallModules=true diff --git a/.github/workflows/jmh-benchmarks.yml b/.github/workflows/jmh-benchmarks.yml index 6dbd3a6958fd..e2c9522a757c 100644 --- a/.github/workflows/jmh-benchmarks.yml +++ b/.github/workflows/jmh-benchmarks.yml @@ -49,7 +49,7 @@ jobs: matrix: ${{ steps.set-matrix.outputs.matrix }} foundlabel: ${{ steps.set-matrix.outputs.foundlabel }} steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: repository: ${{ github.event.inputs.repo }} ref: ${{ github.event.inputs.ref }} @@ -94,16 +94,16 @@ jobs: env: SPARK_LOCAL_IP: localhost steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: repository: ${{ github.event.inputs.repo }} ref: ${{ github.event.inputs.ref }} persist-credentials: false - - uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5 + - uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5.2.0 with: distribution: zulu java-version: 17 - - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 + - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5.0.2 - run: echo -e "$(ip addr show eth0 | grep "inet\b" | awk '{print $2}' | cut -d/ -f1)\t$(hostname -f) $(hostname -s)" | sudo tee -a /etc/hosts - name: Run Benchmark diff --git a/.github/workflows/kafka-connect-ci.yml b/.github/workflows/kafka-connect-ci.yml index 
7eaa042990ad..3b962aefbb02 100644 --- a/.github/workflows/kafka-connect-ci.yml +++ b/.github/workflows/kafka-connect-ci.yml @@ -81,14 +81,14 @@ jobs: env: SPARK_LOCAL_IP: localhost steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: persist-credentials: false - - uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5 + - uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5.2.0 with: distribution: zulu java-version: ${{ matrix.jvm }} - - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 + - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5.0.2 - run: echo -e "$(ip addr show eth0 | grep "inet\b" | awk '{print $2}' | cut -d/ -f1)\t$(hostname -f) $(hostname -s)" | sudo tee -a /etc/hosts - run: | ./gradlew -DsparkVersions= -DflinkVersions= -DkafkaVersions=3 \ diff --git a/.github/workflows/license-check.yml b/.github/workflows/license-check.yml index ccd2a9a429f3..cc285eabe101 100644 --- a/.github/workflows/license-check.yml +++ b/.github/workflows/license-check.yml @@ -27,7 +27,7 @@ jobs: rat: runs-on: ubuntu-24.04 steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: persist-credentials: false - run: | diff --git a/.github/workflows/open-api.yml b/.github/workflows/open-api.yml index fdc5bcda679e..28cd2ad89dfa 100644 --- a/.github/workflows/open-api.yml +++ b/.github/workflows/open-api.yml @@ -44,7 +44,7 @@ jobs: runs-on: ubuntu-slim steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: persist-credentials: false - name: Install uv diff --git a/.github/workflows/publish-iceberg-rest-fixture-docker.yml 
b/.github/workflows/publish-iceberg-rest-fixture-docker.yml index fabc62399c08..264e402deaac 100644 --- a/.github/workflows/publish-iceberg-rest-fixture-docker.yml +++ b/.github/workflows/publish-iceberg-rest-fixture-docker.yml @@ -41,14 +41,14 @@ jobs: runs-on: ubuntu-latest environment: docker-publish steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: persist-credentials: false - - uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5 + - uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5.2.0 with: distribution: zulu java-version: 21 - - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 + - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5.0.2 - name: Build Iceberg Open API project run: ./gradlew :iceberg-open-api:shadowJar - name: Login to Docker Hub diff --git a/.github/workflows/publish-snapshot.yml b/.github/workflows/publish-snapshot.yml index 1cbe5c706279..a8557c44f32b 100644 --- a/.github/workflows/publish-snapshot.yml +++ b/.github/workflows/publish-snapshot.yml @@ -34,16 +34,16 @@ jobs: runs-on: ubuntu-24.04 environment: maven-publish steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: # we need to fetch all tags so that getProjectVersion() in build.gradle correctly determines the next SNAPSHOT version from the newest tag fetch-depth: 0 persist-credentials: false - - uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5 + - uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5.2.0 with: distribution: zulu java-version: 17 - - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 + - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5.0.2 - env: 
NEXUS_USER: ${{ secrets.NEXUS_USER }} NEXUS_PW: ${{ secrets.NEXUS_PW }} diff --git a/.github/workflows/recurring-jmh-benchmarks.yml b/.github/workflows/recurring-jmh-benchmarks.yml index da2e7d60325c..88bb10566e43 100644 --- a/.github/workflows/recurring-jmh-benchmarks.yml +++ b/.github/workflows/recurring-jmh-benchmarks.yml @@ -51,14 +51,14 @@ jobs: env: SPARK_LOCAL_IP: localhost steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: persist-credentials: false - - uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5 + - uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5.2.0 with: distribution: zulu java-version: 17 - - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 + - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5.0.2 - run: echo -e "$(ip addr show eth0 | grep "inet\b" | awk '{print $2}' | cut -d/ -f1)\t$(hostname -f) $(hostname -s)" | sudo tee -a /etc/hosts - name: Run Benchmark diff --git a/.github/workflows/site-ci.yml b/.github/workflows/site-ci.yml index 6152d4970305..fbd18caeb6da 100644 --- a/.github/workflows/site-ci.yml +++ b/.github/workflows/site-ci.yml @@ -36,10 +36,10 @@ jobs: permissions: contents: write steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: persist-credentials: false - - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6 + - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 with: python-version: 3.x - name: Deploy Iceberg documentation diff --git a/.github/workflows/spark-ci.yml b/.github/workflows/spark-ci.yml index e9d77308f580..a6e7b1504231 100644 --- a/.github/workflows/spark-ci.yml +++ b/.github/workflows/spark-ci.yml @@ -91,14 +91,14 @@ jobs: env: SPARK_LOCAL_IP: 
localhost steps: - - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: persist-credentials: false - - uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5 + - uses: actions/setup-java@be666c2fcd27ec809703dec50e508c2fdc7f6654 # v5.2.0 with: distribution: zulu java-version: ${{ matrix.jvm }} - - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5 + - uses: gradle/actions/setup-gradle@0723195856401067f7a2779048b490ace7a47d7c # v5.0.2 - uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1 with: tool-cache: false From e2a119c822b65114ff34e4bc4967967927256603 Mon Sep 17 00:00:00 2001 From: Aihua Xu Date: Wed, 6 May 2026 13:14:15 -0700 Subject: [PATCH 168/197] Spark: Support writing shredded variant in Iceberg-Spark (#14297) * Spark shredded variant implementation * Add heuristics to determine the shredding schema * Simplify heuristics to most common type * Add to 4.1 * Add tie break and INT/DECIMAL promotion * Wire shredding writer through WriterFunction API * Fix decimal issue, null handling, heuristics and adding more tests * Adding BufferedFileAppender for deferred writer init * Adding VariantShreddingAnalyzer and withFileSchema support * Wiring the variant shredding write path via BufferedFileAppender * Fix checkstyle violations in SchemaInferenceVisitor and SparkFileWriterFactory * Wire variant shredding write path through FormatModel API as per PR feedback * Fix decimal overflow, array pruning, and buffer lifecycle in variant shredding * Test fix and pr comment * Fixing PR comments * Update doc for spark config * Core: Move DataTestHelpers to core and use in TestBufferedFileAppender Co-authored-by: Neelesh Salian Co-authored-by: Aihua Xu * Address reviewer feedback: decimal canWrite pre-check, analyzer javadoc string, decimal fallback tests * PR feedback for properties * PR comment typed 
value data --------- Co-authored-by: Neelesh Salian --- .../org/apache/iceberg/TableProperties.java | 6 + .../iceberg/io/BufferedFileAppender.java | 149 +++ .../apache/iceberg/data/DataTestHelpers.java | 0 .../iceberg/io/TestBufferedFileAppender.java | 227 ++++ docs/docs/configuration.md | 2 + docs/docs/spark-configuration.md | 4 + .../iceberg/parquet/ParquetFormatModel.java | 93 +- .../parquet/ParquetVariantWriters.java | 61 +- .../parquet/VariantShreddingAnalyzer.java | 532 ++++++++ .../iceberg/parquet/VariantWriterBuilder.java | 12 +- .../parquet/TestParquetDataWriter.java | 210 ++++ .../parquet/TestVariantShreddingAnalyzer.java | 475 +++++++ .../iceberg/spark/SparkSQLProperties.java | 8 + .../apache/iceberg/spark/SparkWriteConf.java | 30 + .../iceberg/spark/SparkWriteOptions.java | 6 + .../spark/source/SparkFormatModels.java | 4 +- .../source/SparkVariantShreddingAnalyzer.java | 69 ++ .../iceberg/spark/TestSparkWriteConf.java | 84 ++ .../spark/variant/TestVariantShredding.java | 1101 +++++++++++++++++ 19 files changed, 3061 insertions(+), 12 deletions(-) create mode 100644 core/src/main/java/org/apache/iceberg/io/BufferedFileAppender.java rename {data => core}/src/test/java/org/apache/iceberg/data/DataTestHelpers.java (100%) create mode 100644 core/src/test/java/org/apache/iceberg/io/TestBufferedFileAppender.java create mode 100644 parquet/src/main/java/org/apache/iceberg/parquet/VariantShreddingAnalyzer.java create mode 100644 parquet/src/test/java/org/apache/iceberg/parquet/TestVariantShreddingAnalyzer.java create mode 100644 spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkVariantShreddingAnalyzer.java create mode 100644 spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/variant/TestVariantShredding.java diff --git a/core/src/main/java/org/apache/iceberg/TableProperties.java b/core/src/main/java/org/apache/iceberg/TableProperties.java index 7100daef437e..021ef95d9122 100644 --- 
a/core/src/main/java/org/apache/iceberg/TableProperties.java +++ b/core/src/main/java/org/apache/iceberg/TableProperties.java @@ -158,6 +158,12 @@ private TableProperties() {} "write.delete.parquet.compression-level"; public static final String PARQUET_COMPRESSION_LEVEL_DEFAULT = null; + public static final String PARQUET_SHRED_VARIANTS = "write.parquet.shred-variants"; + public static final boolean PARQUET_SHRED_VARIANTS_DEFAULT = false; + public static final String PARQUET_VARIANT_BUFFER_SIZE = + "write.parquet.variant-inference-buffer-size"; + public static final int PARQUET_VARIANT_BUFFER_SIZE_DEFAULT = 100; + public static final String PARQUET_ROW_GROUP_CHECK_MIN_RECORD_COUNT = "write.parquet.row-group-check-min-record-count"; public static final String DELETE_PARQUET_ROW_GROUP_CHECK_MIN_RECORD_COUNT = diff --git a/core/src/main/java/org/apache/iceberg/io/BufferedFileAppender.java b/core/src/main/java/org/apache/iceberg/io/BufferedFileAppender.java new file mode 100644 index 000000000000..8f8ef8f33b76 --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/io/BufferedFileAppender.java @@ -0,0 +1,149 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.io; + +import java.io.IOException; +import java.util.List; +import java.util.function.Function; +import java.util.function.UnaryOperator; +import org.apache.iceberg.Metrics; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; + +/** + * A FileAppender that buffers the first N rows, then creates a delegate appender via a factory. + * + *

<p>The factory receives the buffered rows and is responsible for creating the real appender. Row + * replay is handled internally. All subsequent {@link #add} calls delegate directly to the real + * appender. + * + * <p>
      If fewer than {@code bufferSize} rows are written before close, the factory is called with + * whatever rows were buffered. If no rows were written, the factory is not called and no file is + * created on disk. In this case, {@link #metrics()} returns {@code new Metrics(0L)} and {@link + * #length()} returns {@code 0L}. + * + * @param the row type + */ +public class BufferedFileAppender implements FileAppender { + private final int bufferRowCount; + private final Function, FileAppender> appenderFactory; + private final UnaryOperator copyFunc; + private List buffer; + private FileAppender delegate; + private boolean closed = false; + + /** + * @param bufferRowCount number of rows to buffer before creating the delegate appender + * @param appenderFactory given the buffered rows, creates the delegate appender + */ + public BufferedFileAppender( + int bufferRowCount, Function, FileAppender> appenderFactory) { + this(bufferRowCount, appenderFactory, UnaryOperator.identity()); + } + + /** + * @param bufferRowCount number of rows to buffer before creating the delegate appender + * @param appenderFactory given the buffered rows, creates the delegate appender + * @param copyFunc copies a row before buffering (needed when row objects are reused, e.g. 
Spark + * InternalRow) + */ + public BufferedFileAppender( + int bufferRowCount, + Function, FileAppender> appenderFactory, + UnaryOperator copyFunc) { + Preconditions.checkArgument( + bufferRowCount > 0, "bufferRowCount must be > 0, got %s", bufferRowCount); + Preconditions.checkNotNull(appenderFactory, "appenderFactory must not be null"); + Preconditions.checkNotNull(copyFunc, "copyFunc must not be null"); + this.bufferRowCount = bufferRowCount; + this.appenderFactory = appenderFactory; + this.copyFunc = copyFunc; + this.buffer = Lists.newArrayListWithCapacity(bufferRowCount); + } + + @Override + public void add(D datum) { + Preconditions.checkState(!closed, "Cannot add to a closed appender"); + if (delegate != null) { + delegate.add(datum); + } else { + buffer.add(copyFunc.apply(datum)); + if (buffer.size() >= bufferRowCount) { + initialize(); + } + } + } + + @Override + public Metrics metrics() { + Preconditions.checkState(closed, "Cannot return metrics for unclosed appender"); + if (delegate == null) { + return new Metrics(0L); + } + + return delegate.metrics(); + } + + @Override + public long length() { + if (delegate != null) { + return delegate.length(); + } + + // No bytes written to disk yet; data is buffered in memory + return 0L; + } + + @Override + public List splitOffsets() { + if (delegate != null) { + return delegate.splitOffsets(); + } + + return null; + } + + @Override + public void close() throws IOException { + if (!closed) { + try { + if (delegate == null && buffer != null && !buffer.isEmpty()) { + initialize(); + } + + if (delegate != null) { + delegate.close(); + } + } finally { + this.closed = true; + this.buffer = null; + } + } + } + + private void initialize() { + delegate = appenderFactory.apply(buffer); + Preconditions.checkState(delegate != null, "appenderFactory must not return null"); + try { + buffer.forEach(delegate::add); + } finally { + buffer = null; + } + } +} diff --git 
a/data/src/test/java/org/apache/iceberg/data/DataTestHelpers.java b/core/src/test/java/org/apache/iceberg/data/DataTestHelpers.java similarity index 100% rename from data/src/test/java/org/apache/iceberg/data/DataTestHelpers.java rename to core/src/test/java/org/apache/iceberg/data/DataTestHelpers.java diff --git a/core/src/test/java/org/apache/iceberg/io/TestBufferedFileAppender.java b/core/src/test/java/org/apache/iceberg/io/TestBufferedFileAppender.java new file mode 100644 index 000000000000..9bbc0f9f8c71 --- /dev/null +++ b/core/src/test/java/org/apache/iceberg/io/TestBufferedFileAppender.java @@ -0,0 +1,227 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.io; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.io.IOException; +import java.util.List; +import java.util.function.Function; +import org.apache.iceberg.Schema; +import org.apache.iceberg.avro.Avro; +import org.apache.iceberg.avro.AvroIterable; +import org.apache.iceberg.data.DataTestHelpers; +import org.apache.iceberg.data.GenericRecord; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.data.avro.DataWriter; +import org.apache.iceberg.data.avro.PlannedDataReader; +import org.apache.iceberg.exceptions.RuntimeIOException; +import org.apache.iceberg.inmemory.InMemoryOutputFile; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +public class TestBufferedFileAppender { + + private static final Schema SCHEMA = + new Schema( + Types.NestedField.required(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get())); + + private InMemoryOutputFile outputFile; + private GenericRecord record; + + @BeforeEach + public void before() { + this.outputFile = new InMemoryOutputFile(); + this.record = GenericRecord.create(SCHEMA); + } + + private Function, FileAppender> avroFactory(OutputFile out) { + return bufferedRows -> { + try { + return Avro.write(out) + .createWriterFunc(DataWriter::create) + .schema(SCHEMA) + .overwrite() + .build(); + } catch (IOException e) { + throw new RuntimeIOException(e); + } + }; + } + + private BufferedFileAppender createAppender(int bufferSize) { + return new BufferedFileAppender<>(bufferSize, avroFactory(outputFile), Record::copy); + } + + private Record createRecord(long id, String data) { + return record.copy(ImmutableMap.of("id", id, "data", data)); + 
} + + private List readBack() throws IOException { + try (AvroIterable reader = + Avro.read(outputFile.toInputFile()) + .project(SCHEMA) + .createResolvingReader(PlannedDataReader::create) + .build()) { + return Lists.newArrayList(reader); + } + } + + @Test + public void testBufferFlushesOnThreshold() throws IOException { + BufferedFileAppender appender = createAppender(3); + + appender.add(createRecord(1L, "a")); + appender.add(createRecord(2L, "b")); + + // delegate not yet created, length should be 0 + assertThat(appender.length()).isEqualTo(0L); + + appender.add(createRecord(3L, "c")); + + // delegate created after 3rd row, length should be > 0 + assertThat(appender.length()).isGreaterThan(0L); + + appender.add(createRecord(4L, "d")); + appender.add(createRecord(5L, "e")); + appender.close(); + + List expected = + Lists.newArrayList( + createRecord(1L, "a"), + createRecord(2L, "b"), + createRecord(3L, "c"), + createRecord(4L, "d"), + createRecord(5L, "e")); + DataTestHelpers.assertEquals(SCHEMA.asStruct(), expected, readBack()); + } + + @Test + public void testCloseWithPartialBuffer() throws IOException { + BufferedFileAppender appender = createAppender(10); + + appender.add(createRecord(1L, "a")); + appender.add(createRecord(2L, "b")); + appender.add(createRecord(3L, "c")); + + // buffer not full yet + assertThat(appender.length()).isEqualTo(0L); + + // close flushes partial buffer through factory + appender.close(); + + List expected = + Lists.newArrayList(createRecord(1L, "a"), createRecord(2L, "b"), createRecord(3L, "c")); + DataTestHelpers.assertEquals(SCHEMA.asStruct(), expected, readBack()); + } + + @Test + public void testCopyFuncIsApplied() throws IOException { + BufferedFileAppender appender = createAppender(3); + + // use a single mutable record, relying on copyFunc to snapshot it + record.set(0, 1L); + record.set(1, "first"); + appender.add(record); + + record.set(0, 2L); + record.set(1, "second"); + appender.add(record); + + record.set(0, 3L); + 
record.set(1, "third"); + appender.add(record); + + appender.close(); + + List expected = + Lists.newArrayList( + createRecord(1L, "first"), createRecord(2L, "second"), createRecord(3L, "third")); + DataTestHelpers.assertEquals(SCHEMA.asStruct(), expected, readBack()); + } + + @Test + public void testMetricsAfterClose() throws IOException { + BufferedFileAppender appender = createAppender(2); + + appender.add(createRecord(1L, "a")); + appender.add(createRecord(2L, "b")); + appender.add(createRecord(3L, "c")); + appender.close(); + + assertThat(appender.metrics()).isNotNull(); + assertThat(appender.metrics().recordCount()).isEqualTo(3L); + assertThat(appender.length()).isGreaterThan(0L); + } + + @Test + public void testMetricsBeforeCloseThrows() throws IOException { + try (BufferedFileAppender appender = createAppender(10)) { + assertThatThrownBy(appender::metrics) + .isInstanceOf(IllegalStateException.class) + .hasMessage("Cannot return metrics for unclosed appender"); + } + } + + @Test + public void testAddAfterCloseThrows() throws IOException { + try (BufferedFileAppender appender = createAppender(10)) { + appender.add(createRecord(1L, "a")); + appender.close(); + + assertThatThrownBy(() -> appender.add(createRecord(2L, "b"))) + .isInstanceOf(IllegalStateException.class) + .hasMessage("Cannot add to a closed appender"); + } + } + + @Test + public void testAddAllSpanningBuffer() throws IOException { + BufferedFileAppender appender = createAppender(2); + + List records = + Lists.newArrayList( + createRecord(1L, "a"), + createRecord(2L, "b"), + createRecord(3L, "c"), + createRecord(4L, "d")); + + appender.addAll(records); + appender.close(); + + DataTestHelpers.assertEquals(SCHEMA.asStruct(), records, readBack()); + } + + @Test + public void testCloseWithNoData() throws IOException { + BufferedFileAppender appender = createAppender(10); + // close immediately with no data written + appender.close(); + // delegate was never created + 
assertThat(appender.length()).isEqualTo(0L); + assertThat(appender.metrics()).isNotNull(); + assertThat(appender.metrics().recordCount()).isEqualTo(0L); + assertThat(appender.splitOffsets()).isNull(); + } +} diff --git a/docs/docs/configuration.md b/docs/docs/configuration.md index 88d9872cc683..17bf1f8ac0a1 100644 --- a/docs/docs/configuration.md +++ b/docs/docs/configuration.md @@ -50,6 +50,8 @@ Iceberg tables support table properties to configure table behavior, like the de | write.parquet.dict-size-bytes | 2097152 (2 MB) | Parquet dictionary page size | | write.parquet.compression-codec | zstd | Parquet compression codec: zstd, brotli, lz4, gzip, snappy, uncompressed | | write.parquet.compression-level | null | Parquet compression level | +| write.parquet.shred-variants | false | When true, variant columns are written with shredded Parquet encoding for improved query performance | +| write.parquet.variant-inference-buffer-size | 100 | Number of rows to buffer for schema inference when variant shredding is enabled | | write.parquet.bloom-filter-enabled.column.col1 | (not set) | Hint to parquet to write a bloom filter for the column: 'col1' | | write.parquet.bloom-filter-max-bytes | 1048576 (1 MB) | The maximum number of bytes for a bloom filter bitset | | write.parquet.bloom-filter-fpp.column.col1 | 0.01 | The false positive probability for a bloom filter applied to 'col1' (must > 0.0 and < 1.0) | diff --git a/docs/docs/spark-configuration.md b/docs/docs/spark-configuration.md index 2c15c3bbd7a5..5972aafc3d39 100644 --- a/docs/docs/spark-configuration.md +++ b/docs/docs/spark-configuration.md @@ -191,6 +191,8 @@ val spark = SparkSession.builder() | spark.sql.iceberg.distribution-mode | See [Spark Writes](spark-writes.md#writing-distribution-modes) | Controls distribution strategy during writes | | spark.wap.id | null | [Write-Audit-Publish](branching.md#audit-branch) snapshot staging ID | | spark.wap.branch | null | WAP branch name for snapshot commit | +| 
spark.sql.iceberg.shred-variants | Table default | When true, variant columns are written with shredded Parquet encoding for improved query performance | +| spark.sql.iceberg.variant-inference-buffer-size | Table default | Number of rows to buffer for schema inference when variant shredding is enabled | | spark.sql.iceberg.compression-codec | Table default | Write compression codec (e.g., `zstd`, `snappy`) | | spark.sql.iceberg.compression-level | Table default | Compression level for Parquet/Avro | | spark.sql.iceberg.compression-strategy | Table default | Compression strategy for ORC | @@ -262,6 +264,8 @@ df.writeTo("catalog.db.table") | compression-strategy | Table write.orc.compression-strategy | Overrides this table's compression strategy for ORC tables for this write | | distribution-mode | See [Spark Writes](spark-writes.md#writing-distribution-modes) for defaults | Override this table's distribution mode for this write | | delete-granularity | file | Override this table's delete granularity for this write | +| shred-variants | false | Overrides this table's write.parquet.shred-variants for this write | +| variant-inference-buffer-size | 100 | Overrides this table's write.parquet.variant-inference-buffer-size for this write | CommitMetadata provides an interface to add custom metadata to a snapshot summary during a SQL execution, which can be beneficial for purposes such as auditing or change tracking. If properties start with `snapshot-property.`, then that prefix will be removed from each property. 
Here is an example: diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetFormatModel.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetFormatModel.java index 35a802460710..90dd6e117ba8 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetFormatModel.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetFormatModel.java @@ -19,13 +19,16 @@ package org.apache.iceberg.parquet; import java.io.IOException; +import java.io.UncheckedIOException; import java.nio.ByteBuffer; import java.util.Map; import java.util.function.Function; +import java.util.function.UnaryOperator; import org.apache.iceberg.FileContent; import org.apache.iceberg.FileFormat; import org.apache.iceberg.MetricsConfig; import org.apache.iceberg.Schema; +import org.apache.iceberg.TableProperties; import org.apache.iceberg.data.parquet.GenericParquetWriter; import org.apache.iceberg.deletes.PositionDelete; import org.apache.iceberg.encryption.EncryptedOutputFile; @@ -33,6 +36,7 @@ import org.apache.iceberg.formats.BaseFormatModel; import org.apache.iceberg.formats.ModelWriteBuilder; import org.apache.iceberg.formats.ReadBuilder; +import org.apache.iceberg.io.BufferedFileAppender; import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.io.DeleteSchemaUtil; import org.apache.iceberg.io.FileAppender; @@ -41,13 +45,17 @@ import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.Type; public class ParquetFormatModel extends BaseFormatModel, R, MessageType> { private final boolean isBatchReader; + private final VariantShreddingAnalyzer variantAnalyzer; + private final UnaryOperator copyFunc; public static ParquetFormatModel, Void, Object> forPositionDeletes() { - return new ParquetFormatModel<>(PositionDelete.deleteClass(), Void.class, null, null, false); + return new 
ParquetFormatModel<>( + PositionDelete.deleteClass(), Void.class, null, null, false, null, null); } public static ParquetFormatModel> create( @@ -55,14 +63,26 @@ public static ParquetFormatModel> create( Class schemaType, WriterFunction, S, MessageType> writerFunction, ReaderFunction, S, MessageType> readerFunction) { - return new ParquetFormatModel<>(type, schemaType, writerFunction, readerFunction, false); + return new ParquetFormatModel<>( + type, schemaType, writerFunction, readerFunction, false, null, null); + } + + public static ParquetFormatModel> create( + Class type, + Class schemaType, + WriterFunction, S, MessageType> writerFunction, + ReaderFunction, S, MessageType> readerFunction, + VariantShreddingAnalyzer variantAnalyzer, + UnaryOperator copyFunc) { + return new ParquetFormatModel<>( + type, schemaType, writerFunction, readerFunction, false, variantAnalyzer, copyFunc); } public static ParquetFormatModel> create( Class type, Class schemaType, ReaderFunction, S, MessageType> batchReaderFunction) { - return new ParquetFormatModel<>(type, schemaType, null, batchReaderFunction, true); + return new ParquetFormatModel<>(type, schemaType, null, batchReaderFunction, true, null, null); } private ParquetFormatModel( @@ -70,9 +90,13 @@ private ParquetFormatModel( Class schemaType, WriterFunction, S, MessageType> writerFunction, ReaderFunction readerFunction, - boolean isBatchReader) { + boolean isBatchReader, + VariantShreddingAnalyzer variantAnalyzer, + UnaryOperator copyFunc) { super(type, schemaType, writerFunction, readerFunction); this.isBatchReader = isBatchReader; + this.variantAnalyzer = variantAnalyzer; + this.copyFunc = copyFunc; } @Override @@ -82,7 +106,7 @@ public FileFormat format() { @Override public ModelWriteBuilder writeBuilder(EncryptedOutputFile outputFile) { - return new WriteBuilderWrapper<>(outputFile, writerFunction()); + return new WriteBuilderWrapper<>(outputFile, writerFunction(), variantAnalyzer, copyFunc); } @Override @@ -93,15 
+117,23 @@ public ReadBuilder readBuilder(InputFile inputFile) { private static class WriteBuilderWrapper implements ModelWriteBuilder { private final Parquet.WriteBuilder internal; private final WriterFunction, S, MessageType> writerFunction; + private final VariantShreddingAnalyzer variantAnalyzer; + private final UnaryOperator copyFunc; private Schema schema; private S engineSchema; private FileContent content; + private boolean shreddingEnabled = false; + private int bufferSize = TableProperties.PARQUET_VARIANT_BUFFER_SIZE_DEFAULT; private WriteBuilderWrapper( EncryptedOutputFile outputFile, - WriterFunction, S, MessageType> writerFunction) { + WriterFunction, S, MessageType> writerFunction, + VariantShreddingAnalyzer variantAnalyzer, + UnaryOperator copyFunc) { this.internal = Parquet.write(outputFile); this.writerFunction = writerFunction; + this.variantAnalyzer = variantAnalyzer; + this.copyFunc = copyFunc; } @Override @@ -119,13 +151,21 @@ public ModelWriteBuilder engineSchema(S newSchema) { @Override public ModelWriteBuilder set(String property, String value) { + if (TableProperties.PARQUET_SHRED_VARIANTS.equals(property)) { + shreddingEnabled = Boolean.parseBoolean(value); + } + + if (TableProperties.PARQUET_VARIANT_BUFFER_SIZE.equals(property)) { + bufferSize = Integer.parseInt(value); + } + internal.set(property, value); return this; } @Override public ModelWriteBuilder setAll(Map properties) { - internal.setAll(properties); + properties.forEach(this::set); return this; } @@ -173,12 +213,14 @@ public ModelWriteBuilder withAADPrefix(ByteBuffer aadPrefix) { @Override public FileAppender build() throws IOException { + boolean shredVariants = false; switch (content) { case DATA: internal.createContextFunc(Parquet.WriteBuilder.Context::dataContext); internal.createWriterFunc( (icebergSchema, messageType) -> writerFunction.write(icebergSchema, messageType, engineSchema)); + shredVariants = shreddingEnabled && variantAnalyzer != null && 
hasVariantColumns(schema); break; case EQUALITY_DELETES: internal.createContextFunc(Parquet.WriteBuilder.Context::deleteContext); @@ -209,8 +251,45 @@ public FileAppender build() throws IOException { throw new IllegalArgumentException("Unknown file content: " + content); } + if (shredVariants) { + return buildShreddedAppender(); + } + return internal.build(); } + + /** + * Creates a {@link BufferedFileAppender} that buffers the first N rows, runs variant shredding + * analysis on them, then creates the real Parquet appender with a shredded schema. + * + *

      Only top-level variant columns are shredded. Nested variants (inside structs/lists/maps) + * fall through to unshredded 2-field layout because column index resolution only applies to + * top-level fields. + */ + private FileAppender buildShreddedAppender() { + return new BufferedFileAppender<>( + bufferSize, + bufferedRows -> { + Map shreddedTypes = + variantAnalyzer.analyzeVariantColumns(bufferedRows, schema, engineSchema); + + if (!shreddedTypes.isEmpty()) { + internal.variantShreddingFunc((fieldId, name) -> shreddedTypes.get(fieldId)); + } + + try { + return internal.build(); + } catch (IOException e) { + throw new UncheckedIOException("Failed to create shredded variant writer", e); + } + }, + copyFunc); + } + + private static boolean hasVariantColumns(Schema schema) { + return schema != null + && schema.columns().stream().anyMatch(field -> field.type().isVariantType()); + } } private static class ReadBuilderWrapper implements ReadBuilder { diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetVariantWriters.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetVariantWriters.java index 9e94b1bbd6cd..e5c56da166f4 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetVariantWriters.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetVariantWriters.java @@ -18,6 +18,8 @@ */ package org.apache.iceberg.parquet; +import java.math.BigDecimal; +import java.math.RoundingMode; import java.nio.ByteBuffer; import java.nio.ByteOrder; import java.util.Arrays; @@ -99,6 +101,16 @@ static ParquetValueWriter objects( builder.build()); } + @SuppressWarnings("unchecked") + static ParquetValueWriter decimal( + ParquetValueWriter writer, int precision, int scale, PhysicalType... 
types) { + return new DecimalPrimitiveWriter( + (ParquetValueWriter) writer, + Sets.immutableEnumSet(Arrays.asList(types)), + precision, + scale); + } + @SuppressWarnings("unchecked") public static ParquetValueWriter array( int repeatedDefinitionLevel, @@ -220,6 +232,10 @@ protected int writeTo(ByteBuffer buffer, int offset, VariantValue value) { private interface TypedWriter extends ParquetValueWriter { Set types(); + + default boolean canWrite(VariantValue value) { + return true; + } } private static class PrimitiveWriter implements TypedWriter { @@ -274,7 +290,7 @@ private ShreddedVariantWriter( @Override public void write(int repetitionLevel, VariantValue value) { - if (typedWriter.types().contains(value.type())) { + if (typedWriter.types().contains(value.type()) && typedWriter.canWrite(value)) { typedWriter.write(repetitionLevel, value); writeNull(valueWriter, repetitionLevel, valueDefinitionLevel); } else { @@ -372,6 +388,49 @@ public void setColumnStore(ColumnWriteStore columnStore) { } } + private static class DecimalPrimitiveWriter implements TypedWriter { + private final Set types; + private final ParquetValueWriter writer; + private final int precision; + private final int scale; + + private DecimalPrimitiveWriter( + ParquetValueWriter writer, Set types, int precision, int scale) { + this.types = types; + this.writer = writer; + this.precision = precision; + this.scale = scale; + } + + @Override + public Set types() { + return types; + } + + @Override + public boolean canWrite(VariantValue value) { + BigDecimal decimal = (BigDecimal) value.asPrimitive().get(); + int integerDigits = decimal.precision() - decimal.scale(); + return decimal.scale() <= scale && integerDigits + scale <= precision; + } + + @Override + public void write(int repetitionLevel, VariantValue value) { + BigDecimal decimal = (BigDecimal) value.asPrimitive().get(); + writer.write(repetitionLevel, decimal.setScale(scale, RoundingMode.UNNECESSARY)); + } + + @Override + public List> 
columns() { + return writer.columns(); + } + + @Override + public void setColumnStore(ColumnWriteStore columnStore) { + writer.setColumnStore(columnStore); + } + } + private static class ArrayWriter implements TypedWriter { private final int definitionLevel; private final int repetitionLevel; diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/VariantShreddingAnalyzer.java b/parquet/src/main/java/org/apache/iceberg/parquet/VariantShreddingAnalyzer.java new file mode 100644 index 000000000000..d2a058c1128a --- /dev/null +++ b/parquet/src/main/java/org/apache/iceberg/parquet/VariantShreddingAnalyzer.java @@ -0,0 +1,532 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.parquet; + +import java.math.BigDecimal; +import java.util.List; +import java.util.Map; +import java.util.Set; +import org.apache.iceberg.Schema; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; +import org.apache.iceberg.types.Types.NestedField; +import org.apache.iceberg.variants.PhysicalType; +import org.apache.iceberg.variants.VariantArray; +import org.apache.iceberg.variants.VariantObject; +import org.apache.iceberg.variants.VariantPrimitive; +import org.apache.iceberg.variants.VariantValue; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.LogicalTypeAnnotation; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; +import org.apache.parquet.schema.Types; + +/** + * Analyzes variant data across buffered rows to determine an optimal shredding schema. + * + *

      <p>Determinism contract: for a given set of variant values (regardless of row arrival order), + * this analyzer produces the same shredded schema. When the number of distinct fields at any level + * exceeds {@code MAX_INTERMEDIATE_FIELDS}, field tracking becomes insertion-order dependent and + * determinism is not guaranteed. + * + * <ul>
        + *
      <li>Object fields use a TreeMap, so field ordering is alphabetical and deterministic. + *
      <li>Type selection picks the most common type with explicit tie-break priority (see + * TIE_BREAK_PRIORITY), not enum ordinal. + *
      <li>Integer types (INT8/16/32/64) and decimal types (DECIMAL4/8/16) are each promoted to the + * widest observed before competing with other types. + *
      <li>Fields below {@code MIN_FIELD_FREQUENCY} are pruned. Above {@code MAX_SHREDDED_FIELDS}, the + * most frequent are kept with alphabetical tie-breaking. + *
      <li>Recursion into nested objects/arrays stops at {@code MAX_SHREDDING_DEPTH} (default 50). + *
      <li>New struct fields are not tracked once a node reaches {@code MAX_INTERMEDIATE_FIELDS} + * (default 1000) to bound memory during inference. + * </ul>
      + * + *
      <p>This contract holds within a single batch. Different batches with different distributions may + * produce different layouts; cross-batch stability requires schema pinning (not yet implemented). + * + *

      Subclasses implement {@link #extractVariantValues} to convert engine-specific row types into + * {@link VariantValue} instances. + * + * @param the engine-specific row type (e.g., Spark InternalRow, Flink RowData) + * @param the engine-specific schema type (e.g., Spark StructType, Flink RowType) + */ +public abstract class VariantShreddingAnalyzer { + private static final String TYPED_VALUE = "typed_value"; + private static final String VALUE = "value"; + private static final String ELEMENT = "element"; + private static final double MIN_FIELD_FREQUENCY = 0.10; + private static final int MAX_SHREDDED_FIELDS = 300; + private static final int MAX_SHREDDING_DEPTH = 50; + private static final int MAX_INTERMEDIATE_FIELDS = 1000; + + protected VariantShreddingAnalyzer() {} + + /** + * Analyzes buffered variant values to determine the optimal shredding schema. + * + * @param bufferedRows the buffered rows to analyze + * @param variantFieldIndex the index of the variant field in the rows + * @return the shredded schema type, or null if no shredding should be performed + */ + public Type analyzeAndCreateSchema(List bufferedRows, int variantFieldIndex) { + List variantValues = extractVariantValues(bufferedRows, variantFieldIndex); + if (variantValues.isEmpty()) { + return null; + } + + PathNode root = buildPathTree(variantValues); + PhysicalType rootType = root.info.getMostCommonType(); + if (rootType == null) { + return null; + } + + pruneInfrequentFields(root, root.info.observationCount); + + return buildTypedValue(root, rootType); + } + + protected abstract List extractVariantValues( + List bufferedRows, int variantFieldIndex); + + /** + * Resolves a column name to its index in the engine-specific schema. Returns -1 if the column is + * not found. 
+ */ + protected abstract int resolveColumnIndex(S engineSchema, String columnName); + + /** + * Analyzes all variant columns in the schema, resolving column indices via the engine-specific + * {@link #resolveColumnIndex} method. + * + * @param bufferedRows the buffered rows to analyze + * @param icebergSchema the Iceberg table schema + * @param engineSchema the engine-specific schema used to resolve column indices + * @return a map from Iceberg field ID to the shredded Parquet type for each variant column + */ + public Map analyzeVariantColumns( + List bufferedRows, Schema icebergSchema, S engineSchema) { + Map shreddedTypes = Maps.newHashMap(); + for (NestedField col : icebergSchema.columns()) { + if (col.type().isVariantType()) { + int rowIndex = resolveColumnIndex(engineSchema, col.name()); + if (rowIndex >= 0) { + Type typed = analyzeAndCreateSchema(bufferedRows, rowIndex); + if (typed != null) { + shreddedTypes.put(col.fieldId(), typed); + } + } + } + } + + return shreddedTypes; + } + + private static PathNode buildPathTree(List variantValues) { + PathNode root = new PathNode(null); + root.info = new FieldInfo(); + + for (VariantValue value : variantValues) { + traverse(root, value, 0); + } + + return root; + } + + private static void pruneInfrequentFields(PathNode node, int totalRows) { + if (node.objectChildren.isEmpty() && node.arrayElement == null) { + return; + } + + // Remove fields below frequency threshold + node.objectChildren + .entrySet() + .removeIf( + entry -> { + FieldInfo info = entry.getValue().info; + return info != null + && ((double) info.observationCount / totalRows) < MIN_FIELD_FREQUENCY; + }); + + // Cap at MAX_SHREDDED_FIELDS, keep the most frequently observed + if (node.objectChildren.size() > MAX_SHREDDED_FIELDS) { + List> sorted = Lists.newArrayList(node.objectChildren.entrySet()); + sorted.sort( + (a, b) -> { + int cmp = + Integer.compare( + b.getValue().info.observationCount, a.getValue().info.observationCount); + return cmp != 0 ? 
cmp : a.getKey().compareTo(b.getKey()); + }); + Set keep = Sets.newHashSet(); + for (int i = 0; i < MAX_SHREDDED_FIELDS; i++) { + keep.add(sorted.get(i).getKey()); + } + node.objectChildren.entrySet().removeIf(entry -> !keep.contains(entry.getKey())); + } + + // Recurse into remaining object children + for (PathNode child : node.objectChildren.values()) { + pruneInfrequentFields(child, totalRows); + } + + // Recurse into array elements (arrays of objects need pruning too) + if (node.arrayElement != null) { + pruneInfrequentFields(node.arrayElement, totalRows); + } + } + + private static void traverse(PathNode node, VariantValue value, int depth) { + if (value == null || value.type() == PhysicalType.NULL) { + return; + } + + node.info.observe(value); + + if (value.type() == PhysicalType.OBJECT && depth < MAX_SHREDDING_DEPTH) { + traverseObject(node, value.asObject(), depth); + } else if (value.type() == PhysicalType.ARRAY && depth < MAX_SHREDDING_DEPTH) { + traverseArray(node, value.asArray(), depth); + } + } + + private static void traverseObject(PathNode node, VariantObject obj, int depth) { + for (String fieldName : obj.fieldNames()) { + VariantValue fieldValue = obj.get(fieldName); + if (fieldValue != null) { + PathNode childNode = node.objectChildren.get(fieldName); + if (childNode == null) { + if (node.objectChildren.size() >= MAX_INTERMEDIATE_FIELDS) { + continue; + } + childNode = new PathNode(fieldName); + childNode.info = new FieldInfo(); + node.objectChildren.put(fieldName, childNode); + } + traverse(childNode, fieldValue, depth + 1); + } + } + } + + // observationCount inside arrays counts per-element, not per-row, so fields in long arrays + // have inflated frequency and resist pruning. 
+ private static void traverseArray(PathNode node, VariantArray array, int depth) { + int numElements = array.numElements(); + if (node.arrayElement == null) { + node.arrayElement = new PathNode(null); + node.arrayElement.info = new FieldInfo(); + } + for (int i = 0; i < numElements; i++) { + VariantValue element = array.get(i); + if (element != null) { + traverse(node.arrayElement, element, depth + 1); + } + } + } + + private static Type buildFieldGroup(PathNode node) { + PhysicalType commonType = node.info.getMostCommonType(); + if (commonType == null) { + return null; + } + + Type typedValue = buildTypedValue(node, commonType); + if (typedValue == null) { + return null; + } + + return Types.buildGroup(Type.Repetition.REQUIRED) + .optional(PrimitiveType.PrimitiveTypeName.BINARY) + .named(VALUE) + .addField(typedValue) + .named(node.fieldName); + } + + private static Type buildTypedValue(PathNode node, PhysicalType physicalType) { + return switch (physicalType) { + case ARRAY -> createArrayTypedValue(node); + case OBJECT -> createObjectTypedValue(node); + default -> createPrimitiveTypedValue(node.info, physicalType); + }; + } + + private static Type createObjectTypedValue(PathNode node) { + if (node.objectChildren.isEmpty()) { + return null; + } + + Types.GroupBuilder builder = Types.buildGroup(Type.Repetition.OPTIONAL); + boolean hasFields = false; + for (PathNode child : node.objectChildren.values()) { + Type fieldType = buildFieldGroup(child); + if (fieldType != null) { + builder.addField(fieldType); + hasFields = true; + } + } + + return hasFields ? 
builder.named(TYPED_VALUE) : null; + } + + private static Type createArrayTypedValue(PathNode node) { + PathNode elementNode = node.arrayElement; + if (elementNode == null) { + return null; + } + PhysicalType elementType = elementNode.info.getMostCommonType(); + if (elementType == null) { + return null; + } + Type elementTypedValue = buildTypedValue(elementNode, elementType); + if (elementTypedValue == null) { + return null; + } + + GroupType elementGroup = + Types.buildGroup(Type.Repetition.REQUIRED) + .optional(PrimitiveType.PrimitiveTypeName.BINARY) + .named(VALUE) + .addField(elementTypedValue) + .named(ELEMENT); + + return Types.optionalList().element(elementGroup).named(TYPED_VALUE); + } + + private static class PathNode { + private final String fieldName; + private final Map objectChildren = Maps.newTreeMap(); + private PathNode arrayElement = null; + private FieldInfo info = null; + + private PathNode(String fieldName) { + this.fieldName = fieldName; + } + } + + /** Use DECIMAL with maximum precision and scale as the shredding type */ + private static Type createDecimalTypedValue(FieldInfo info) { + int maxPrecision = Math.min(info.maxDecimalIntegerDigits + info.maxDecimalScale, 38); + int maxScale = Math.min(info.maxDecimalScale, Math.max(0, 38 - info.maxDecimalIntegerDigits)); + + if (maxPrecision <= 9) { + return Types.optional(PrimitiveType.PrimitiveTypeName.INT32) + .as(LogicalTypeAnnotation.decimalType(maxScale, maxPrecision)) + .named(TYPED_VALUE); + } else if (maxPrecision <= 18) { + return Types.optional(PrimitiveType.PrimitiveTypeName.INT64) + .as(LogicalTypeAnnotation.decimalType(maxScale, maxPrecision)) + .named(TYPED_VALUE); + } else { + return Types.optional(PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) + .length(16) + .as(LogicalTypeAnnotation.decimalType(maxScale, maxPrecision)) + .named(TYPED_VALUE); + } + } + + private static Type createPrimitiveTypedValue(FieldInfo info, PhysicalType primitiveType) { + return switch 
(primitiveType) { + case BOOLEAN_TRUE, BOOLEAN_FALSE -> + Types.optional(PrimitiveType.PrimitiveTypeName.BOOLEAN).named(TYPED_VALUE); + case INT8 -> + Types.optional(PrimitiveType.PrimitiveTypeName.INT32) + .as(LogicalTypeAnnotation.intType(8, true)) + .named(TYPED_VALUE); + case INT16 -> + Types.optional(PrimitiveType.PrimitiveTypeName.INT32) + .as(LogicalTypeAnnotation.intType(16, true)) + .named(TYPED_VALUE); + case INT32 -> + Types.optional(PrimitiveType.PrimitiveTypeName.INT32) + .as(LogicalTypeAnnotation.intType(32, true)) + .named(TYPED_VALUE); + case INT64 -> + Types.optional(PrimitiveType.PrimitiveTypeName.INT64) + .as(LogicalTypeAnnotation.intType(64, true)) + .named(TYPED_VALUE); + case FLOAT -> Types.optional(PrimitiveType.PrimitiveTypeName.FLOAT).named(TYPED_VALUE); + case DOUBLE -> Types.optional(PrimitiveType.PrimitiveTypeName.DOUBLE).named(TYPED_VALUE); + case STRING -> + Types.optional(PrimitiveType.PrimitiveTypeName.BINARY) + .as(LogicalTypeAnnotation.stringType()) + .named(TYPED_VALUE); + case BINARY -> Types.optional(PrimitiveType.PrimitiveTypeName.BINARY).named(TYPED_VALUE); + case TIME -> + Types.optional(PrimitiveType.PrimitiveTypeName.INT64) + .as(LogicalTypeAnnotation.timeType(false, LogicalTypeAnnotation.TimeUnit.MICROS)) + .named(TYPED_VALUE); + case DATE -> + Types.optional(PrimitiveType.PrimitiveTypeName.INT32) + .as(LogicalTypeAnnotation.dateType()) + .named(TYPED_VALUE); + case TIMESTAMPTZ -> + Types.optional(PrimitiveType.PrimitiveTypeName.INT64) + .as(LogicalTypeAnnotation.timestampType(true, LogicalTypeAnnotation.TimeUnit.MICROS)) + .named(TYPED_VALUE); + case TIMESTAMPNTZ -> + Types.optional(PrimitiveType.PrimitiveTypeName.INT64) + .as(LogicalTypeAnnotation.timestampType(false, LogicalTypeAnnotation.TimeUnit.MICROS)) + .named(TYPED_VALUE); + case TIMESTAMPTZ_NANOS -> + Types.optional(PrimitiveType.PrimitiveTypeName.INT64) + .as(LogicalTypeAnnotation.timestampType(true, LogicalTypeAnnotation.TimeUnit.NANOS)) + .named(TYPED_VALUE); 
+ case TIMESTAMPNTZ_NANOS -> + Types.optional(PrimitiveType.PrimitiveTypeName.INT64) + .as(LogicalTypeAnnotation.timestampType(false, LogicalTypeAnnotation.TimeUnit.NANOS)) + .named(TYPED_VALUE); + case DECIMAL4, DECIMAL8, DECIMAL16 -> createDecimalTypedValue(info); + case UUID -> + Types.optional(PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) + .length(16) + .as(LogicalTypeAnnotation.uuidType()) + .named(TYPED_VALUE); + default -> + throw new UnsupportedOperationException( + "Unknown primitive physical type: " + primitiveType); + }; + } + + /** Tracks occurrence count and types for a single field. */ + private static class FieldInfo { + private final Map typeCounts = Maps.newHashMap(); + private int maxDecimalScale = 0; + private int maxDecimalIntegerDigits = 0; + private int observationCount = 0; + + private static final Map INTEGER_PRIORITY = + ImmutableMap.of( + PhysicalType.INT8, 0, + PhysicalType.INT16, 1, + PhysicalType.INT32, 2, + PhysicalType.INT64, 3); + + private static final Map DECIMAL_PRIORITY = + ImmutableMap.of( + PhysicalType.DECIMAL4, 0, + PhysicalType.DECIMAL8, 1, + PhysicalType.DECIMAL16, 2); + + private static final Map TIE_BREAK_PRIORITY = + ImmutableMap.builder() + .put(PhysicalType.BOOLEAN_TRUE, 0) + .put(PhysicalType.INT8, 1) + .put(PhysicalType.INT16, 2) + .put(PhysicalType.INT32, 3) + .put(PhysicalType.INT64, 4) + .put(PhysicalType.FLOAT, 5) + .put(PhysicalType.DOUBLE, 6) + .put(PhysicalType.DECIMAL4, 7) + .put(PhysicalType.DECIMAL8, 8) + .put(PhysicalType.DECIMAL16, 9) + .put(PhysicalType.DATE, 10) + .put(PhysicalType.TIME, 11) + .put(PhysicalType.TIMESTAMPTZ, 12) + .put(PhysicalType.TIMESTAMPNTZ, 13) + .put(PhysicalType.BINARY, 14) + .put(PhysicalType.STRING, 15) + .put(PhysicalType.TIMESTAMPTZ_NANOS, 16) + .put(PhysicalType.TIMESTAMPNTZ_NANOS, 17) + .put(PhysicalType.UUID, 18) + .buildOrThrow(); + + void observe(VariantValue value) { + observationCount++; + // Use BOOLEAN_TRUE for both TRUE/FALSE values + PhysicalType type = + 
value.type() == PhysicalType.BOOLEAN_FALSE ? PhysicalType.BOOLEAN_TRUE : value.type(); + + typeCounts.compute(type, (k, v) -> (v == null) ? 1 : v + 1); + + // Track max precision and scale for decimal types + if (type == PhysicalType.DECIMAL4 + || type == PhysicalType.DECIMAL8 + || type == PhysicalType.DECIMAL16) { + VariantPrimitive primitive = value.asPrimitive(); + Object decimalValue = primitive.get(); + if (decimalValue instanceof BigDecimal bd) { + maxDecimalIntegerDigits = Math.max(maxDecimalIntegerDigits, bd.precision() - bd.scale()); + maxDecimalScale = Math.max(maxDecimalScale, bd.scale()); + } + } + } + + PhysicalType getMostCommonType() { + Map combinedCounts = Maps.newHashMap(); + + int integerTotalCount = 0; + PhysicalType mostCapableInteger = null; + + int decimalTotalCount = 0; + PhysicalType mostCapableDecimal = null; + + for (Map.Entry entry : typeCounts.entrySet()) { + PhysicalType type = entry.getKey(); + int count = entry.getValue(); + + if (isIntegerType(type)) { + integerTotalCount += count; + if (mostCapableInteger == null + || INTEGER_PRIORITY.get(type) > INTEGER_PRIORITY.get(mostCapableInteger)) { + mostCapableInteger = type; + } + } else if (isDecimalType(type)) { + decimalTotalCount += count; + if (mostCapableDecimal == null + || DECIMAL_PRIORITY.get(type) > DECIMAL_PRIORITY.get(mostCapableDecimal)) { + mostCapableDecimal = type; + } + } else { + combinedCounts.put(type, count); + } + } + + if (mostCapableInteger != null) { + combinedCounts.put(mostCapableInteger, integerTotalCount); + } + + if (mostCapableDecimal != null) { + combinedCounts.put(mostCapableDecimal, decimalTotalCount); + } + + // Pick the most common type with tie-breaking + return combinedCounts.entrySet().stream() + .max( + Map.Entry.comparingByValue() + .thenComparingInt(entry -> TIE_BREAK_PRIORITY.getOrDefault(entry.getKey(), -1))) + .map(Map.Entry::getKey) + .orElse(null); + } + + private static boolean isIntegerType(PhysicalType type) { + return type == 
PhysicalType.INT8 + || type == PhysicalType.INT16 + || type == PhysicalType.INT32 + || type == PhysicalType.INT64; + } + + private static boolean isDecimalType(PhysicalType type) { + return type == PhysicalType.DECIMAL4 + || type == PhysicalType.DECIMAL8 + || type == PhysicalType.DECIMAL16; + } + } +} diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/VariantWriterBuilder.java b/parquet/src/main/java/org/apache/iceberg/parquet/VariantWriterBuilder.java index a447a102690a..da409c92f113 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/VariantWriterBuilder.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/VariantWriterBuilder.java @@ -202,23 +202,29 @@ public Optional> visit(DecimalLogicalTypeAnnotation decima case FIXED_LEN_BYTE_ARRAY: case BINARY: writer = - ParquetVariantWriters.primitive( + ParquetVariantWriters.decimal( ParquetValueWriters.decimalAsFixed( desc, decimal.getPrecision(), decimal.getScale()), + decimal.getPrecision(), + decimal.getScale(), PhysicalType.DECIMAL16); return Optional.of(writer); case INT64: writer = - ParquetVariantWriters.primitive( + ParquetVariantWriters.decimal( ParquetValueWriters.decimalAsLong( desc, decimal.getPrecision(), decimal.getScale()), + decimal.getPrecision(), + decimal.getScale(), PhysicalType.DECIMAL8); return Optional.of(writer); case INT32: writer = - ParquetVariantWriters.primitive( + ParquetVariantWriters.decimal( ParquetValueWriters.decimalAsInteger( desc, decimal.getPrecision(), decimal.getScale()), + decimal.getPrecision(), + decimal.getScale(), PhysicalType.DECIMAL4); return Optional.of(writer); } diff --git a/parquet/src/test/java/org/apache/iceberg/parquet/TestParquetDataWriter.java b/parquet/src/test/java/org/apache/iceberg/parquet/TestParquetDataWriter.java index 3918fdc63084..36e254628a6a 100644 --- a/parquet/src/test/java/org/apache/iceberg/parquet/TestParquetDataWriter.java +++ b/parquet/src/test/java/org/apache/iceberg/parquet/TestParquetDataWriter.java @@ -42,8 +42,11 
@@ import org.apache.iceberg.data.Record; import org.apache.iceberg.data.parquet.GenericParquetReaders; import org.apache.iceberg.data.parquet.GenericParquetWriter; +import org.apache.iceberg.encryption.EncryptedFiles; +import org.apache.iceberg.io.BufferedFileAppender; import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.io.DataWriter; +import org.apache.iceberg.io.FileAppender; import org.apache.iceberg.io.OutputFile; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; @@ -52,8 +55,12 @@ import org.apache.iceberg.variants.Variant; import org.apache.iceberg.variants.VariantMetadata; import org.apache.iceberg.variants.VariantTestUtil; +import org.apache.iceberg.variants.VariantValue; import org.apache.iceberg.variants.Variants; +import org.apache.parquet.example.data.Group; import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.ParquetReader; +import org.apache.parquet.hadoop.example.GroupReadSupport; import org.apache.parquet.schema.GroupType; import org.apache.parquet.schema.MessageType; import org.junit.jupiter.api.BeforeEach; @@ -331,4 +338,207 @@ public void testDataWriterWithVariantShredding() throws IOException { testDataWriter( variantSchema, (id, name) -> ParquetVariantUtil.toParquetSchema(variant.value())); } + + @Test + public void testShreddingWriteReturnsBufferedAppender() throws IOException { + Schema variantSchema = + new Schema( + Types.NestedField.required(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "v", Types.VariantType.get())); + + VariantShreddingAnalyzer testAnalyzer = + new VariantShreddingAnalyzer() { + @Override + protected List extractVariantValues(List rows, int idx) { + return java.util.Collections.emptyList(); + } + + @Override + protected int resolveColumnIndex(Void engineSchema, String columnName) { + return -1; + } + }; + + OutputFile outputFile = 
Files.localOutput(createTempFile(temp)); + + ParquetFormatModel> model = + ParquetFormatModel.create( + Record.class, + Void.class, + (icebergSchema, messageType, engineSchema) -> + GenericParquetWriter.create(icebergSchema, messageType), + (icebergSchema, fileSchema, engineSchema, idToConstant) -> + GenericParquetReaders.buildReader(icebergSchema, fileSchema), + testAnalyzer, + record -> record); + + try (FileAppender appender = + model + .writeBuilder(EncryptedFiles.plainAsEncryptedOutput(outputFile)) + .schema(variantSchema) + .setAll(ImmutableMap.of(TableProperties.PARQUET_SHRED_VARIANTS, "true")) + .content(FileContent.DATA) + .build()) { + assertThat(appender).isInstanceOf(BufferedFileAppender.class); + } + } + + @Test + public void testWriteBuilderReturnsDirectAppenderWithNullAnalyzer() throws IOException { + Schema variantSchema = + new Schema( + Types.NestedField.required(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "v", Types.VariantType.get())); + + OutputFile outputFile = Files.localOutput(createTempFile(temp)); + + ParquetFormatModel> model = + ParquetFormatModel.create( + Record.class, + Void.class, + (icebergSchema, messageType, engineSchema) -> + GenericParquetWriter.create(icebergSchema, messageType), + (icebergSchema, fileSchema, engineSchema, idToConstant) -> + GenericParquetReaders.buildReader(icebergSchema, fileSchema), + null, + null); + + try (FileAppender appender = + model + .writeBuilder(EncryptedFiles.plainAsEncryptedOutput(outputFile)) + .schema(variantSchema) + .setAll(ImmutableMap.of(TableProperties.PARQUET_SHRED_VARIANTS, "true")) + .content(FileContent.DATA) + .build()) { + // Even with shredding property set, null variantAnalyzer means no BufferedFileAppender + assertThat(appender).isNotInstanceOf(BufferedFileAppender.class); + } + } + + @Test + public void testFormatModelVariantShreddingRoundTrip() throws IOException { + Schema variantSchema = + new Schema( + Types.NestedField.required(1, "id", 
Types.LongType.get()), + Types.NestedField.optional(2, "v", Types.VariantType.get())); + + VariantShreddingAnalyzer analyzer = + new VariantShreddingAnalyzer() { + @Override + protected List extractVariantValues(List rows, int idx) { + List values = Lists.newArrayList(); + for (Record row : rows) { + Object obj = row.get(idx); + if (obj instanceof Variant) { + values.add(((Variant) obj).value()); + } + } + return values; + } + + @Override + protected int resolveColumnIndex(Void engineSchema, String columnName) { + // GenericRecord uses schema column order + return variantSchema.columns().indexOf(variantSchema.findField(columnName)); + } + }; + + ByteBuffer metadataBuffer = VariantTestUtil.createMetadata(ImmutableList.of("a", "b"), true); + VariantMetadata metadata = Variants.metadata(metadataBuffer); + ByteBuffer objectBuffer = + VariantTestUtil.createObject( + metadataBuffer, + ImmutableMap.of( + "a", Variants.of(42), + "b", Variants.of("hello"))); + Variant variant = Variant.of(metadata, Variants.value(metadata, objectBuffer)); + + GenericRecord record = GenericRecord.create(variantSchema); + List variantRecords = + ImmutableList.of( + record.copy(ImmutableMap.of("id", 1L, "v", variant)), + record.copy(ImmutableMap.of("id", 2L, "v", variant)), + record.copy(ImmutableMap.of("id", 3L, "v", variant))); + + OutputFile outputFile = Files.localOutput(createTempFile(temp)); + + ParquetFormatModel> model = + ParquetFormatModel.create( + Record.class, + Void.class, + (icebergSchema, messageType, engineSchema) -> + GenericParquetWriter.create(icebergSchema, messageType), + (icebergSchema, fileSchema, engineSchema, idToConstant) -> + GenericParquetReaders.buildReader(icebergSchema, fileSchema), + analyzer, + record1 -> record1); + + try (FileAppender appender = + model + .writeBuilder(EncryptedFiles.plainAsEncryptedOutput(outputFile)) + .schema(variantSchema) + .setAll( + ImmutableMap.of( + TableProperties.PARQUET_SHRED_VARIANTS, "true", + 
TableProperties.PARQUET_VARIANT_BUFFER_SIZE, "2")) + .content(FileContent.DATA) + .build()) { + assertThat(appender).isInstanceOf(BufferedFileAppender.class); + for (Record rec : variantRecords) { + appender.add(rec); + } + } + + // Verify shredded Parquet schema + try (ParquetFileReader reader = + ParquetFileReader.open(ParquetIO.file(outputFile.toInputFile()))) { + MessageType parquetSchema = reader.getFooter().getFileMetaData().getSchema(); + GroupType variantGroup = parquetSchema.getType("v").asGroupType(); + assertThat(variantGroup.containsField("metadata")).isTrue(); + assertThat(variantGroup.containsField("value")).isTrue(); + assertThat(variantGroup.containsField("typed_value")).isTrue(); + + GroupType typedValue = variantGroup.getType("typed_value").asGroupType(); + assertThat(typedValue.containsField("a")).isTrue(); + assertThat(typedValue.containsField("b")).isTrue(); + } + + // Verify data is in typed columns by reading raw Parquet groups + try (ParquetReader rawReader = + ParquetReader.builder( + new GroupReadSupport(), new org.apache.hadoop.fs.Path(outputFile.location())) + .build()) { + Group row = rawReader.read(); + Group variantData = row.getGroup("v", 0); + + assertThat(variantData.getFieldRepetitionCount("value")) + .as("value should be absent when fully shredded") + .isEqualTo(0); + + Group typedValue = variantData.getGroup("typed_value", 0); + assertThat(typedValue.getGroup("a", 0).getInteger("typed_value", 0)) + .as("typed_value.a should contain 42") + .isEqualTo(42); + assertThat(typedValue.getGroup("b", 0).getString("typed_value", 0)) + .as("typed_value.b should contain hello") + .isEqualTo("hello"); + } + + // Verify data round-trips + List writtenRecords; + try (CloseableIterable reader = + Parquet.read(outputFile.toInputFile()) + .project(variantSchema) + .createReaderFunc( + fileSchema -> GenericParquetReaders.buildReader(variantSchema, fileSchema)) + .build()) { + writtenRecords = Lists.newArrayList(reader); + } + + 
assertThat(writtenRecords).hasSameSizeAs(variantRecords); + for (int i = 0; i < variantRecords.size(); i++) { + InternalTestHelpers.assertEquals( + variantSchema.asStruct(), variantRecords.get(i), writtenRecords.get(i)); + } + } } diff --git a/parquet/src/test/java/org/apache/iceberg/parquet/TestVariantShreddingAnalyzer.java b/parquet/src/test/java/org/apache/iceberg/parquet/TestVariantShreddingAnalyzer.java new file mode 100644 index 000000000000..5ac10f74cc51 --- /dev/null +++ b/parquet/src/test/java/org/apache/iceberg/parquet/TestVariantShreddingAnalyzer.java @@ -0,0 +1,475 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.parquet; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.math.BigDecimal; +import java.util.List; +import java.util.Locale; +import java.util.function.Function; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.variants.ShreddedObject; +import org.apache.iceberg.variants.ValueArray; +import org.apache.iceberg.variants.VariantMetadata; +import org.apache.iceberg.variants.VariantValue; +import org.apache.iceberg.variants.Variants; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.LogicalTypeAnnotation; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; +import org.junit.jupiter.api.Test; + +public class TestVariantShreddingAnalyzer { + + private static class DirectAnalyzer extends VariantShreddingAnalyzer { + @Override + protected List extractVariantValues(List rows, int idx) { + return rows; + } + + @Override + protected int resolveColumnIndex(Void engineSchema, String columnName) { + throw new UnsupportedOperationException("Not used in direct tests"); + } + } + + @Test + public void testDepthLimitStopsObjectRecursion() { + DirectAnalyzer analyzer = new DirectAnalyzer(); + + // Each level has {"a": , "x": 1} so objects always have a shreddable primitive + VariantMetadata meta = Variants.metadata("a", "x"); + ShreddedObject innermost = Variants.object(meta); + innermost.put("a", Variants.of(42)); + innermost.put("x", Variants.of(1)); + + for (int i = 0; i < 54; i++) { + ShreddedObject wrapper = Variants.object(meta); + wrapper.put("a", innermost); + wrapper.put("x", Variants.of(1)); + innermost = wrapper; + } + + Type schema = analyzer.analyzeAndCreateSchema(List.of(innermost), 0); + assertThat(schema).isNotNull(); + assertThat(schema.getName()).isEqualTo("typed_value"); + + int shreddedDepth = countObjectDepth(schema); + assertThat(shreddedDepth).isLessThanOrEqualTo(50).isGreaterThan(0); + } + 
+ @Test + public void testDepthLimitStopsArrayRecursion() { + DirectAnalyzer analyzer = new DirectAnalyzer(); + + // 55-level nested arrays with a primitive only at the very bottom. + // Depth limit (50) prevents reaching the leaf, so schema is null (graceful degradation). + VariantValue innermost = Variants.of(42); + for (int i = 0; i < 55; i++) { + ValueArray wrapper = Variants.array(); + wrapper.add(innermost); + innermost = wrapper; + } + + Type schema = analyzer.analyzeAndCreateSchema(List.of(innermost), 0); + assertThat(schema).isNull(); + } + + @Test + public void testArrayWithinDepthLimit() { + DirectAnalyzer analyzer = new DirectAnalyzer(); + + // 5-level nested arrays + VariantValue innermost = Variants.of(42); + for (int i = 0; i < 5; i++) { + ValueArray wrapper = Variants.array(); + wrapper.add(innermost); + innermost = wrapper; + } + + Type schema = analyzer.analyzeAndCreateSchema(List.of(innermost), 0); + assertThat(schema).isNotNull(); + assertThat(schema.getName()).isEqualTo("typed_value"); + + int arrayDepth = countArrayDepth(schema); + assertThat(arrayDepth).isEqualTo(5); + } + + @Test + public void testIntermediateFieldCapLimitsTrackedFields() { + int numFields = 1500; + String[] fieldNames = new String[numFields]; + for (int i = 0; i < numFields; i++) { + fieldNames[i] = String.format(Locale.ROOT, "field_%04d", i); + } + + VariantMetadata meta = Variants.metadata(fieldNames); + ShreddedObject obj = Variants.object(meta); + for (String name : fieldNames) { + obj.put(name, Variants.of(42)); + } + + DirectAnalyzer analyzer = new DirectAnalyzer(); + Type schema = analyzer.analyzeAndCreateSchema(List.of(obj), 0); + + assertThat(schema).isNotNull(); + assertThat(schema).isInstanceOf(GroupType.class); + GroupType typedValue = (GroupType) schema; + assertThat(typedValue.getFieldCount()).isLessThanOrEqualTo(300).isGreaterThan(0); + } + + @Test + public void testFieldCapAllowsExistingFieldUpdates() { + int numFields = 1500; + String[] fieldNames = new 
String[numFields]; + for (int i = 0; i < numFields; i++) { + fieldNames[i] = String.format(Locale.ROOT, "field_%04d", i); + } + + VariantMetadata meta = Variants.metadata(fieldNames); + + ShreddedObject row1 = Variants.object(meta); + for (String name : fieldNames) { + row1.put(name, Variants.of(42)); + } + + ShreddedObject row2 = Variants.object(meta); + for (int i = 0; i < 10; i++) { + row2.put(fieldNames[i], Variants.of("text")); + } + + ShreddedObject row3 = Variants.object(meta); + for (int i = 0; i < 10; i++) { + row3.put(fieldNames[i], Variants.of(99)); + } + + DirectAnalyzer analyzer = new DirectAnalyzer(); + Type schema = analyzer.analyzeAndCreateSchema(List.of(row1, row2, row3), 0); + + assertThat(schema).isNotNull(); + assertThat(schema).isInstanceOf(GroupType.class); + GroupType typedValue = (GroupType) schema; + assertThat(typedValue.getFieldCount()).isGreaterThan(0).isLessThanOrEqualTo(300); + } + + @Test + public void testNestedObjectsWithinDepthLimit() { + VariantMetadata cityMeta = Variants.metadata("city"); + ShreddedObject city = Variants.object(cityMeta); + city.put("city", Variants.of("NYC")); + + VariantMetadata addrMeta = Variants.metadata("address"); + ShreddedObject addr = Variants.object(addrMeta); + addr.put("address", city); + + VariantMetadata rootMeta = Variants.metadata("user"); + ShreddedObject root = Variants.object(rootMeta); + root.put("user", addr); + + DirectAnalyzer analyzer = new DirectAnalyzer(); + Type schema = analyzer.analyzeAndCreateSchema(List.of(root), 0); + + assertThat(schema).isNotNull(); + GroupType rootTv = schema.asGroupType(); + assertThat(rootTv.getName()).isEqualTo("typed_value"); + + // user -> typed_value -> address -> typed_value -> city -> typed_value (STRING) + GroupType userGroup = rootTv.getType("user").asGroupType(); + assertThat(userGroup.containsField("value")).isTrue(); + assertThat(userGroup.containsField("typed_value")).isTrue(); + + GroupType addrTv = 
userGroup.getType("typed_value").asGroupType(); + GroupType addrGroup = addrTv.getType("address").asGroupType(); + assertThat(addrGroup.containsField("typed_value")).isTrue(); + + GroupType cityTv = addrGroup.getType("typed_value").asGroupType(); + GroupType cityGroup = cityTv.getType("city").asGroupType(); + assertThat(cityGroup.containsField("typed_value")).isTrue(); + + PrimitiveType cityPrimitive = cityGroup.getType("typed_value").asPrimitiveType(); + assertThat(cityPrimitive.getPrimitiveTypeName()) + .isEqualTo(PrimitiveType.PrimitiveTypeName.BINARY); + assertThat(cityPrimitive.getLogicalTypeAnnotation()) + .isEqualTo(LogicalTypeAnnotation.stringType()); + } + + @Test + public void testDecimalForExceedingPrecision() { + DirectAnalyzer analyzer = new DirectAnalyzer(); + // Value 1: 30 integer digits, 0 fractional -> precision=30, scale=0, intDigits=30 + // Value 2: 1 integer digit, 20 fractional -> precision=21, scale=20, intDigits=1 + // Combined: maxIntDigits=30, maxScale=20, raw sum=50 -> capped to precision=38, + // scale=min(20, 38-30)=8 (integer digits get priority) + VariantMetadata meta = Variants.metadata("val"); + ShreddedObject row1 = Variants.object(meta); + row1.put("val", Variants.of(new BigDecimal("123456789012345678901234567890"))); + + ShreddedObject row2 = Variants.object(meta); + row2.put("val", Variants.of(new BigDecimal("1.23456789012345678901"))); + + Type schema = analyzer.analyzeAndCreateSchema(List.of(row1, row2), 0); + assertThat(schema).isNotNull(); + + GroupType typedValue = schema.asGroupType(); + GroupType valGroup = typedValue.getType("val").asGroupType(); + PrimitiveType valPrimitive = valGroup.getType("typed_value").asPrimitiveType(); + + LogicalTypeAnnotation.DecimalLogicalTypeAnnotation decimal = + (LogicalTypeAnnotation.DecimalLogicalTypeAnnotation) + valPrimitive.getLogicalTypeAnnotation(); + assertThat(decimal).isNotNull(); + assertThat(decimal.getPrecision()).isEqualTo(38); + // With 30 integer digits, scale is capped to 
38 - 30 = 8 (integer digits get priority) + assertThat(decimal.getScale()).isEqualTo(8); + assertThat(decimal.getScale()).isLessThanOrEqualTo(decimal.getPrecision()); + + // Physical type should be FIXED_LEN_BYTE_ARRAY since precision > 18 + assertThat(valPrimitive.getPrimitiveTypeName()) + .isEqualTo(PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY); + } + + @Test + public void testDecimalForExactPrecision() { + DirectAnalyzer analyzer = new DirectAnalyzer(); + + // Value with exactly precision=38: 20 integer digits + 18 scale = 38 + VariantMetadata meta = Variants.metadata("val"); + ShreddedObject row = Variants.object(meta); + row.put("val", Variants.of(new BigDecimal("12345678901234567890.123456789012345678"))); + + Type schema = analyzer.analyzeAndCreateSchema(List.of(row), 0); + assertThat(schema).isNotNull(); + + GroupType typedValue = schema.asGroupType(); + GroupType valGroup = typedValue.getType("val").asGroupType(); + PrimitiveType valPrimitive = valGroup.getType("typed_value").asPrimitiveType(); + + LogicalTypeAnnotation.DecimalLogicalTypeAnnotation decimal = + (LogicalTypeAnnotation.DecimalLogicalTypeAnnotation) + valPrimitive.getLogicalTypeAnnotation(); + assertThat(decimal.getPrecision()).isEqualTo(38); + assertThat(decimal.getScale()).isEqualTo(18); + } + + @Test + public void testInfrequentFieldsArePruned() { + DirectAnalyzer analyzer = new DirectAnalyzer(); + + // 100 rows: "common" in all, "rare" in only 5 (below MIN_FIELD_FREQUENCY = 0.10) + List rows = buildPruningTestRows(5, obj -> obj); + + Type schema = analyzer.analyzeAndCreateSchema(rows, 0); + assertThat(schema).isNotNull(); + + GroupType group = schema.asGroupType(); + assertThat(group.containsField("common")).isTrue(); + assertThat(group.containsField("rare")).isFalse(); + } + + @Test + public void testEmptyArrayReturnsNull() { + DirectAnalyzer analyzer = new DirectAnalyzer(); + + // All rows are empty arrays, no element type to infer + List rows = List.of(Variants.array(), 
Variants.array(), Variants.array()); + + Type schema = analyzer.analyzeAndCreateSchema(rows, 0); + assertThat(schema).isNull(); + } + + @Test + public void testRootPrimitiveProducesTypedValue() { + DirectAnalyzer analyzer = new DirectAnalyzer(); + + // root type is primitive + List rows = List.of(Variants.of("hello"), Variants.of("world"), Variants.of("x")); + + Type schema = analyzer.analyzeAndCreateSchema(rows, 0); + assertThat(schema).isNotNull(); + assertThat(schema.getName()).isEqualTo("typed_value"); + assertThat(schema.isPrimitive()).isTrue(); + assertThat(schema.asPrimitiveType().getLogicalTypeAnnotation()) + .isEqualTo(LogicalTypeAnnotation.stringType()); + } + + @Test + public void testRootArrayOfObjectsPrunesInfrequentFields() { + DirectAnalyzer analyzer = new DirectAnalyzer(); + + // 100 arrays: "common" in all, "rare" in only 3 (below MIN_FIELD_FREQUENCY = 0.10) + List rows = + buildPruningTestRows( + 3, + obj -> { + ValueArray arr = Variants.array(); + arr.add(obj); + return arr; + }); + + Type schema = analyzer.analyzeAndCreateSchema(rows, 0); + assertThat(schema).isNotNull(); + + GroupType listType = schema.asGroupType(); + assertThat(listType.getLogicalTypeAnnotation()) + .isInstanceOf(LogicalTypeAnnotation.ListLogicalTypeAnnotation.class); + GroupType repeatedGroup = listType.getType(0).asGroupType(); + GroupType elementGroup = repeatedGroup.getType(0).asGroupType(); + assertThat(elementGroup.containsField("typed_value")).isTrue(); + GroupType objectFields = elementGroup.getType("typed_value").asGroupType(); + assertThat(objectFields.containsField("common")).isTrue(); + assertThat(objectFields.containsField("rare")).isFalse(); + } + + @Test + public void testObjectWithArrayChildPrunesNestedFields() { + DirectAnalyzer analyzer = new DirectAnalyzer(); + + VariantMetadata itemMeta = Variants.metadata("name", "rare"); + VariantMetadata rootMeta = Variants.metadata("items"); + + // 100 rows, "rare" appears in only 3 rows (below MIN_FIELD_FREQUENCY = 
0.10) + List rows = Lists.newArrayList(); + for (int i = 0; i < 100; i++) { + ShreddedObject item = Variants.object(itemMeta); + item.put("name", Variants.of("item_" + i)); + if (i < 3) { + item.put("rare", Variants.of(1)); + } + ValueArray arr = Variants.array(); + arr.add(item); + ShreddedObject root = Variants.object(rootMeta); + root.put("items", arr); + rows.add(root); + } + + Type schema = analyzer.analyzeAndCreateSchema(rows, 0); + assertThat(schema).isNotNull(); + + GroupType rootTv = schema.asGroupType(); + GroupType itemsGroup = rootTv.getType("items").asGroupType(); + assertThat(itemsGroup.containsField("typed_value")).isTrue(); + GroupType listType = itemsGroup.getType("typed_value").asGroupType(); + GroupType repeatedGroup = listType.getType(0).asGroupType(); + GroupType elementGroup = repeatedGroup.getType(0).asGroupType(); + assertThat(elementGroup.containsField("typed_value")).isTrue(); + GroupType elementFields = elementGroup.getType("typed_value").asGroupType(); + assertThat(elementFields.containsField("name")).isTrue(); + assertThat(elementFields.containsField("rare")).isFalse(); + } + + @Test + public void testLongArrayInFewRowsSurvivesPruning() { + DirectAnalyzer analyzer = new DirectAnalyzer(); + + VariantMetadata itemMeta = Variants.metadata("key"); + + // 2 of 100 rows have 500-element arrays with {"key": N}. Per-element counting gives + // observationCount=1000, so key survives the 10% pruning threshold. 
+ List rows = Lists.newArrayList(); + for (int i = 0; i < 100; i++) { + ValueArray arr = Variants.array(); + if (i < 2) { + for (int j = 0; j < 500; j++) { + ShreddedObject item = Variants.object(itemMeta); + item.put("key", Variants.of(j)); + arr.add(item); + } + } else { + arr.add(Variants.of("no_key")); + } + rows.add(arr); + } + + Type schema = analyzer.analyzeAndCreateSchema(rows, 0); + assertThat(schema).isNotNull(); + + GroupType listType = schema.asGroupType(); + GroupType repeatedGroup = listType.getType(0).asGroupType(); + GroupType elementGroup = repeatedGroup.getType(0).asGroupType(); + assertThat(elementGroup.containsField("typed_value")).isTrue(); + GroupType elementFields = elementGroup.getType("typed_value").asGroupType(); + assertThat(elementFields.containsField("key")).isTrue(); + } + + /** + * Builds 100 variant rows where "common" appears in every row and "rare" appears in only {@code + * rareCount} rows (below MIN_FIELD_FREQUENCY = 0.10 when rareCount < 10). + */ + private static List buildPruningTestRows( + int rareCount, Function wrap) { + VariantMetadata meta = Variants.metadata("common", "rare"); + List rows = Lists.newArrayList(); + for (int i = 0; i < 100; i++) { + ShreddedObject obj = Variants.object(meta); + obj.put("common", Variants.of(i)); + if (i < rareCount) { + obj.put("rare", Variants.of("text")); + } + rows.add(wrap.apply(obj)); + } + return rows; + } + + /** Count typed_value group nesting depth along field "a". 
*/ + private static int countObjectDepth(Type type) { + int depth = 0; + Type current = type; + while (current != null && "typed_value".equals(current.getName()) && !current.isPrimitive()) { + depth++; + GroupType group = current.asGroupType(); + if (group.containsField("a")) { + GroupType fieldGroup = group.getType("a").asGroupType(); + if (fieldGroup.containsField("typed_value")) { + current = fieldGroup.getType("typed_value"); + } else { + break; + } + } else { + break; + } + } + return depth; + } + + /** Count nested array (LIST) levels in the schema. */ + private static int countArrayDepth(Type type) { + int depth = 0; + Type current = type; + while (current != null && !current.isPrimitive()) { + if (!"typed_value".equals(current.getName())) { + break; + } + GroupType group = current.asGroupType(); + if (!(group.getLogicalTypeAnnotation() + instanceof LogicalTypeAnnotation.ListLogicalTypeAnnotation)) { + break; + } + depth++; + GroupType listGroup = group.getType(0).asGroupType(); + GroupType elementGroup = listGroup.getType(0).asGroupType(); + if (elementGroup.containsField("typed_value")) { + current = elementGroup.getType("typed_value"); + } else { + break; + } + } + return depth; + } +} diff --git a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/SparkSQLProperties.java b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/SparkSQLProperties.java index 161f09d53e2c..af549dfd8e7a 100644 --- a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/SparkSQLProperties.java +++ b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/SparkSQLProperties.java @@ -114,4 +114,12 @@ private SparkSQLProperties() {} public static final String ASYNC_MICRO_BATCH_PLANNING_ENABLED = "spark.sql.iceberg.async-micro-batch-planning-enabled"; public static final boolean ASYNC_MICRO_BATCH_PLANNING_ENABLED_DEFAULT = false; + + // Controls whether to shred variant columns during write operations + public static final String SHRED_VARIANTS = 
"spark.sql.iceberg.shred-variants"; + + // Controls the buffer size for variant schema inference during writes + // This determines how many rows are buffered before inferring shredded schema + public static final String VARIANT_INFERENCE_BUFFER_SIZE = + "spark.sql.iceberg.variant-inference-buffer-size"; } diff --git a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/SparkWriteConf.java b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/SparkWriteConf.java index 2296c076f0c4..80f93427805a 100644 --- a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/SparkWriteConf.java +++ b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/SparkWriteConf.java @@ -33,6 +33,8 @@ import static org.apache.iceberg.TableProperties.ORC_COMPRESSION_STRATEGY; import static org.apache.iceberg.TableProperties.PARQUET_COMPRESSION; import static org.apache.iceberg.TableProperties.PARQUET_COMPRESSION_LEVEL; +import static org.apache.iceberg.TableProperties.PARQUET_SHRED_VARIANTS; +import static org.apache.iceberg.TableProperties.PARQUET_VARIANT_BUFFER_SIZE; import static org.apache.spark.sql.connector.write.RowLevelOperation.Command.DELETE; import java.util.Locale; @@ -504,6 +506,14 @@ private Map dataWriteProperties() { if (parquetCompressionLevel != null) { writeProperties.put(PARQUET_COMPRESSION_LEVEL, parquetCompressionLevel); } + boolean shouldShredVariants = shredVariants(); + writeProperties.put(PARQUET_SHRED_VARIANTS, String.valueOf(shouldShredVariants)); + + // Add variant shredding configuration properties + if (shouldShredVariants) { + writeProperties.put( + PARQUET_VARIANT_BUFFER_SIZE, String.valueOf(variantInferenceBufferSize())); + } break; case AVRO: @@ -724,4 +734,24 @@ public DeleteGranularity deleteGranularity() { .defaultValue(DeleteGranularity.FILE) .parse(); } + + public boolean shredVariants() { + return confParser + .booleanConf() + .option(SparkWriteOptions.SHRED_VARIANTS) + .sessionConf(SparkSQLProperties.SHRED_VARIANTS) + 
.tableProperty(TableProperties.PARQUET_SHRED_VARIANTS) + .defaultValue(TableProperties.PARQUET_SHRED_VARIANTS_DEFAULT) + .parse(); + } + + public int variantInferenceBufferSize() { + return confParser + .intConf() + .option(SparkWriteOptions.VARIANT_INFERENCE_BUFFER_SIZE) + .sessionConf(SparkSQLProperties.VARIANT_INFERENCE_BUFFER_SIZE) + .tableProperty(TableProperties.PARQUET_VARIANT_BUFFER_SIZE) + .defaultValue(TableProperties.PARQUET_VARIANT_BUFFER_SIZE_DEFAULT) + .parse(); + } } diff --git a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/SparkWriteOptions.java b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/SparkWriteOptions.java index 2b88d2bb1e44..621db891d46c 100644 --- a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/SparkWriteOptions.java +++ b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/SparkWriteOptions.java @@ -86,4 +86,10 @@ private SparkWriteOptions() {} // Overrides the delete granularity public static final String DELETE_GRANULARITY = "delete-granularity"; + + // Controls whether to shred variant columns during write operations + public static final String SHRED_VARIANTS = "shred-variants"; + + // Controls the buffer size for variant schema inference during writes + public static final String VARIANT_INFERENCE_BUFFER_SIZE = "variant-inference-buffer-size"; } diff --git a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkFormatModels.java b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkFormatModels.java index 23fbe54a4be3..5b7862116aea 100644 --- a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkFormatModels.java +++ b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkFormatModels.java @@ -51,7 +51,9 @@ public static void register() { StructType.class, SparkParquetWriters::buildWriter, (icebergSchema, fileSchema, engineSchema, idToConstant) -> - SparkParquetReaders.buildReader(icebergSchema, fileSchema, idToConstant))); + 
SparkParquetReaders.buildReader(icebergSchema, fileSchema, idToConstant), + new SparkVariantShreddingAnalyzer(), + InternalRow::copy)); FormatModelRegistry.register( ParquetFormatModel.create( diff --git a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkVariantShreddingAnalyzer.java b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkVariantShreddingAnalyzer.java new file mode 100644 index 000000000000..2c08c662c9da --- /dev/null +++ b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/SparkVariantShreddingAnalyzer.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.spark.source; + +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.List; +import org.apache.iceberg.parquet.VariantShreddingAnalyzer; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.variants.VariantMetadata; +import org.apache.iceberg.variants.VariantValue; +import org.apache.spark.sql.catalyst.InternalRow; +import org.apache.spark.sql.types.StructType; +import org.apache.spark.unsafe.types.VariantVal; + +/** + * Spark-specific implementation that extracts variant values from {@link InternalRow} instances. + */ +class SparkVariantShreddingAnalyzer extends VariantShreddingAnalyzer { + + SparkVariantShreddingAnalyzer() {} + + @Override + protected int resolveColumnIndex(StructType sparkSchema, String columnName) { + try { + return sparkSchema.fieldIndex(columnName); + } catch (IllegalArgumentException e) { + return -1; + } + } + + @Override + protected List extractVariantValues( + List bufferedRows, int variantFieldIndex) { + List values = Lists.newArrayList(); + + for (InternalRow row : bufferedRows) { + if (!row.isNullAt(variantFieldIndex)) { + VariantVal variantVal = row.getVariant(variantFieldIndex); + if (variantVal != null) { + VariantValue variantValue = + VariantValue.from( + VariantMetadata.from( + ByteBuffer.wrap(variantVal.getMetadata()).order(ByteOrder.LITTLE_ENDIAN)), + ByteBuffer.wrap(variantVal.getValue()).order(ByteOrder.LITTLE_ENDIAN)); + values.add(variantValue); + } + } + } + + return values; + } +} diff --git a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/TestSparkWriteConf.java b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/TestSparkWriteConf.java index 383a21087d7f..336067c31235 100644 --- a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/TestSparkWriteConf.java +++ b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/TestSparkWriteConf.java @@ -34,6 +34,7 @@ import static 
org.apache.iceberg.TableProperties.ORC_COMPRESSION_STRATEGY; import static org.apache.iceberg.TableProperties.PARQUET_COMPRESSION; import static org.apache.iceberg.TableProperties.PARQUET_COMPRESSION_LEVEL; +import static org.apache.iceberg.TableProperties.PARQUET_SHRED_VARIANTS; import static org.apache.iceberg.TableProperties.UPDATE_DISTRIBUTION_MODE; import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE; import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE_HASH; @@ -345,6 +346,8 @@ public void testSparkConfOverride() { TableProperties.DELETE_PARQUET_COMPRESSION, "snappy"), ImmutableMap.of( + PARQUET_SHRED_VARIANTS, + "false", DELETE_PARQUET_COMPRESSION, "zstd", PARQUET_COMPRESSION, @@ -467,6 +470,8 @@ public void testDataPropsDefaultsAsDeleteProps() { PARQUET_COMPRESSION_LEVEL, "5"), ImmutableMap.of( + PARQUET_SHRED_VARIANTS, + "false", DELETE_PARQUET_COMPRESSION, "zstd", PARQUET_COMPRESSION, @@ -538,6 +543,8 @@ public void testDeleteFileWriteConf() { DELETE_PARQUET_COMPRESSION_LEVEL, "6"), ImmutableMap.of( + PARQUET_SHRED_VARIANTS, + "false", DELETE_PARQUET_COMPRESSION, "zstd", PARQUET_COMPRESSION, @@ -698,4 +705,81 @@ private void checkMode(DistributionMode expectedMode, SparkWriteConf writeConf) assertThat(writeConf.copyOnWriteDistributionMode(MERGE)).isEqualTo(expectedMode); assertThat(writeConf.positionDeltaDistributionMode(MERGE)).isEqualTo(expectedMode); } + + @TestTemplate + public void testShredVariantsDefault() { + Table table = validationCatalog.loadTable(tableIdent); + SparkWriteConf writeConf = new SparkWriteConf(spark, table); + assertThat(writeConf.shredVariants()).isFalse(); + } + + @TestTemplate + public void testVariantInferenceBufferSizeDefault() { + Table table = validationCatalog.loadTable(tableIdent); + SparkWriteConf writeConf = new SparkWriteConf(spark, table); + assertThat(writeConf.variantInferenceBufferSize()) + .isEqualTo(TableProperties.PARQUET_VARIANT_BUFFER_SIZE_DEFAULT); + } + + @TestTemplate 
+ public void testVariantInferenceBufferSizeTableProperty() { + Table table = validationCatalog.loadTable(tableIdent); + + table.updateProperties().set(TableProperties.PARQUET_VARIANT_BUFFER_SIZE, "500").commit(); + + SparkWriteConf writeConf = new SparkWriteConf(spark, table); + assertThat(writeConf.variantInferenceBufferSize()).isEqualTo(500); + } + + @TestTemplate + public void testShredVariantsSessionOverridesTableProperty() { + Table table = validationCatalog.loadTable(tableIdent); + table.updateProperties().set(TableProperties.PARQUET_SHRED_VARIANTS, "false").commit(); + + withSQLConf( + ImmutableMap.of(SparkSQLProperties.SHRED_VARIANTS, "true"), + () -> { + SparkWriteConf writeConf = new SparkWriteConf(spark, table); + assertThat(writeConf.shredVariants()).isTrue(); + }); + } + + @TestTemplate + public void testShredVariantsWriteOptionOverridesSessionConf() { + withSQLConf( + ImmutableMap.of(SparkSQLProperties.SHRED_VARIANTS, "false"), + () -> { + Table table = validationCatalog.loadTable(tableIdent); + SparkWriteConf writeConf = + new SparkWriteConf( + spark, + table, + new CaseInsensitiveStringMap( + ImmutableMap.of(SparkWriteOptions.SHRED_VARIANTS, "true"))); + assertThat(writeConf.shredVariants()).isTrue(); + }); + } + + @TestTemplate + public void testVariantInferenceBufferSizeSessionConf() { + withSQLConf( + ImmutableMap.of(SparkSQLProperties.VARIANT_INFERENCE_BUFFER_SIZE, "250"), + () -> { + Table table = validationCatalog.loadTable(tableIdent); + SparkWriteConf writeConf = new SparkWriteConf(spark, table); + assertThat(writeConf.variantInferenceBufferSize()).isEqualTo(250); + }); + } + + @TestTemplate + public void testWritePropertiesIncludeVariantShredding() { + Table table = validationCatalog.loadTable(tableIdent); + table.updateProperties().set(TableProperties.PARQUET_SHRED_VARIANTS, "true").commit(); + table.updateProperties().set(TableProperties.PARQUET_VARIANT_BUFFER_SIZE, "200").commit(); + + SparkWriteConf writeConf = new 
SparkWriteConf(spark, table); + Map writeProperties = writeConf.writeProperties(); + assertThat(writeProperties).containsEntry(PARQUET_SHRED_VARIANTS, "true"); + assertThat(writeProperties).containsEntry(TableProperties.PARQUET_VARIANT_BUFFER_SIZE, "200"); + } } diff --git a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/variant/TestVariantShredding.java b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/variant/TestVariantShredding.java new file mode 100644 index 000000000000..8cdcf22e5817 --- /dev/null +++ b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/variant/TestVariantShredding.java @@ -0,0 +1,1101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.spark.variant; + +import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.METASTOREURIS; +import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES; +import static org.apache.parquet.schema.Types.optional; +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.IOException; +import java.math.BigDecimal; +import java.net.InetAddress; +import java.util.List; +import java.util.Map; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.spark.CatalogTestBase; +import org.apache.iceberg.spark.SparkCatalogConfig; +import org.apache.iceberg.spark.SparkSQLProperties; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.variants.Variant; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.util.HadoopInputFile; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.LogicalTypeAnnotation; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.catalyst.analysis.NoSuchTableException; +import org.apache.spark.sql.internal.SQLConf; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; + +public class TestVariantShredding extends CatalogTestBase { + + private static final Schema SCHEMA = + new Schema( + Types.NestedField.required(1, "id", 
Types.IntegerType.get()), + Types.NestedField.optional(2, "address", Types.VariantType.get())); + + private static final Schema SCHEMA2 = + new Schema( + Types.NestedField.required(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "address", Types.VariantType.get()), + Types.NestedField.optional(3, "metadata", Types.VariantType.get())); + + @Parameters(name = "catalogName = {0}, implementation = {1}, config = {2}") + protected static Object[][] parameters() { + return new Object[][] { + { + SparkCatalogConfig.HADOOP.catalogName(), + SparkCatalogConfig.HADOOP.implementation(), + SparkCatalogConfig.HADOOP.properties() + }, + }; + } + + @BeforeAll + public static void startMetastoreAndSpark() { + // First call parent to initialize metastore and spark with local[2] + CatalogTestBase.startMetastoreAndSpark(); + + // Now stop and recreate spark with local[1] to write all rows to a single file + if (spark != null) { + spark.stop(); + } + + spark = + SparkSession.builder() + .master("local[1]") // Use one thread to write the rows to a single parquet file + .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(SQLConf.PARTITION_OVERWRITE_MODE().key(), "dynamic") + .config("spark.hadoop." 
+ METASTOREURIS.varname, hiveConf.get(METASTOREURIS.varname)) + .config("spark.sql.legacy.respectNullabilityInTextDatasetConversion", "true") + .config(DISABLE_UI) + .enableHiveSupport() + .getOrCreate(); + + sparkContext = JavaSparkContext.fromSparkContext(spark.sparkContext()); + } + + @BeforeEach + public void before() { + super.before(); + validationCatalog.createTable( + tableIdent, SCHEMA, null, Map.of(TableProperties.FORMAT_VERSION, "3")); + } + + @AfterEach + public void after() { + spark.conf().unset(SparkSQLProperties.SHRED_VARIANTS); + spark.conf().unset(SparkSQLProperties.VARIANT_INFERENCE_BUFFER_SIZE); + validationCatalog.dropTable(tableIdent, true); + } + + @TestTemplate + public void testVariantShreddingDisabled() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "false"); + + String values = "(1, parse_json('{\"city\": \"NYC\", \"zip\": 10001}')), (2, null)"; + sql("INSERT INTO %s VALUES %s", tableName, values); + + GroupType address = variant("address", 2, Type.Repetition.OPTIONAL); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testExcludingNullValue() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + String values = + """ + (1, parse_json('{"name": "Alice", "age": 30, "dummy": null}')),\ + (2, parse_json('{"name": "Bob", "age": 25}')),\ + (3, parse_json('{"name": "Charlie", "age": 35}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); + + GroupType name = + field( + "name", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, LogicalTypeAnnotation.stringType())); + GroupType age = + field( + "age", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.INT32, LogicalTypeAnnotation.intType(8, true))); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, objectFields(age, name)); + MessageType 
expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testInconsistentType() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + String values = + """ + (1, parse_json('{"age": "25"}')),\ + (2, parse_json('{"age": 30}')),\ + (3, parse_json('{"age": "35"}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); + + GroupType age = + field( + "age", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, LogicalTypeAnnotation.stringType())); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, objectFields(age)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + + List rows = + sql("SELECT variant_get(address, '$.age', 'int') FROM %s WHERE id = 2", tableName); + assertThat(rows).hasSize(1); + assertThat(rows.get(0)[0]).isEqualTo(30); + } + + @TestTemplate + public void testPrimitiveType() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + String values = "(1, parse_json('123')), (2, parse_json('456')), (3, parse_json('789'))"; + sql("INSERT INTO %s VALUES %s", tableName, values); + + GroupType address = + variant( + "address", + 2, + Type.Repetition.REQUIRED, + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.INT32, LogicalTypeAnnotation.intType(16, true))); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testPrimitiveDecimalType() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + String values = + "(1, parse_json('123.56')), (2, parse_json('\"abc\"')), (3, parse_json('12.56'))"; + sql("INSERT INTO %s VALUES %s", tableName, values); + + 
GroupType address = + variant( + "address", + 2, + Type.Repetition.REQUIRED, + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.INT32, LogicalTypeAnnotation.decimalType(2, 5))); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testBooleanType() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + String values = + """ + (1, parse_json('{"active": true}')),\ + (2, parse_json('{"active": false}')),\ + (3, parse_json('{"active": true}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); + + GroupType active = field("active", shreddedPrimitive(PrimitiveType.PrimitiveTypeName.BOOLEAN)); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, objectFields(active)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testDecimalTypeWithInconsistentScales() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + String values = + """ + (1, parse_json('{"price": 123.456789}')),\ + (2, parse_json('{"price": 678.90}')),\ + (3, parse_json('{"price": 999.99}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); + + GroupType price = + field( + "price", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.INT32, LogicalTypeAnnotation.decimalType(6, 9))); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, objectFields(price)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testDecimalTypeWithConsistentScales() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + String values = + """ + 
(1, parse_json('{"price": 123.45}')),\ + (2, parse_json('{"price": 678.90}')),\ + (3, parse_json('{"price": 999.99}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); + + GroupType price = + field( + "price", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.INT32, LogicalTypeAnnotation.decimalType(2, 5))); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, objectFields(price)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testArrayType() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + String values = + """ + (1, parse_json('["java", "scala", "python"]')),\ + (2, parse_json('["rust", "go"]')),\ + (3, parse_json('["javascript"]'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); + + GroupType arr = + list( + element( + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, LogicalTypeAnnotation.stringType()))); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, arr); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testNestedArrayType() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + String values = + """ + (1, parse_json('{"tags": ["java", "scala", "python"]}')),\ + (2, parse_json('{"tags": ["rust", "go"]}')),\ + (3, parse_json('{"tags": ["javascript"]}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); + + GroupType tags = + field( + "tags", + list( + element( + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, + LogicalTypeAnnotation.stringType())))); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, objectFields(tags)); + MessageType expectedSchema = 
parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testNestedObjectType() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + String values = + """ + (1, parse_json('{"location": {"city": "Seattle", "zip": 98101}, "tags": ["java", "scala", "python"]}')),\ + (2, parse_json('{"location": {"city": "Portland", "zip": 97201}}')),\ + (3, parse_json('{"location": {"city": "NYC", "zip": 10001}}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); + + GroupType city = + field( + "city", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, LogicalTypeAnnotation.stringType())); + GroupType zip = + field( + "zip", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.INT32, LogicalTypeAnnotation.intType(32, true))); + GroupType location = field("location", objectFields(city, zip)); + GroupType tags = + field( + "tags", + list( + element( + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, + LogicalTypeAnnotation.stringType())))); + + GroupType address = + variant("address", 2, Type.Repetition.REQUIRED, objectFields(location, tags)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testLazyInitializationWithBufferedRows() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + spark.conf().set(SparkSQLProperties.VARIANT_INFERENCE_BUFFER_SIZE, "5"); + + String values = + """ + (1, parse_json('{"name": "Alice", "age": 30}')),\ + (2, parse_json('{"name": "Bob", "age": 25}')),\ + (3, parse_json('{"name": "Charlie", "age": 35}')),\ + (4, parse_json('{"name": "David", "age": 28}')),\ + (5, parse_json('{"name": "Eve", "age": 32}')),\ + (6, parse_json('{"name": "Frank", "age": 40}')),\ + (7, parse_json('{"name": "Grace", "age": 
27}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); + + GroupType name = + field( + "name", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, LogicalTypeAnnotation.stringType())); + GroupType age = + field( + "age", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.INT32, LogicalTypeAnnotation.intType(8, true))); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, objectFields(age, name)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + + long rowCount = spark.read().format("iceberg").load(tableName).count(); + assertThat(rowCount).isEqualTo(7); + } + + @TestTemplate + public void testMultipleRowGroups() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + spark.conf().set(SparkSQLProperties.VARIANT_INFERENCE_BUFFER_SIZE, "3"); + + int numRows = 1000; + StringBuilder valuesBuilder = new StringBuilder(); + for (int i = 1; i <= numRows; i++) { + if (i > 1) { + valuesBuilder.append(", "); + } + valuesBuilder.append( + String.format("(%d, parse_json('{\"name\": \"User%d\", \"age\": %d}'))", i, i, 20 + i)); + } + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%d')", + tableName, PARQUET_ROW_GROUP_SIZE_BYTES, 1024); + sql("INSERT INTO %s VALUES %s", tableName, valuesBuilder.toString()); + + GroupType name = + field( + "name", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, LogicalTypeAnnotation.stringType())); + GroupType age = + field( + "age", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.INT32, LogicalTypeAnnotation.intType(8, true))); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, objectFields(age, name)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + + long rowCount = 
spark.read().format("iceberg").load(tableName).count(); + assertThat(rowCount).isEqualTo(numRows); + } + + @TestTemplate + public void testColumnIndexTruncateLength() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + spark.conf().set(SparkSQLProperties.VARIANT_INFERENCE_BUFFER_SIZE, "3"); + + int customTruncateLength = 10; + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%d')", + tableName, "parquet.columnindex.truncate.length", customTruncateLength); + + StringBuilder valuesBuilder = new StringBuilder(); + for (int i = 1; i <= 10; i++) { + if (i > 1) { + valuesBuilder.append(", "); + } + String longValue = "A".repeat(20); + valuesBuilder.append( + String.format( + "(%d, parse_json('{\"description\": \"%s\", \"id\": %d}'))", i, longValue, i)); + } + sql("INSERT INTO %s VALUES %s", tableName, valuesBuilder.toString()); + + GroupType description = + field( + "description", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, LogicalTypeAnnotation.stringType())); + GroupType id = + field( + "id", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.INT32, LogicalTypeAnnotation.intType(8, true))); + GroupType address = + variant("address", 2, Type.Repetition.REQUIRED, objectFields(description, id)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + + long rowCount = spark.read().format("iceberg").load(tableName).count(); + assertThat(rowCount).isEqualTo(10); + } + + @TestTemplate + public void testIntegerFamilyPromotion() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + // Mix of INT8, INT16, INT32, INT64 - should promote to INT64 + String values = + """ + (1, parse_json('{"value": 10}')),\ + (2, parse_json('{"value": 1000}')),\ + (3, parse_json('{"value": 100000}')),\ + (4, parse_json('{"value": 10000000000}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); 
+ + GroupType value = + field( + "value", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.INT64, LogicalTypeAnnotation.intType(64, true))); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, objectFields(value)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testDecimalFamilyPromotion() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + // Test that they get promoted to the most capable decimal type observed + String values = + """ + (1, parse_json('{"value": 1.5}')),\ + (2, parse_json('{"value": 123.456789}')),\ + (3, parse_json('{"value": 123456789123456.789}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); + + GroupType value = + field( + "value", + optional(PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) + .length(16) + .as(LogicalTypeAnnotation.decimalType(6, 21)) + .named("typed_value")); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, objectFields(value)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testDataRoundTripWithShredding() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + String values = + """ + (1, parse_json('{"name": "Alice", "age": 30}')),\ + (2, parse_json('{"name": "Bob", "age": 25}')),\ + (3, parse_json('{"name": "Charlie", "age": 35}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); + + GroupType name = + field( + "name", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, LogicalTypeAnnotation.stringType())); + GroupType age = + field( + "age", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.INT32, LogicalTypeAnnotation.intType(8, true))); + GroupType address = 
variant("address", 2, Type.Repetition.REQUIRED, objectFields(age, name)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + + // Verify that we can read the data back correctly + List rows = + sql( + "SELECT id, variant_get(address, '$.name', 'string')," + + " variant_get(address, '$.age', 'int')" + + " FROM %s ORDER BY id", + tableName); + assertThat(rows).hasSize(3); + assertThat(rows.get(0)[0]).isEqualTo(1); + assertThat(rows.get(0)[1]).isEqualTo("Alice"); + assertThat(rows.get(0)[2]).isEqualTo(30); + assertThat(rows.get(1)[0]).isEqualTo(2); + assertThat(rows.get(1)[1]).isEqualTo("Bob"); + assertThat(rows.get(1)[2]).isEqualTo(25); + assertThat(rows.get(2)[0]).isEqualTo(3); + assertThat(rows.get(2)[1]).isEqualTo("Charlie"); + assertThat(rows.get(2)[2]).isEqualTo(35); + } + + @TestTemplate + public void testMultipleVariantsWithShredding() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + // Recreate table with SCHEMA2 (address + metadata variant columns) + validationCatalog.dropTable(tableIdent, true); + validationCatalog.createTable( + tableIdent, SCHEMA2, null, Map.of(TableProperties.FORMAT_VERSION, "3")); + + String values = + """ + (1, parse_json('{"city": "NYC"}'), parse_json('{"source": "web"}')),\ + (2, parse_json('{"city": "LA"}'), parse_json('{"source": "app"}')),\ + (3, parse_json('{"city": "SF"}'), parse_json('{"source": "api"}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); + + GroupType city = + field( + "city", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, LogicalTypeAnnotation.stringType())); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, objectFields(city)); + + GroupType source = + field( + "source", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, LogicalTypeAnnotation.stringType())); + GroupType metadata = variant("metadata", 3, 
Type.Repetition.REQUIRED, objectFields(source)); + MessageType expectedSchema = parquetSchema(address, metadata); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testVariantWithNullValues() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + String values = + """ + (1, parse_json('null')),\ + (2, parse_json('null')),\ + (3, parse_json('null'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); + + GroupType address = variant("address", 2, Type.Repetition.REQUIRED); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testArrayOfNullElementsWithShredding() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + sql( + "INSERT INTO %s VALUES (1, parse_json('[null, null, null]')), " + + "(2, parse_json('[null]'))", + tableName); + + // Array elements are all null, element type is null, falls back to unshredded + GroupType address = variant("address", 2, Type.Repetition.REQUIRED); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testMixedNullAndNonNullVariantValues() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + String values = + """ + (1, parse_json('{"name": "Alice", "age": 30}')),\ + (2, null),\ + (3, parse_json('{"name": "Charlie", "age": 35}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); + + GroupType name = + field( + "name", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, LogicalTypeAnnotation.stringType())); + GroupType age = + field( + "age", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.INT32, 
LogicalTypeAnnotation.intType(8, true))); + GroupType address = variant("address", 2, Type.Repetition.OPTIONAL, objectFields(age, name)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + + long rowCount = spark.read().format("iceberg").load(tableName).count(); + assertThat(rowCount).isEqualTo(3); + } + + @TestTemplate + public void testWriteOptionOverridesSessionConfig() throws IOException, NoSuchTableException { + // Disable shredding at session level + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "false"); + + // Enable shredding via per-write option + String query = + "SELECT 1 as id, parse_json('{\"name\": \"Alice\", \"age\": 30}') as address" + + " UNION ALL SELECT 2, parse_json('{\"name\": \"Bob\", \"age\": 25}')" + + " UNION ALL SELECT 3, parse_json('{\"name\": \"Charlie\", \"age\": 35}')"; + spark.sql(query).writeTo(tableName).option("shred-variants", "true").append(); + + GroupType name = + field( + "name", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, LogicalTypeAnnotation.stringType())); + GroupType age = + field( + "age", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.INT32, LogicalTypeAnnotation.intType(8, true))); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, objectFields(age, name)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testInfrequentFieldPruning() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + spark.conf().set(SparkSQLProperties.VARIANT_INFERENCE_BUFFER_SIZE, "11"); + + StringBuilder valuesBuilder = new StringBuilder(); + for (int i = 1; i <= 11; i++) { + if (i > 1) { + valuesBuilder.append(", "); + } + if (i == 1) { + // Only the first row has rare_field + valuesBuilder.append( + 
String.format( + "(%d, parse_json('{\"name\": \"User%d\", \"rare_field\": \"rare\"}'))", i, i)); + } else { + valuesBuilder.append(String.format("(%d, parse_json('{\"name\": \"User%d\"}'))", i, i)); + } + } + sql("INSERT INTO %s VALUES %s", tableName, valuesBuilder.toString()); + + // rare_field appears in 1/11 rows, should be pruned + // name appears in 11/11 rows and should be kept + GroupType name = + field( + "name", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, LogicalTypeAnnotation.stringType())); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, objectFields(name)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testMixedTypeTieBreaking() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + spark.conf().set(SparkSQLProperties.VARIANT_INFERENCE_BUFFER_SIZE, "10"); + + StringBuilder valuesBuilder = new StringBuilder(); + for (int i = 1; i <= 10; i++) { + if (i > 1) { + valuesBuilder.append(", "); + } + if (i <= 5) { + valuesBuilder.append(String.format("(%d, parse_json('{\"val\": %d}'))", i, i)); + } else { + valuesBuilder.append(String.format("(%d, parse_json('{\"val\": \"text%d\"}'))", i, i)); + } + } + sql("INSERT INTO %s VALUES %s", tableName, valuesBuilder.toString()); + + // 5 ints + 5 strings is a tie so STRING wins (higher TIE_BREAK_PRIORITY) + GroupType val = + field( + "val", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, LogicalTypeAnnotation.stringType())); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, objectFields(val)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + + // Verify data round-trips correctly + List rows = + sql("SELECT id, variant_get(address, '$.val', 'string') FROM 
%s ORDER BY id", tableName); + assertThat(rows).hasSize(10); + assertThat(rows.get(0)[1]).isEqualTo("1"); + assertThat(rows.get(5)[1]).isEqualTo("text6"); + } + + @TestTemplate + public void testFieldOnlyAfterBuffer() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + spark.conf().set(SparkSQLProperties.VARIANT_INFERENCE_BUFFER_SIZE, "3"); + + String values = + """ + (1, parse_json('{"name": "Alice"}')),\ + (2, parse_json('{"name": "Bob"}')),\ + (3, parse_json('{"name": "Charlie"}')),\ + (4, parse_json('{"name": "David", "score": 95}')),\ + (5, parse_json('{"name": "Eve", "score": 88}')),\ + (6, parse_json('{"name": "Frank", "score": 72}')),\ + (7, parse_json('{"name": "Grace", "score": 91}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); + + // Schema is determined from buffer (rows 1-3) which only has "name". + // "score" is not shredded + GroupType name = + field( + "name", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, LogicalTypeAnnotation.stringType())); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, objectFields(name)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + + // Verify all data round-trips despite "score" not being shredded + List rows = + sql( + "SELECT id, variant_get(address, '$.name', 'string')," + + " variant_get(address, '$.score', 'int')" + + " FROM %s ORDER BY id", + tableName); + assertThat(rows).hasSize(7); + assertThat(rows.get(0)[1]).isEqualTo("Alice"); + assertThat(rows.get(0)[2]).isNull(); + assertThat(rows.get(3)[1]).isEqualTo("David"); + assertThat(rows.get(3)[2]).isEqualTo(95); + assertThat(rows.get(6)[1]).isEqualTo("Grace"); + assertThat(rows.get(6)[2]).isEqualTo(91); + } + + @TestTemplate + public void testCrossFileDifferentShreddedType() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + 
spark.conf().set(SparkSQLProperties.VARIANT_INFERENCE_BUFFER_SIZE, "3"); + + // File 1: "score" is always integer → shredded as INT8 + String batch1 = + """ + (1, parse_json('{"score": 95}')),\ + (2, parse_json('{"score": 88}')),\ + (3, parse_json('{"score": 72}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, batch1); + + // Verify file 1 schema: score shredded as INT8 + Table table = validationCatalog.loadTable(tableIdent); + GroupType scoreInt = + field( + "score", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.INT32, LogicalTypeAnnotation.intType(8, true))); + MessageType expectedSchema1 = + parquetSchema(variant("address", 2, Type.Repetition.REQUIRED, objectFields(scoreInt))); + verifyParquetSchema(table, expectedSchema1); + + // File 2: "score" is always string → shredded as STRING + String batch2 = + """ + (4, parse_json('{"score": "high"}')),\ + (5, parse_json('{"score": "medium"}')),\ + (6, parse_json('{"score": "low"}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, batch2); + + // Query across both files, reader must handle different shredded types + List rows = + sql("SELECT id, variant_get(address, '$.score', 'string') FROM %s ORDER BY id", tableName); + assertThat(rows).hasSize(6); + assertThat(rows.get(0)[1]).isEqualTo("95"); + assertThat(rows.get(1)[1]).isEqualTo("88"); + assertThat(rows.get(3)[1]).isEqualTo("high"); + assertThat(rows.get(5)[1]).isEqualTo("low"); + } + + @TestTemplate + public void testAllNullVariantColumn() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + sql("INSERT INTO %s VALUES (1, null), (2, null), (3, null)", tableName); + + // All variant values are SQL NULL, so no shredding should occur + Table table = validationCatalog.loadTable(tableIdent); + MessageType expectedSchema = parquetSchema(variant("address", 2, Type.Repetition.OPTIONAL)); + verifyParquetSchema(table, expectedSchema); + + List rows = sql("SELECT id, address FROM %s ORDER BY id", tableName); + 
assertThat(rows).hasSize(3); + assertThat(rows.get(0)[1]).isNull(); + assertThat(rows.get(1)[1]).isNull(); + assertThat(rows.get(2)[1]).isNull(); + } + + @TestTemplate + public void testBufferSizeOne() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + spark.conf().set(SparkSQLProperties.VARIANT_INFERENCE_BUFFER_SIZE, "1"); + + sql( + """ + INSERT INTO %s VALUES + (1, parse_json('{"name": "Alice", "age": 30}')), + (2, parse_json('{"name": "Bob", "age": 25}')), + (3, parse_json('{"name": "Charlie", "age": 35}')) + """, + tableName); + + // Schema inferred from first row only, should still shred name and age + GroupType age = + field( + "age", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.INT32, LogicalTypeAnnotation.intType(8, true))); + GroupType name = + field( + "name", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, LogicalTypeAnnotation.stringType())); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, objectFields(age, name)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + + List rows = + sql("SELECT id, variant_get(address, '$.name', 'string') FROM %s ORDER BY id", tableName); + assertThat(rows).hasSize(3); + assertThat(rows.get(0)[1]).isEqualTo("Alice"); + assertThat(rows.get(2)[1]).isEqualTo("Charlie"); + } + + @TestTemplate + public void testDecimalFallbackAfterBuffer() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + spark.conf().set(SparkSQLProperties.VARIANT_INFERENCE_BUFFER_SIZE, "3"); + + // Buffer: scale=2, 3 integer digits -> DECIMAL(5,2) + // Row 4: precision overflow -> fallback to value field + // Row 5: scale overflow -> fallback to value field + // Row 6: fits typed column, scale widened from 1 to 2 via setScale + String values = + """ + (1, parse_json('{"val": 123.45}')),\ + (2, parse_json('{"val": 678.90}')),\ + (3, 
parse_json('{"val": 999.99}')),\ + (4, parse_json('{"val": 123456.78}')),\ + (5, parse_json('{"val": 1.2345}')),\ + (6, parse_json('{"val": 12.3}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); + + List rows = + sql( + "SELECT id, variant_get(address, '$.val', 'decimal(10,4)') FROM %s ORDER BY id", + tableName); + assertThat(rows).hasSize(6); + assertThat(rows.get(0)[1]).isEqualTo(new BigDecimal("123.4500")); + assertThat(rows.get(3)[1]).isEqualTo(new BigDecimal("123456.7800")); + assertThat(rows.get(4)[1]).isEqualTo(new BigDecimal("1.2345")); + assertThat(rows.get(5)[1]).isEqualTo(new BigDecimal("12.3000")); + } + + private void verifyParquetSchema(Table table, MessageType expectedSchema) throws IOException { + try (CloseableIterable tasks = table.newScan().planFiles()) { + assertThat(tasks).isNotEmpty(); + + for (FileScanTask task : tasks) { + String path = task.file().location(); + + HadoopInputFile inputFile = HadoopInputFile.fromPath(new Path(path), new Configuration()); + + try (ParquetFileReader reader = ParquetFileReader.open(inputFile)) { + MessageType actualSchema = reader.getFileMetaData().getSchema(); + assertThat(actualSchema).isEqualTo(expectedSchema); + } + } + } + } + + private static MessageType parquetSchema(Type... 
variantTypes) { + return org.apache.parquet.schema.Types.buildMessage() + .required(PrimitiveType.PrimitiveTypeName.INT32) + .id(1) + .named("id") + .addFields(variantTypes) + .named("table"); + } + + private static GroupType variant(String name, int fieldId, Type.Repetition repetition) { + return org.apache.parquet.schema.Types.buildGroup(repetition) + .id(fieldId) + .as(LogicalTypeAnnotation.variantType(Variant.VARIANT_SPEC_VERSION)) + .required(PrimitiveType.PrimitiveTypeName.BINARY) + .named("metadata") + .required(PrimitiveType.PrimitiveTypeName.BINARY) + .named("value") + .named(name); + } + + private static GroupType variant( + String name, int fieldId, Type.Repetition repetition, Type shreddedType) { + checkShreddedType(shreddedType); + return org.apache.parquet.schema.Types.buildGroup(repetition) + .id(fieldId) + .as(LogicalTypeAnnotation.variantType(Variant.VARIANT_SPEC_VERSION)) + .required(PrimitiveType.PrimitiveTypeName.BINARY) + .named("metadata") + .optional(PrimitiveType.PrimitiveTypeName.BINARY) + .named("value") + .addField(shreddedType) + .named(name); + } + + private static Type shreddedPrimitive(PrimitiveType.PrimitiveTypeName primitive) { + return optional(primitive).named("typed_value"); + } + + private static Type shreddedPrimitive( + PrimitiveType.PrimitiveTypeName primitive, LogicalTypeAnnotation annotation) { + return optional(primitive).as(annotation).named("typed_value"); + } + + private static GroupType objectFields(GroupType... 
fields) { + for (GroupType fieldType : fields) { + checkField(fieldType); + } + + return org.apache.parquet.schema.Types.buildGroup(Type.Repetition.OPTIONAL) + .addFields(fields) + .named("typed_value"); + } + + private static GroupType field(String name, Type shreddedType) { + checkShreddedType(shreddedType); + return org.apache.parquet.schema.Types.buildGroup(Type.Repetition.REQUIRED) + .optional(PrimitiveType.PrimitiveTypeName.BINARY) + .named("value") + .addField(shreddedType) + .named(name); + } + + private static GroupType element(Type shreddedType) { + return field("element", shreddedType); + } + + private static GroupType list(GroupType elementType) { + return org.apache.parquet.schema.Types.optionalList().element(elementType).named("typed_value"); + } + + private static void checkShreddedType(Type shreddedType) { + Preconditions.checkArgument( + shreddedType.getName().equals("typed_value"), + "Invalid shredded type name: %s should be typed_value", + shreddedType.getName()); + Preconditions.checkArgument( + shreddedType.isRepetition(Type.Repetition.OPTIONAL), + "Invalid shredded type repetition: %s should be OPTIONAL", + shreddedType.getRepetition()); + } + + private static void checkField(GroupType fieldType) { + Preconditions.checkArgument( + fieldType.isRepetition(Type.Repetition.REQUIRED), + "Invalid field type repetition: %s should be REQUIRED", + fieldType.getRepetition()); + } +} From 400ba927de303c12c69b368bbdea209ee2c82c5c Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Wed, 6 May 2026 17:12:20 -0400 Subject: [PATCH 169/197] AWS: Fix LICENSE/NOTICE compliance for aws-bundle (#16196) --- aws-bundle/LICENSE | 475 ++++++++++++++++++++++++++++++++++++++++++++- aws-bundle/NOTICE | 5 + 2 files changed, 477 insertions(+), 3 deletions(-) diff --git a/aws-bundle/LICENSE b/aws-bundle/LICENSE index f34a7e250c39..d8484c933f9e 100644 --- a/aws-bundle/LICENSE +++ b/aws-bundle/LICENSE @@ -250,9 +250,8 @@ License: Apache License, Version 2.0 - 
https://www.apache.org/licenses/LICENSE-2 This product bundles Reactive Streams. Project URL: http://reactive-streams.org -License: MIT -| MIT No Attribution -| +License: MIT-0 + | Copyright 2014 Reactive Streams | | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so. @@ -325,6 +324,7 @@ This product bundles checkerframework checker-qual. Copyright: 2004-2020 the Checker Framework developers Project URL: https://github.com/typetools/checker-framework License: MIT + | The annotations are licensed under the MIT License. (The text of this | license appears below.) More specifically, all the parts of the Checker | Framework that you might want to include with your own program use the @@ -367,3 +367,472 @@ This product bundles JCTools (via Netty). Project URL: https://github.com/JCTools/JCTools License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +This product bundles the Mozilla Public Suffix List (via Apache HttpComponents). + +Project URL: https://publicsuffix.org/ +License: Mozilla Public License, Version 2.0 - https://mozilla.org/MPL/2.0/ + +| Mozilla Public License Version 2.0 +| ================================== +| +| 1. Definitions +| -------------- +| +| 1.1. "Contributor" +| means each individual or legal entity that creates, contributes to +| the creation of, or owns Covered Software. +| +| 1.2. "Contributor Version" +| means the combination of the Contributions of others (if any) used +| by a Contributor and that particular Contributor's Contribution. +| +| 1.3. 
"Contribution" +| means Covered Software of a particular Contributor. +| +| 1.4. "Covered Software" +| means Source Code Form to which the initial Contributor has attached +| the notice in Exhibit A, the Executable Form of such Source Code +| Form, and Modifications of such Source Code Form, in each case +| including portions thereof. +| +| 1.5. "Incompatible With Secondary Licenses" +| means +| +| (a) that the initial Contributor has attached the notice described +| in Exhibit B to the Covered Software; or +| +| (b) that the Covered Software was made available under the terms of +| version 1.1 or earlier of the License, but not also under the +| terms of a Secondary License. +| +| 1.6. "Executable Form" +| means any form of the work other than Source Code Form. +| +| 1.7. "Larger Work" +| means a work that combines Covered Software with other material, in +| a separate file or files, that is not Covered Software. +| +| 1.8. "License" +| means this document. +| +| 1.9. "Licensable" +| means having the right to grant, to the maximum extent possible, +| whether at the time of the initial grant or subsequently, any and +| all of the rights conveyed by this License. +| +| 1.10. "Modifications" +| means any of the following: +| +| (a) any file in Source Code Form that results from an addition to, +| deletion from, or modification of the contents of Covered +| Software; or +| +| (b) any new file in Source Code Form that contains any Covered +| Software. +| +| 1.11. "Patent Claims" of a Contributor +| means any patent claim(s), including without limitation, method, +| process, and apparatus claims, in any patent Licensable by such +| Contributor that would be infringed, but for the grant of the +| License, by the making, using, selling, offering for sale, having +| made, import, or transfer of either its Contributions or its +| Contributor Version. +| +| 1.12. 
"Secondary License" +| means either the GNU General Public License, Version 2.0, the GNU +| Lesser General Public License, Version 2.1, the GNU Affero General +| Public License, Version 3.0, or any later versions of those +| licenses. +| +| 1.13. "Source Code Form" +| means the form of the work preferred for making modifications. +| +| 1.14. "You" (or "Your") +| means an individual or a legal entity exercising rights under this +| License. For legal entities, "You" includes any entity that +| controls, is controlled by, or is under common control with You. For +| purposes of this definition, "control" means (a) the power, direct +| or indirect, to cause the direction or management of such entity, +| whether by contract or otherwise, or (b) ownership of more than +| fifty percent (50%) of the outstanding shares or beneficial +| ownership of such entity. +| +| 2. License Grants and Conditions +| -------------------------------- +| +| 2.1. Grants +| +| Each Contributor hereby grants You a world-wide, royalty-free, +| non-exclusive license: +| +| (a) under intellectual property rights (other than patent or trademark) +| Licensable by such Contributor to use, reproduce, make available, +| modify, display, perform, distribute, and otherwise exploit its +| Contributions, either on an unmodified basis, with Modifications, or +| as part of a Larger Work; and +| +| (b) under Patent Claims of such Contributor to make, use, sell, offer +| for sale, have made, import, and otherwise transfer either its +| Contributions or its Contributor Version. +| +| 2.2. Effective Date +| +| The licenses granted in Section 2.1 with respect to any Contribution +| become effective for each Contribution on the date the Contributor first +| distributes such Contribution. +| +| 2.3. Limitations on Grant Scope +| +| The licenses granted in this Section 2 are the only rights granted under +| this License. 
No additional rights or licenses will be implied from the +| distribution or licensing of Covered Software under this License. +| Notwithstanding Section 2.1(b) above, no patent license is granted by a +| Contributor: +| +| (a) for any code that a Contributor has removed from Covered Software; +| or +| +| (b) for infringements caused by: (i) Your and any other third party's +| modifications of Covered Software, or (ii) the combination of its +| Contributions with other software (except as part of its Contributor +| Version); or +| +| (c) under Patent Claims infringed by Covered Software in the absence of +| its Contributions. +| +| This License does not grant any rights in the trademarks, service marks, +| or logos of any Contributor (except as may be necessary to comply with +| the notice requirements in Section 3.4). +| +| 2.4. Subsequent Licenses +| +| No Contributor makes additional grants as a result of Your choice to +| distribute the Covered Software under a subsequent version of this +| License (see Section 10.2) or under the terms of a Secondary License (if +| permitted under the terms of Section 3.3). +| +| 2.5. Representation +| +| Each Contributor represents that the Contributor believes its +| Contributions are its original creation(s) or it has sufficient rights +| to grant the rights to its Contributions conveyed by this License. +| +| 2.6. Fair Use +| +| This License is not intended to limit any rights You have under +| applicable copyright doctrines of fair use, fair dealing, or other +| equivalents. +| +| 2.7. Conditions +| +| Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted +| in Section 2.1. +| +| 3. Responsibilities +| ------------------- +| +| 3.1. Distribution of Source Form +| +| All distribution of Covered Software in Source Code Form, including any +| Modifications that You create or to which You contribute, must be under +| the terms of this License. 
You must inform recipients that the Source +| Code Form of the Covered Software is governed by the terms of this +| License, and how they can obtain a copy of this License. You may not +| attempt to alter or restrict the recipients' rights in the Source Code +| Form. +| +| 3.2. Distribution of Executable Form +| +| If You distribute Covered Software in Executable Form then: +| +| (a) such Covered Software must also be made available in Source Code +| Form, as described in Section 3.1, and You must inform recipients of +| the Executable Form how they can obtain a copy of such Source Code +| Form by reasonable means in a timely manner, at a charge no more +| than the cost of distribution to the recipient; and +| +| (b) You may distribute such Executable Form under the terms of this +| License, or sublicense it under different terms, provided that the +| license for the Executable Form does not attempt to limit or alter +| the recipients' rights in the Source Code Form under this License. +| +| 3.3. Distribution of a Larger Work +| +| You may create and distribute a Larger Work under terms of Your choice, +| provided that You also comply with the requirements of this License for +| the Covered Software. If the Larger Work is a combination of Covered +| Software with a work governed by one or more Secondary Licenses, and the +| Covered Software is not Incompatible With Secondary Licenses, this +| License permits You to additionally distribute such Covered Software +| under the terms of such Secondary License(s), so that the recipient of +| the Larger Work may, at their option, further distribute the Covered +| Software under the terms of either this License or such Secondary +| License(s). +| +| 3.4. 
Notices +| +| You may not remove or alter the substance of any license notices +| (including copyright notices, patent notices, disclaimers of warranty, +| or limitations of liability) contained within the Source Code Form of +| the Covered Software, except that You may alter any license notices to +| the extent required to remedy known factual inaccuracies. +| +| 3.5. Application of Additional Terms +| +| You may choose to offer, and to charge a fee for, warranty, support, +| indemnity or liability obligations to one or more recipients of Covered +| Software. However, You may do so only on Your own behalf, and not on +| behalf of any Contributor. You must make it absolutely clear that any +| such warranty, support, indemnity, or liability obligation is offered by +| You alone, and You hereby agree to indemnify every Contributor for any +| liability incurred by such Contributor as a result of warranty, support, +| indemnity or liability terms You offer. You may include additional +| disclaimers of warranty and limitations of liability specific to any +| jurisdiction. +| +| 4. Inability to Comply Due to Statute or Regulation +| --------------------------------------------------- +| +| If it is impossible for You to comply with any of the terms of this +| License with respect to some or all of the Covered Software due to +| statute, judicial order, or regulation then You must: (a) comply with +| the terms of this License to the maximum extent possible; and (b) +| describe the limitations and the code they affect. Such description must +| be placed in a text file included with all distributions of the Covered +| Software under this License. Except to the extent prohibited by statute +| or regulation, such description must be sufficiently detailed for a +| recipient of ordinary skill to be able to understand it. +| +| 5. Termination +| -------------- +| +| 5.1. 
The rights granted under this License will terminate automatically +| if You fail to comply with any of its terms. However, if You become +| compliant, then the rights granted under this License from a particular +| Contributor are reinstated (a) provisionally, unless and until such +| Contributor explicitly and finally terminates Your grants, and (b) on an +| ongoing basis, if such Contributor fails to notify You of the +| non-compliance by some reasonable means prior to 60 days after You have +| come back into compliance. Moreover, Your grants from a particular +| Contributor are reinstated on an ongoing basis if such Contributor +| notifies You of the non-compliance by some reasonable means, this is the +| first time You have received notice of non-compliance with this License +| from such Contributor, and You become compliant prior to 30 days after +| Your receipt of the notice. +| +| 5.2. If You initiate litigation against any entity by asserting a patent +| infringement claim (excluding declaratory judgment actions, +| counter-claims, and cross-claims) alleging that a Contributor Version +| directly or indirectly infringes any patent, then the rights granted to +| You by any and all Contributors for the Covered Software under Section +| 2.1 of this License shall terminate. +| +| 5.3. In the event of termination under Sections 5.1 or 5.2 above, all +| end user license agreements (excluding distributors and resellers) which +| have been validly granted by You or Your distributors under this License +| prior to termination shall survive termination. +| +| ************************************************************************ +| * * +| * 6. 
Disclaimer of Warranty * +| * ------------------------- * +| * * +| * Covered Software is provided under this License on an "as is" * +| * basis, without warranty of any kind, either expressed, implied, or * +| * statutory, including, without limitation, warranties that the * +| * Covered Software is free of defects, merchantable, fit for a * +| * particular purpose or non-infringing. The entire risk as to the * +| * quality and performance of the Covered Software is with You. * +| * Should any Covered Software prove defective in any respect, You * +| * (not any Contributor) assume the cost of any necessary servicing, * +| * repair, or correction. This disclaimer of warranty constitutes an * +| * essential part of this License. No use of any Covered Software is * +| * authorized under this License except under this disclaimer. * +| * * +| ************************************************************************ +| +| ************************************************************************ +| * * +| * 7. Limitation of Liability * +| * -------------------------- * +| * * +| * Under no circumstances and under no legal theory, whether tort * +| * (including negligence), contract, or otherwise, shall any * +| * Contributor, or anyone who distributes Covered Software as * +| * permitted above, be liable to You for any direct, indirect, * +| * special, incidental, or consequential damages of any character * +| * including, without limitation, damages for lost profits, loss of * +| * goodwill, work stoppage, computer failure or malfunction, or any * +| * and all other commercial damages or losses, even if such party * +| * shall have been informed of the possibility of such damages. This * +| * limitation of liability shall not apply to liability for death or * +| * personal injury resulting from such party's negligence to the * +| * extent applicable law prohibits such limitation. 
Some * +| * jurisdictions do not allow the exclusion or limitation of * +| * incidental or consequential damages, so this exclusion and * +| * limitation may not apply to You. * +| * * +| ************************************************************************ +| +| 8. Litigation +| ------------- +| +| Any litigation relating to this License may be brought only in the +| courts of a jurisdiction where the defendant maintains its principal +| place of business and such litigation shall be governed by laws of that +| jurisdiction, without reference to its conflict-of-law provisions. +| Nothing in this Section shall prevent a party's ability to bring +| cross-claims or counter-claims. +| +| 9. Miscellaneous +| ---------------- +| +| This License represents the complete agreement concerning the subject +| matter hereof. If any provision of this License is held to be +| unenforceable, such provision shall be reformed only to the extent +| necessary to make it enforceable. Any law or regulation which provides +| that the language of a contract shall be construed against the drafter +| shall not be used to construe this License against a Contributor. +| +| 10. Versions of the License +| --------------------------- +| +| 10.1. New Versions +| +| Mozilla Foundation is the license steward. Except as provided in Section +| 10.3, no one other than the license steward has the right to modify or +| publish new versions of this License. Each version will be given a +| distinguishing version number. +| +| 10.2. Effect of New Versions +| +| You may distribute the Covered Software under the terms of the version +| of the License under which You originally received the Covered Software, +| or under the terms of any subsequent version published by the license +| steward. +| +| 10.3. 
Modified Versions +| +| If you create software not governed by this License, and you want to +| create a new license for such software, you may create and use a +| modified version of this License if you rename the license and remove +| any references to the name of the license steward (except to note that +| such modified license differs from this License). +| +| 10.4. Distributing Source Code Form that is Incompatible With Secondary +| Licenses +| +| If You choose to distribute Source Code Form that is Incompatible With +| Secondary Licenses under the terms of this version of the License, the +| notice described in Exhibit B of this License must be attached. +| +| Exhibit A - Source Code Form License Notice +| ------------------------------------------- +| +| This Source Code Form is subject to the terms of the Mozilla Public +| License, v. 2.0. If a copy of the MPL was not distributed with this +| file, You can obtain one at http://mozilla.org/MPL/2.0/. +| +| If it is not possible or desirable to put the notice in a particular +| file, then You may include the notice in a location (such as a LICENSE +| file in a relevant directory) where a recipient would be likely to look +| for such a notice. +| +| You may add additional accurate notices of copyright ownership. +| +| Exhibit B - "Incompatible With Secondary Licenses" Notice +| --------------------------------------------------------- +| +| This Source Code Form is "Incompatible With Secondary Licenses", as +| defined by the Mozilla Public License, v. 2.0. + +-------------------------------------------------------------------------------- + +This product bundles FastDoubleParser (via Jackson JSON Processor, via AWS SDK third-party-jackson-core). 
+ +Copyright: 2023 Werner Randelshofer, Switzerland +Project URL: https://github.com/wrandelshofer/FastDoubleParser +License: MIT + +| Copyright (c) 2023 Werner Randelshofer, Switzerland +| +| Permission is hereby granted, free of charge, to any person obtaining a copy +| of this software and associated documentation files (the "Software"), to deal +| in the Software without restriction, including without limitation the rights +| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +| copies of the Software, and to permit persons to whom the Software is +| furnished to do so, subject to the following conditions: +| +| The above copyright notice and this permission notice shall be included in all +| copies or substantial portions of the Software. +| +| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +| SOFTWARE. + +-------------------------------------------------------------------------------- + +This product bundles fast_float (bundled by FastDoubleParser). 
+ +Copyright: 2021 The fast_float authors +Project URL: https://github.com/fastfloat/fast_float +License: MIT + +| Copyright (c) 2021 The fast_float authors +| +| Permission is hereby granted, free of charge, to any person obtaining a copy +| of this software and associated documentation files (the "Software"), to deal +| in the Software without restriction, including without limitation the rights +| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +| copies of the Software, and to permit persons to whom the Software is +| furnished to do so, subject to the following conditions: +| +| The above copyright notice and this permission notice shall be included in all +| copies or substantial portions of the Software. +| +| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +| SOFTWARE. + +-------------------------------------------------------------------------------- + +This product bundles bigint (bundled by FastDoubleParser). + +Copyright: 2022 Tim Buktu +Project URL: https://github.com/tbuktu/bigint +License: BSD 2-Clause + +| Copyright 2022 Tim Buktu +| +| Redistribution and use in source and binary forms, with or without +| modification, are permitted provided that the following conditions +| are met: +| +| 1. Redistributions of source code must retain the above copyright notice, this +| list of conditions and the following disclaimer. +| +| 2. 
Redistributions in binary form must reproduce the above copyright notice, +| this list of conditions and the following disclaimer in the documentation +| and/or other materials provided with the distribution. +| +| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +| ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +| WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +| DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +| FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/aws-bundle/NOTICE b/aws-bundle/NOTICE index 45a2fba1b43c..39738b74a297 100644 --- a/aws-bundle/NOTICE +++ b/aws-bundle/NOTICE @@ -332,3 +332,8 @@ This product bundles Netty with the following in its NOTICE file: | * license/LICENSE.brotli4j.txt (Apache License 2.0) | * HOMEPAGE: | * https://github.com/hyperxpro/Brotli4j + +-------------------------------------------------------------------------------- + +This product bundles AWS Analytics Accelerator S3 with the following in its NOTICE file: +| Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
From 334269cd7d5114564a406a3259aa9919e4590738 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Wed, 6 May 2026 17:12:46 -0400 Subject: [PATCH 170/197] Azure: Fix LICENSE, NOTICE, and runtime-deps for azure-bundle (#16181) --- azure-bundle/LICENSE | 179 ++++++++++++++++++++++++++++------ azure-bundle/NOTICE | 17 +++- azure-bundle/build.gradle | 10 +- azure-bundle/runtime-deps.txt | 1 - 4 files changed, 172 insertions(+), 35 deletions(-) diff --git a/azure-bundle/LICENSE b/azure-bundle/LICENSE index e8c049f4c33b..b0964f5e65ba 100644 --- a/azure-bundle/LICENSE +++ b/azure-bundle/LICENSE @@ -207,8 +207,7 @@ This product bundles Azure SDK for Java. Project URL: https://github.com/Azure/azure-sdk-for-java License: MIT -| The MIT License (MIT) -| + | Copyright (c) 2015 Microsoft | | Permission is hereby granted, free of charge, to any person obtaining a copy @@ -238,6 +237,91 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- +This product bundles FastDoubleParser (via Jackson JSON Processor). + +Copyright: 2023 Werner Randelshofer, Switzerland +Project URL: https://github.com/wrandelshofer/FastDoubleParser +License: MIT + +| Copyright (c) 2023 Werner Randelshofer, Switzerland +| +| Permission is hereby granted, free of charge, to any person obtaining a copy +| of this software and associated documentation files (the "Software"), to deal +| in the Software without restriction, including without limitation the rights +| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +| copies of the Software, and to permit persons to whom the Software is +| furnished to do so, subject to the following conditions: +| +| The above copyright notice and this permission notice shall be included in all +| copies or substantial portions of the Software. 
+| +| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +| SOFTWARE. + +-------------------------------------------------------------------------------- + +This product bundles fast_float (bundled by FastDoubleParser). + +Copyright: 2021 The fast_float authors +Project URL: https://github.com/fastfloat/fast_float +License: MIT + +| Copyright (c) 2021 The fast_float authors +| +| Permission is hereby granted, free of charge, to any person obtaining a copy +| of this software and associated documentation files (the "Software"), to deal +| in the Software without restriction, including without limitation the rights +| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +| copies of the Software, and to permit persons to whom the Software is +| furnished to do so, subject to the following conditions: +| +| The above copyright notice and this permission notice shall be included in all +| copies or substantial portions of the Software. +| +| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +| SOFTWARE. 
+ +-------------------------------------------------------------------------------- + +This product bundles bigint (bundled by FastDoubleParser). + +Copyright: 2022 Tim Buktu +Project URL: https://github.com/tbuktu/bigint +License: BSD 2-Clause + +| Copyright 2022 Tim Buktu +| +| Redistribution and use in source and binary forms, with or without +| modification, are permitted provided that the following conditions +| are met: +| +| 1. Redistributions of source code must retain the above copyright notice, this +| list of conditions and the following disclaimer. +| +| 2. Redistributions in binary form must reproduce the above copyright notice, +| this list of conditions and the following disclaimer in the documentation +| and/or other materials provided with the distribution. +| +| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +| ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +| WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +| DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +| FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- @@ -245,27 +329,53 @@ This product bundles Microsoft Authentication Library for Java. Project URL: https://github.com/AzureAD/microsoft-authentication-library-for-java License: MIT -| MIT License -| -| Copyright (c) Microsoft Corporation. All rights reserved. 
-| -| Permission is hereby granted, free of charge, to any person obtaining a copy -| of this software and associated documentation files (the "Software"), to deal -| in the Software without restriction, including without limitation the rights -| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -| copies of the Software, and to permit persons to whom the Software is -| furnished to do so, subject to the following conditions: -| -| The above copyright notice and this permission notice shall be included in all -| copies or substantial portions of the Software. -| -| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -| SOFTWARE + +| Copyright (c) Microsoft Corporation. All rights reserved. +| +| Permission is hereby granted, free of charge, to any person obtaining a copy +| of this software and associated documentation files (the "Software"), to deal +| in the Software without restriction, including without limitation the rights +| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +| copies of the Software, and to permit persons to whom the Software is +| furnished to do so, subject to the following conditions: +| +| The above copyright notice and this permission notice shall be included in all +| copies or substantial portions of the Software. +| +| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +| SOFTWARE + +-------------------------------------------------------------------------------- + +This product bundles MSAL4J Persistence Extension. + +Project URL: https://github.com/AzureAD/microsoft-authentication-library-for-java +License: MIT + +| Copyright (c) Microsoft Corporation. All rights reserved. +| +| Permission is hereby granted, free of charge, to any person obtaining a copy +| of this software and associated documentation files (the "Software"), to deal +| in the Software without restriction, including without limitation the rights +| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +| copies of the Software, and to permit persons to whom the Software is +| furnished to do so, subject to the following conditions: +| +| The above copyright notice and this permission notice shall be included in all +| copies or substantial portions of the Software. +| +| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +| SOFTWARE -------------------------------------------------------------------------------- @@ -276,6 +386,13 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- +This product bundles Apache Tomcat Native (netty-tcnative-classes and netty-tcnative-boringssl-static, bundled by Reactor Netty). + +Project URL: https://tomcat.apache.org/native-doc/ +License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + This product bundles Reactor Core. Project URL: https://github.com/reactor/reactor-core @@ -290,9 +407,16 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Reactor AddOns. +This product bundles Reactor Pool (bundled by Reactor Netty). + +Project URL: https://github.com/reactor/reactor-pool +License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +This product bundles Aalto XML (bundled by Azure SDK for Java). -Project URL: https://github.com/reactor/reactor-addons +Project URL: https://github.com/FasterXML/aalto-xml License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 -------------------------------------------------------------------------------- @@ -307,9 +431,8 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 This product bundles Reactive Streams. 
Project URL: http://www.reactive-streams.org/ -License: MIT -| MIT No Attribution -| +License: MIT-0 + | Copyright 2014 Reactive Streams | | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so. diff --git a/azure-bundle/NOTICE b/azure-bundle/NOTICE index 07f87c0bc05c..12325baf97d2 100644 --- a/azure-bundle/NOTICE +++ b/azure-bundle/NOTICE @@ -8,7 +8,6 @@ The Apache Software Foundation (http://www.apache.org/). -------------------------------------------------------------------------------- This product bundles Jackson JSON Processor with the following in its NOTICE file: -| | # Jackson JSON processor | | Jackson is a high-performance, Free/Open Source JSON processing library. @@ -16,6 +15,10 @@ This product bundles Jackson JSON Processor with the following in its NOTICE fil | been in development since 2007. | It is currently developed by a community of developers. | +| ## Copyright +| +| Copyright 2007-, Tatu Saloranta (tatu.saloranta@iki.fi) +| | ## Licensing | | Jackson 2.x core and extension components are licensed under Apache License 2.0 @@ -26,7 +29,17 @@ This product bundles Jackson JSON Processor with the following in its NOTICE fil | A list of contributors may be found from CREDITS(-2.x) file, which is included | in some artifacts (usually source distributions); but is always available | from the source code management (SCM) system project uses. -| +| +| ## FastDoubleParser +| +| jackson-core bundles a shaded copy of FastDoubleParser <https://github.com/wrandelshofer/FastDoubleParser>. +| That code is available under an MIT license <https://github.com/wrandelshofer/FastDoubleParser/blob/main/LICENSE> +| under the following copyright. +| +| Copyright © 2023 Werner Randelshofer, Switzerland. MIT License. 
+| +| See FastDoubleParser-NOTICE for details of other source code included in FastDoubleParser +| and the licenses and copyrights that apply to that code. -------------------------------------------------------------------------------- diff --git a/azure-bundle/build.gradle b/azure-bundle/build.gradle index dad563b67ab7..fde8adbfc539 100644 --- a/azure-bundle/build.gradle +++ b/azure-bundle/build.gradle @@ -23,6 +23,12 @@ project(":iceberg-azure-bundle") { tasks.jar.dependsOn tasks.shadowJar + configurations { + implementation { + exclude group: 'org.slf4j' + } + } + dependencies { implementation platform(libs.azuresdk.bom) implementation "com.azure:azure-storage-file-datalake" @@ -40,10 +46,6 @@ project(":iceberg-azure-bundle") { include 'NOTICE' } - dependencies { - exclude(dependency('org.slf4j:slf4j-api')) - } - // relocate Azure-specific versions relocate 'io.netty', 'org.apache.iceberg.azure.shaded.io.netty' relocate 'com.fasterxml.jackson', 'org.apache.iceberg.azure.shaded.com.fasterxml.jackson' diff --git a/azure-bundle/runtime-deps.txt b/azure-bundle/runtime-deps.txt index 273146654a31..2e5198f49842 100644 --- a/azure-bundle/runtime-deps.txt +++ b/azure-bundle/runtime-deps.txt @@ -41,4 +41,3 @@ io.projectreactor:reactor-core:3.7.14 net.java.dev.jna:jna-platform:5.17.0 net.java.dev.jna:jna:5.17.0 org.reactivestreams:reactive-streams:1.0.4 -org.slf4j:slf4j-api:2.0.17 From e4028bf6a012691933441387a893eb921a780438 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Wed, 6 May 2026 17:13:07 -0400 Subject: [PATCH 171/197] GCP: Fix LICENSE, NOTICE, and runtime-deps for gcp-bundle (#16182) --- gcp-bundle/LICENSE | 1439 +++++++++++++++++++++++++++++++++-- gcp-bundle/NOTICE | 139 ++-- gcp-bundle/build.gradle | 5 +- gcp-bundle/runtime-deps.txt | 1 - 4 files changed, 1455 insertions(+), 129 deletions(-) diff --git a/gcp-bundle/LICENSE b/gcp-bundle/LICENSE index 10c87d69c720..54822a830a72 100644 --- a/gcp-bundle/LICENSE +++ b/gcp-bundle/LICENSE @@ -210,6 +210,94 @@ 
License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- +This product bundles FastDoubleParser (via Jackson JSON Processor). + +Copyright: 2023 Werner Randelshofer, Switzerland +Project URL: https://github.com/wrandelshofer/FastDoubleParser +License: MIT + +| Copyright (c) 2023 Werner Randelshofer, Switzerland +| +| Permission is hereby granted, free of charge, to any person obtaining a copy +| of this software and associated documentation files (the "Software"), to deal +| in the Software without restriction, including without limitation the rights +| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +| copies of the Software, and to permit persons to whom the Software is +| furnished to do so, subject to the following conditions: +| +| The above copyright notice and this permission notice shall be included in all +| copies or substantial portions of the Software. +| +| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +| SOFTWARE. + +-------------------------------------------------------------------------------- + +This product bundles fast_float (bundled by FastDoubleParser). 
+ +Copyright: 2021 The fast_float authors +Project URL: https://github.com/fastfloat/fast_float +License: MIT + +| Copyright (c) 2021 The fast_float authors +| +| Permission is hereby granted, free of charge, to any person obtaining a copy +| of this software and associated documentation files (the "Software"), to deal +| in the Software without restriction, including without limitation the rights +| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +| copies of the Software, and to permit persons to whom the Software is +| furnished to do so, subject to the following conditions: +| +| The above copyright notice and this permission notice shall be included in all +| copies or substantial portions of the Software. +| +| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +| SOFTWARE. + +-------------------------------------------------------------------------------- + +This product bundles bigint (bundled by FastDoubleParser). + +Copyright: 2022 Tim Buktu +Project URL: https://github.com/tbuktu/bigint +License: BSD 2-Clause + +| Copyright 2022 Tim Buktu +| +| Redistribution and use in source and binary forms, with or without +| modification, are permitted provided that the following conditions +| are met: +| +| 1. Redistributions of source code must retain the above copyright notice, this +| list of conditions and the following disclaimer. +| +| 2. 
Redistributions in binary form must reproduce the above copyright notice, +| this list of conditions and the following disclaimer in the documentation +| and/or other materials provided with the distribution. +| +| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +| ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +| WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +| DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +| FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +-------------------------------------------------------------------------------- + This product bundles Android Annotations. Project URL: http://source.android.com/ @@ -219,12 +307,14 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 This product bundles Google API Common. +Project URL: https://github.com/googleapis/api-common-java License: BSD 3-Clause + | Copyright 2016, Google Inc. | Redistribution and use in source and binary forms, with or without | modification, are permitted provided that the following conditions are | met: -| +| | * Redistributions of source code must retain the above copyright | notice, this list of conditions and the following disclaimer. | * Redistributions in binary form must reproduce the above @@ -234,7 +324,7 @@ License: BSD 3-Clause | * Neither the name of Google Inc. nor the names of its | contributors may be used to endorse or promote products derived from | this software without specific prior written permission. 
-| +| | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -253,12 +343,13 @@ This product bundles Google GAX. Project URL: https://github.com/googleapis/gax-java License: BSD 3-Clause + | Copyright 2016, Google Inc. All rights reserved. -| +| | Redistribution and use in source and binary forms, with or without | modification, are permitted provided that the following conditions are | met: -| +| | * Redistributions of source code must retain the above copyright | notice, this list of conditions and the following disclaimer. | * Redistributions in binary form must reproduce the above @@ -268,7 +359,7 @@ License: BSD 3-Clause | * Neither the name of Google Inc. nor the names of its | contributors may be used to endorse or promote products derived from | this software without specific prior written permission. -| +| | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -313,24 +404,26 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 This product bundles Google Auth Library. +Project URL: https://github.com/googleapis/google-auth-library-java License: BSD 3-Clause + | Copyright 2014, Google Inc. All rights reserved. -| +| | Redistribution and use in source and binary forms, with or without | modification, are permitted provided that the following conditions are | met: -| +| | * Redistributions of source code must retain the above copyright | notice, this list of conditions and the following disclaimer. | * Redistributions in binary form must reproduce the above | copyright notice, this list of conditions and the following disclaimer | in the documentation and/or other materials provided with the | distribution. 
-| +| | * Neither the name of Google Inc. nor the names of its | contributors may be used to endorse or promote products derived from | this software without specific prior written permission. -| +| | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -366,7 +459,7 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Google Cloud Open-Telemetry Operations Exporters for Java +This product bundles Google Cloud Open-Telemetry Operations Exporters for Java. Project URL: https://github.com/GoogleCloudPlatform/opentelemetry-operations-java License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 @@ -401,6 +494,387 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- +This product bundles the Mozilla Public Suffix List (via Google Guava). + +Project URL: https://publicsuffix.org/ +License: Mozilla Public License, Version 2.0 - https://mozilla.org/MPL/2.0/ + +| Mozilla Public License Version 2.0 +| ================================== +| +| 1. Definitions +| -------------- +| +| 1.1. "Contributor" +| means each individual or legal entity that creates, contributes to +| the creation of, or owns Covered Software. +| +| 1.2. "Contributor Version" +| means the combination of the Contributions of others (if any) used +| by a Contributor and that particular Contributor's Contribution. +| +| 1.3. "Contribution" +| means Covered Software of a particular Contributor. +| +| 1.4. 
"Covered Software" +| means Source Code Form to which the initial Contributor has attached +| the notice in Exhibit A, the Executable Form of such Source Code +| Form, and Modifications of such Source Code Form, in each case +| including portions thereof. +| +| 1.5. "Incompatible With Secondary Licenses" +| means +| +| (a) that the initial Contributor has attached the notice described +| in Exhibit B to the Covered Software; or +| +| (b) that the Covered Software was made available under the terms of +| version 1.1 or earlier of the License, but not also under the +| terms of a Secondary License. +| +| 1.6. "Executable Form" +| means any form of the work other than Source Code Form. +| +| 1.7. "Larger Work" +| means a work that combines Covered Software with other material, in +| a separate file or files, that is not Covered Software. +| +| 1.8. "License" +| means this document. +| +| 1.9. "Licensable" +| means having the right to grant, to the maximum extent possible, +| whether at the time of the initial grant or subsequently, any and +| all of the rights conveyed by this License. +| +| 1.10. "Modifications" +| means any of the following: +| +| (a) any file in Source Code Form that results from an addition to, +| deletion from, or modification of the contents of Covered +| Software; or +| +| (b) any new file in Source Code Form that contains any Covered +| Software. +| +| 1.11. "Patent Claims" of a Contributor +| means any patent claim(s), including without limitation, method, +| process, and apparatus claims, in any patent Licensable by such +| Contributor that would be infringed, but for the grant of the +| License, by the making, using, selling, offering for sale, having +| made, import, or transfer of either its Contributions or its +| Contributor Version. +| +| 1.12. 
"Secondary License" +| means either the GNU General Public License, Version 2.0, the GNU +| Lesser General Public License, Version 2.1, the GNU Affero General +| Public License, Version 3.0, or any later versions of those +| licenses. +| +| 1.13. "Source Code Form" +| means the form of the work preferred for making modifications. +| +| 1.14. "You" (or "Your") +| means an individual or a legal entity exercising rights under this +| License. For legal entities, "You" includes any entity that +| controls, is controlled by, or is under common control with You. For +| purposes of this definition, "control" means (a) the power, direct +| or indirect, to cause the direction or management of such entity, +| whether by contract or otherwise, or (b) ownership of more than +| fifty percent (50%) of the outstanding shares or beneficial +| ownership of such entity. +| +| 2. License Grants and Conditions +| -------------------------------- +| +| 2.1. Grants +| +| Each Contributor hereby grants You a world-wide, royalty-free, +| non-exclusive license: +| +| (a) under intellectual property rights (other than patent or trademark) +| Licensable by such Contributor to use, reproduce, make available, +| modify, display, perform, distribute, and otherwise exploit its +| Contributions, either on an unmodified basis, with Modifications, or +| as part of a Larger Work; and +| +| (b) under Patent Claims of such Contributor to make, use, sell, offer +| for sale, have made, import, and otherwise transfer either its +| Contributions or its Contributor Version. +| +| 2.2. Effective Date +| +| The licenses granted in Section 2.1 with respect to any Contribution +| become effective for each Contribution on the date the Contributor first +| distributes such Contribution. +| +| 2.3. Limitations on Grant Scope +| +| The licenses granted in this Section 2 are the only rights granted under +| this License. 
No additional rights or licenses will be implied from the +| distribution or licensing of Covered Software under this License. +| Notwithstanding Section 2.1(b) above, no patent license is granted by a +| Contributor: +| +| (a) for any code that a Contributor has removed from Covered Software; +| or +| +| (b) for infringements caused by: (i) Your and any other third party's +| modifications of Covered Software, or (ii) the combination of its +| Contributions with other software (except as part of its Contributor +| Version); or +| +| (c) under Patent Claims infringed by Covered Software in the absence of +| its Contributions. +| +| This License does not grant any rights in the trademarks, service marks, +| or logos of any Contributor (except as may be necessary to comply with +| the notice requirements in Section 3.4). +| +| 2.4. Subsequent Licenses +| +| No Contributor makes additional grants as a result of Your choice to +| distribute the Covered Software under a subsequent version of this +| License (see Section 10.2) or under the terms of a Secondary License (if +| permitted under the terms of Section 3.3). +| +| 2.5. Representation +| +| Each Contributor represents that the Contributor believes its +| Contributions are its original creation(s) or it has sufficient rights +| to grant the rights to its Contributions conveyed by this License. +| +| 2.6. Fair Use +| +| This License is not intended to limit any rights You have under +| applicable copyright doctrines of fair use, fair dealing, or other +| equivalents. +| +| 2.7. Conditions +| +| Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted +| in Section 2.1. +| +| 3. Responsibilities +| ------------------- +| +| 3.1. Distribution of Source Form +| +| All distribution of Covered Software in Source Code Form, including any +| Modifications that You create or to which You contribute, must be under +| the terms of this License. 
You must inform recipients that the Source +| Code Form of the Covered Software is governed by the terms of this +| License, and how they can obtain a copy of this License. You may not +| attempt to alter or restrict the recipients' rights in the Source Code +| Form. +| +| 3.2. Distribution of Executable Form +| +| If You distribute Covered Software in Executable Form then: +| +| (a) such Covered Software must also be made available in Source Code +| Form, as described in Section 3.1, and You must inform recipients of +| the Executable Form how they can obtain a copy of such Source Code +| Form by reasonable means in a timely manner, at a charge no more +| than the cost of distribution to the recipient; and +| +| (b) You may distribute such Executable Form under the terms of this +| License, or sublicense it under different terms, provided that the +| license for the Executable Form does not attempt to limit or alter +| the recipients' rights in the Source Code Form under this License. +| +| 3.3. Distribution of a Larger Work +| +| You may create and distribute a Larger Work under terms of Your choice, +| provided that You also comply with the requirements of this License for +| the Covered Software. If the Larger Work is a combination of Covered +| Software with a work governed by one or more Secondary Licenses, and the +| Covered Software is not Incompatible With Secondary Licenses, this +| License permits You to additionally distribute such Covered Software +| under the terms of such Secondary License(s), so that the recipient of +| the Larger Work may, at their option, further distribute the Covered +| Software under the terms of either this License or such Secondary +| License(s). +| +| 3.4. 
Notices +| +| You may not remove or alter the substance of any license notices +| (including copyright notices, patent notices, disclaimers of warranty, +| or limitations of liability) contained within the Source Code Form of +| the Covered Software, except that You may alter any license notices to +| the extent required to remedy known factual inaccuracies. +| +| 3.5. Application of Additional Terms +| +| You may choose to offer, and to charge a fee for, warranty, support, +| indemnity or liability obligations to one or more recipients of Covered +| Software. However, You may do so only on Your own behalf, and not on +| behalf of any Contributor. You must make it absolutely clear that any +| such warranty, support, indemnity, or liability obligation is offered by +| You alone, and You hereby agree to indemnify every Contributor for any +| liability incurred by such Contributor as a result of warranty, support, +| indemnity or liability terms You offer. You may include additional +| disclaimers of warranty and limitations of liability specific to any +| jurisdiction. +| +| 4. Inability to Comply Due to Statute or Regulation +| --------------------------------------------------- +| +| If it is impossible for You to comply with any of the terms of this +| License with respect to some or all of the Covered Software due to +| statute, judicial order, or regulation then You must: (a) comply with +| the terms of this License to the maximum extent possible; and (b) +| describe the limitations and the code they affect. Such description must +| be placed in a text file included with all distributions of the Covered +| Software under this License. Except to the extent prohibited by statute +| or regulation, such description must be sufficiently detailed for a +| recipient of ordinary skill to be able to understand it. +| +| 5. Termination +| -------------- +| +| 5.1. 
The rights granted under this License will terminate automatically +| if You fail to comply with any of its terms. However, if You become +| compliant, then the rights granted under this License from a particular +| Contributor are reinstated (a) provisionally, unless and until such +| Contributor explicitly and finally terminates Your grants, and (b) on an +| ongoing basis, if such Contributor fails to notify You of the +| non-compliance by some reasonable means prior to 60 days after You have +| come back into compliance. Moreover, Your grants from a particular +| Contributor are reinstated on an ongoing basis if such Contributor +| notifies You of the non-compliance by some reasonable means, this is the +| first time You have received notice of non-compliance with this License +| from such Contributor, and You become compliant prior to 30 days after +| Your receipt of the notice. +| +| 5.2. If You initiate litigation against any entity by asserting a patent +| infringement claim (excluding declaratory judgment actions, +| counter-claims, and cross-claims) alleging that a Contributor Version +| directly or indirectly infringes any patent, then the rights granted to +| You by any and all Contributors for the Covered Software under Section +| 2.1 of this License shall terminate. +| +| 5.3. In the event of termination under Sections 5.1 or 5.2 above, all +| end user license agreements (excluding distributors and resellers) which +| have been validly granted by You or Your distributors under this License +| prior to termination shall survive termination. +| +| ************************************************************************ +| * * +| * 6. 
Disclaimer of Warranty * +| * ------------------------- * +| * * +| * Covered Software is provided under this License on an "as is" * +| * basis, without warranty of any kind, either expressed, implied, or * +| * statutory, including, without limitation, warranties that the * +| * Covered Software is free of defects, merchantable, fit for a * +| * particular purpose or non-infringing. The entire risk as to the * +| * quality and performance of the Covered Software is with You. * +| * Should any Covered Software prove defective in any respect, You * +| * (not any Contributor) assume the cost of any necessary servicing, * +| * repair, or correction. This disclaimer of warranty constitutes an * +| * essential part of this License. No use of any Covered Software is * +| * authorized under this License except under this disclaimer. * +| * * +| ************************************************************************ +| +| ************************************************************************ +| * * +| * 7. Limitation of Liability * +| * -------------------------- * +| * * +| * Under no circumstances and under no legal theory, whether tort * +| * (including negligence), contract, or otherwise, shall any * +| * Contributor, or anyone who distributes Covered Software as * +| * permitted above, be liable to You for any direct, indirect, * +| * special, incidental, or consequential damages of any character * +| * including, without limitation, damages for lost profits, loss of * +| * goodwill, work stoppage, computer failure or malfunction, or any * +| * and all other commercial damages or losses, even if such party * +| * shall have been informed of the possibility of such damages. This * +| * limitation of liability shall not apply to liability for death or * +| * personal injury resulting from such party's negligence to the * +| * extent applicable law prohibits such limitation. 
Some * +| * jurisdictions do not allow the exclusion or limitation of * +| * incidental or consequential damages, so this exclusion and * +| * limitation may not apply to You. * +| * * +| ************************************************************************ +| +| 8. Litigation +| ------------- +| +| Any litigation relating to this License may be brought only in the +| courts of a jurisdiction where the defendant maintains its principal +| place of business and such litigation shall be governed by laws of that +| jurisdiction, without reference to its conflict-of-law provisions. +| Nothing in this Section shall prevent a party's ability to bring +| cross-claims or counter-claims. +| +| 9. Miscellaneous +| ---------------- +| +| This License represents the complete agreement concerning the subject +| matter hereof. If any provision of this License is held to be +| unenforceable, such provision shall be reformed only to the extent +| necessary to make it enforceable. Any law or regulation which provides +| that the language of a contract shall be construed against the drafter +| shall not be used to construe this License against a Contributor. +| +| 10. Versions of the License +| --------------------------- +| +| 10.1. New Versions +| +| Mozilla Foundation is the license steward. Except as provided in Section +| 10.3, no one other than the license steward has the right to modify or +| publish new versions of this License. Each version will be given a +| distinguishing version number. +| +| 10.2. Effect of New Versions +| +| You may distribute the Covered Software under the terms of the version +| of the License under which You originally received the Covered Software, +| or under the terms of any subsequent version published by the license +| steward. +| +| 10.3. 
Modified Versions +| +| If you create software not governed by this License, and you want to +| create a new license for such software, you may create and use a +| modified version of this License if you rename the license and remove +| any references to the name of the license steward (except to note that +| such modified license differs from this License). +| +| 10.4. Distributing Source Code Form that is Incompatible With Secondary +| Licenses +| +| If You choose to distribute Source Code Form that is Incompatible With +| Secondary Licenses under the terms of this version of the License, the +| notice described in Exhibit B of this License must be attached. +| +| Exhibit A - Source Code Form License Notice +| ------------------------------------------- +| +| This Source Code Form is subject to the terms of the Mozilla Public +| License, v. 2.0. If a copy of the MPL was not distributed with this +| file, You can obtain one at http://mozilla.org/MPL/2.0/. +| +| If it is not possible or desirable to put the notice in a particular +| file, then You may include the notice in a location (such as a LICENSE +| file in a relevant directory) where a recipient would be likely to look +| for such a notice. +| +| You may add additional accurate notices of copyright ownership. +| +| Exhibit B - "Incompatible With Secondary Licenses" Notice +| --------------------------------------------------------- +| +| This Source Code Form is "Incompatible With Secondary Licenses", as +| defined by the Mozilla Public License, v. 2.0. + +-------------------------------------------------------------------------------- + This product bundles Google Http Client. Project URL: https://www.google.com/ @@ -426,12 +900,13 @@ This product bundles Google protobuf. Project URL: https://developers.google.com/protocol-buffers/ License: BSD 3-Clause + | Copyright 2008 Google Inc. All rights reserved. 
-| +| | Redistribution and use in source and binary forms, with or without | modification, are permitted provided that the following conditions are | met: -| +| | * Redistributions of source code must retain the above copyright | notice, this list of conditions and the following disclaimer. | * Redistributions in binary form must reproduce the above @@ -441,7 +916,7 @@ License: BSD 3-Clause | * Neither the name of Google Inc. nor the names of its | contributors may be used to endorse or promote products derived from | this software without specific prior written permission. -| +| | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -453,7 +928,7 @@ License: BSD 3-Clause | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -| +| | Code generated by the Protocol Buffer compiler is owned by the owner | of the input file used when generating it. This code is not | standalone and requires a support library to be linked with it. This @@ -465,27 +940,28 @@ This product bundles Google re2j. Project URL: http://github.com/google/re2j License: Go License + | This is a work derived from Russ Cox's RE2 in Go, whose license | http://golang.org/LICENSE is as follows: -| +| | Copyright (c) 2009 The Go Authors. All rights reserved. -| +| | Redistribution and use in source and binary forms, with or without | modification, are permitted provided that the following conditions are | met: -| +| | * Redistributions of source code must retain the above copyright | notice, this list of conditions and the following disclaimer. 
-| +| | * Redistributions in binary form must reproduce the above copyright | notice, this list of conditions and the following disclaimer in | the documentation and/or other materials provided with the | distribution. -| +| | * Neither the name of Google Inc. nor the names of its contributors | may be used to endorse or promote products derived from this | software without specific prior written permission. -| +| | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -528,6 +1004,56 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- +This product bundles Apache Tomcat Native (statically linked into netty-tcnative, bundled by gRPC-netty-shaded). + +Project URL: https://tomcat.apache.org/native-doc/ +License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +This product bundles BoringSSL (statically linked into netty-tcnative-boringssl-static, bundled by gRPC-netty-shaded). + +Project URL: https://boringssl.googlesource.com/boringssl/ +License: Apache License, Version 2.0 (with portions under the Go License - BSD 3-Clause) - https://boringssl.googlesource.com/boringssl/+/HEAD/LICENSE + +| +| Licenses for support code +| ------------------------- +| +| Parts of the TLS test suite are under the Go license. This code is not included +| in BoringSSL (i.e. libcrypto and libssl) when compiled, however, so +| distributing code linked against BoringSSL does not trigger this license: +| +| Copyright (c) 2009 The Go Authors. All rights reserved. 
+| +| Redistribution and use in source and binary forms, with or without +| modification, are permitted provided that the following conditions are +| met: +| +| * Redistributions of source code must retain the above copyright +| notice, this list of conditions and the following disclaimer. +| * Redistributions in binary form must reproduce the above +| copyright notice, this list of conditions and the following disclaimer +| in the documentation and/or other materials provided with the +| distribution. +| * Neither the name of Google Inc. nor the names of its +| contributors may be used to endorse or promote products derived from +| this software without specific prior written permission. +| +| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +| OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +| SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +| LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +| DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +| THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +| (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +-------------------------------------------------------------------------------- + This product bundles OpenCensus. Project URL: https://github.com/census-instrumentation/opencensus-java @@ -546,14 +1072,366 @@ This product bundles javax.annotation-api. 
Project URL: https://javaee.github.io/glassfish Project URL: http://jcp.org/en/jsr/detail?id=250 -License: CDDL - https://github.com/javaee/javax.annotation/blob/master/LICENSE +License: CDDL 1.1 - https://github.com/javaee/javax.annotation/blob/master/LICENSE + +| COMMON DEVELOPMENT AND DISTRIBUTION LICENSE (CDDL) Version 1.1 +| +| 1. Definitions. +| +| 1.1. "Contributor" means each individual or entity that creates or +| contributes to the creation of Modifications. +| +| 1.2. "Contributor Version" means the combination of the Original +| Software, prior Modifications used by a Contributor (if any), and +| the Modifications made by that particular Contributor. +| +| 1.3. "Covered Software" means (a) the Original Software, or (b) +| Modifications, or (c) the combination of files containing Original +| Software with files containing Modifications, in each case including +| portions thereof. +| +| 1.4. "Executable" means the Covered Software in any form other than +| Source Code. +| +| 1.5. "Initial Developer" means the individual or entity that first +| makes Original Software available under this License. +| +| 1.6. "Larger Work" means a work which combines Covered Software or +| portions thereof with code not governed by the terms of this License. +| +| 1.7. "License" means this document. +| +| 1.8. "Licensable" means having the right to grant, to the maximum +| extent possible, whether at the time of the initial grant or +| subsequently acquired, any and all of the rights conveyed herein. +| +| 1.9. "Modifications" means the Source Code and Executable form of +| any of the following: +| +| A. Any file that results from an addition to, deletion from or +| modification of the contents of a file containing Original Software +| or previous Modifications; +| +| B. Any new file that contains any part of the Original Software or +| previous Modification; or +| +| C. Any new file that is contributed or otherwise made available +| under the terms of this License. 
+| +| 1.10. "Original Software" means the Source Code and Executable form +| of computer software code that is originally released under this +| License. +| +| 1.11. "Patent Claims" means any patent claim(s), now owned or +| hereafter acquired, including without limitation, method, process, +| and apparatus claims, in any patent Licensable by grantor. +| +| 1.12. "Source Code" means (a) the common form of computer software +| code in which modifications are made and (b) associated +| documentation included in or with such code. +| +| 1.13. "You" (or "Your") means an individual or a legal entity +| exercising rights under, and complying with all of the terms of, +| this License. For legal entities, "You" includes any entity which +| controls, is controlled by, or is under common control with You. +| For purposes of this definition, "control" means (a) the power, +| direct or indirect, to cause the direction or management of such +| entity, whether by contract or otherwise, or (b) ownership of more +| than fifty percent (50%) of the outstanding shares or beneficial +| ownership of such entity. +| +| 2. License Grants. +| +| 2.1. The Initial Developer Grant. +| +| Conditioned upon Your compliance with Section 3.1 below and subject +| to third party intellectual property claims, the Initial Developer +| hereby grants You a world-wide, royalty-free, non-exclusive license: +| +| (a) under intellectual property rights (other than patent or +| trademark) Licensable by Initial Developer, to use, reproduce, +| modify, display, perform, sublicense and distribute the Original +| Software (or portions thereof), with or without Modifications, +| and/or as part of a Larger Work; and +| +| (b) under Patent Claims infringed by the making, using or selling +| of Original Software, to make, have made, use, practice, sell, and +| offer for sale, and/or otherwise dispose of the Original Software +| (or portions thereof). 
+| +| (c) The licenses granted in Sections 2.1(a) and (b) are effective +| on the date Initial Developer first distributes or otherwise makes +| the Original Software available to a third party under the terms of +| this License. +| +| (d) Notwithstanding Section 2.1(b) above, no patent license is +| granted: (1) for code that You delete from the Original Software, +| or (2) for infringements caused by: (i) the modification of the +| Original Software, or (ii) the combination of the Original Software +| with other software or devices. +| +| 2.2. Contributor Grant. +| +| Conditioned upon Your compliance with Section 3.1 below and subject +| to third party intellectual property claims, each Contributor hereby +| grants You a world-wide, royalty-free, non-exclusive license: +| +| (a) under intellectual property rights (other than patent or +| trademark) Licensable by Contributor to use, reproduce, modify, +| display, perform, sublicense and distribute the Modifications +| created by such Contributor (or portions thereof), either on an +| unmodified basis, with other Modifications, as Covered Software +| and/or as part of a Larger Work; and +| +| (b) under Patent Claims infringed by the making, using, or selling +| of Modifications made by that Contributor either alone and/or in +| combination with its Contributor Version (or portions of such +| combination), to make, use, sell, offer for sale, have made, and/or +| otherwise dispose of: (1) Modifications made by that Contributor +| (or portions thereof); and (2) the combination of Modifications +| made by that Contributor with its Contributor Version (or portions +| of such combination). +| +| (c) The licenses granted in Sections 2.2(a) and 2.2(b) are +| effective on the date Contributor first makes Commercial Use of the +| Covered Software. 
+| +| (d) Notwithstanding Section 2.2(b) above, no patent license is +| granted: (1) for any code that Contributor has deleted from the +| Contributor Version; (2) for infringements caused by: (i) third +| party modifications of Contributor Version, or (ii) the combination +| of Modifications made by that Contributor with other software +| (except as part of the Contributor Version) or other devices; or +| (3) under Patent Claims infringed by Covered Software in the +| absence of Modifications made by that Contributor. +| +| 3. Distribution Obligations. +| +| 3.1. Availability of Source Code. +| +| Any Covered Software that You distribute or otherwise make available +| in Executable form must also be made available in Source Code form +| and that Source Code form must be distributed only under the terms +| of this License. You must include a copy of this License with every +| copy of the Source Code form of the Covered Software You distribute +| or otherwise make available. You must inform recipients of any such +| Covered Software in Executable form as to how they can obtain such +| Covered Software in Source Code form in a reasonable manner on or +| through a medium customarily used for software exchange. +| +| 3.2. Modifications. +| +| The Modifications that You create or to which You contribute are +| governed by the terms of this License. You represent that You +| believe Your Modifications are Your original creation(s) and/or You +| have sufficient rights to grant the rights conveyed by this License. +| +| 3.3. Required Notices. +| +| You must include a notice in each of Your Modifications that +| identifies You as the Contributor of the Modification. You may not +| remove or alter any copyright, patent or trademark notices contained +| within the Covered Software, or any notices of licensing or any +| descriptive text giving attribution to any Contributor or the +| Initial Developer. +| +| 3.4. Application of Additional Terms. 
+| +| You may not offer or impose any terms on any Covered Software in +| Source Code form that alters or restricts the applicable version of +| this License or the recipients' rights hereunder. You may choose to +| offer, and to charge a fee for, warranty, support, indemnity or +| liability obligations to one or more recipients of Covered Software. +| However, you may do so only on Your own behalf, and not on behalf of +| the Initial Developer or any Contributor. You must make it absolutely +| clear that any such warranty, support, indemnity or liability +| obligation is offered by You alone, and You hereby agree to indemnify +| the Initial Developer and every Contributor for any liability +| incurred by the Initial Developer or such Contributor as a result of +| warranty, support, indemnity or liability terms You offer. +| +| 3.5. Distribution of Executable Versions. +| +| You may distribute the Executable form of the Covered Software under +| the terms of this License or under the terms of a license of Your +| choice, which may contain terms different from this License, provided +| that You are in compliance with the terms of this License and that +| the license for the Executable form does not attempt to limit or +| alter the recipient's rights in the Source Code form from the rights +| set forth in this License. If You distribute the Covered Software in +| Executable form under a different license, You must make it +| absolutely clear that any terms which differ from this License are +| offered by You alone, not by the Initial Developer or Contributor. +| You hereby agree to indemnify the Initial Developer and every +| Contributor for any liability incurred by the Initial Developer or +| such Contributor as a result of any such terms You offer. +| +| 3.6. Larger Works. +| +| You may create a Larger Work by combining Covered Software with +| other code not governed by the terms of this License and distribute +| the Larger Work as a single product. 
In such a case, You must make +| sure the requirements of this License are fulfilled for the Covered +| Software. +| +| 4. Versions of the License. +| +| 4.1. New Versions. +| +| Oracle is the initial license steward and may publish revised and/or +| new versions of this License from time to time. Each version will be +| given a distinguishing version number. Except as provided in Section +| 4.3, no one other than the license steward has the right to modify +| this License. +| +| 4.2. Effect of New Versions. +| +| You may always continue to use, distribute or otherwise make the +| Covered Software available under the terms of the version of the +| License under which You originally received the Covered Software. If +| the Initial Developer includes a notice in the Original Software +| prohibiting it from being distributed or otherwise made available +| under any subsequent version of the License, You must distribute and +| make the Covered Software available under the terms of the version +| of the License under which You originally received the Covered +| Software. Otherwise, You may also choose to use, distribute or +| otherwise make the Covered Software available under the terms of any +| subsequent version of the License published by the license steward. +| +| 4.3. Modified Versions. +| +| When You are an Initial Developer and You want to create a new +| license for Your Original Software, You may create and use a +| modified version of this License if You: (a) rename the license and +| remove any references to the name of the license steward (except to +| note that the modified license differs from this License); and (b) +| otherwise make it clear that the license contains terms which differ +| from this License. +| +| 5. DISCLAIMER OF WARRANTY. 
+| +| COVERED SOFTWARE IS PROVIDED UNDER THIS LICENSE ON AN "AS IS" BASIS, +| WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, +| INCLUDING, WITHOUT LIMITATION, WARRANTIES THAT THE COVERED SOFTWARE +| IS FREE OF DEFECTS, MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE OR +| NON-INFRINGING. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE +| OF THE COVERED SOFTWARE IS WITH YOU. SHOULD ANY COVERED SOFTWARE +| PROVE DEFECTIVE IN ANY RESPECT, YOU (NOT THE INITIAL DEVELOPER OR +| ANY OTHER CONTRIBUTOR) ASSUME THE COST OF ANY NECESSARY SERVICING, +| REPAIR OR CORRECTION. THIS DISCLAIMER OF WARRANTY CONSTITUTES AN +| ESSENTIAL PART OF THIS LICENSE. NO USE OF ANY COVERED SOFTWARE IS +| AUTHORIZED HEREUNDER EXCEPT UNDER THIS DISCLAIMER. +| +| 6. TERMINATION. +| +| 6.1. This License and the rights granted hereunder will terminate +| automatically if You fail to comply with terms herein and fail to +| cure such breach within 30 days of becoming aware of the breach. +| Provisions which, by their nature, must remain in effect beyond the +| termination of this License shall survive. +| +| 6.2. 
If You assert a patent infringement claim (excluding +| declaratory judgment actions) against Initial Developer or a +| Contributor (the Initial Developer or Contributor against whom You +| assert such claim is referred to as "Participant") alleging that the +| Participant Software (meaning the Contributor Version where the +| Participant is a Contributor or the Original Software where the +| Participant is the Initial Developer) directly or indirectly +| infringes any patent, then any and all rights granted directly or +| indirectly to You by such Participant, the Initial Developer (if the +| Initial Developer is not the Participant) and all Contributors under +| Sections 2.1 and/or 2.2 of this License shall, upon 60 days notice +| from Participant terminate prospectively and automatically at the +| expiration of such 60 day notice period, unless if within such 60 +| day period You withdraw Your claim with respect to the Participant +| Software against such Participant either unilaterally or pursuant to +| a written agreement with Participant. +| +| 6.3. If You assert a patent infringement claim against Participant +| alleging that the Participant Software directly or indirectly +| infringes any patent where such claim is resolved (such as by +| license or settlement) prior to the initiation of patent +| infringement litigation, then the reasonable value of the licenses +| granted by such Participant under Sections 2.1 or 2.2 shall be +| taken into account in determining the amount or value of any payment +| or license. +| +| 6.4. In the event of termination under Sections 6.1 or 6.2 above, +| all end user licenses that have been validly granted by You or any +| distributor hereunder prior to termination (excluding licenses +| granted to You by any distributor) shall survive termination. +| +| 7. LIMITATION OF LIABILITY. 
+| +| UNDER NO CIRCUMSTANCES AND UNDER NO LEGAL THEORY, WHETHER TORT +| (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE, SHALL YOU, THE +| INITIAL DEVELOPER, ANY OTHER CONTRIBUTOR, OR ANY DISTRIBUTOR OF +| COVERED SOFTWARE, OR ANY SUPPLIER OF ANY OF SUCH PARTIES, BE LIABLE +| TO ANY PERSON FOR ANY INDIRECT, SPECIAL, INCIDENTAL, OR +| CONSEQUENTIAL DAMAGES OF ANY CHARACTER INCLUDING, WITHOUT +| LIMITATION, DAMAGES FOR LOSS OF GOODWILL, WORK STOPPAGE, COMPUTER +| FAILURE OR MALFUNCTION, OR ANY AND ALL OTHER COMMERCIAL DAMAGES OR +| LOSSES, EVEN IF SUCH PARTY SHALL HAVE BEEN INFORMED OF THE +| POSSIBILITY OF SUCH DAMAGES. THIS LIMITATION OF LIABILITY SHALL NOT +| APPLY TO LIABILITY FOR DEATH OR PERSONAL INJURY RESULTING FROM SUCH +| PARTY'S NEGLIGENCE TO THE EXTENT APPLICABLE LAW PROHIBITS SUCH +| LIMITATION. SOME JURISDICTIONS DO NOT ALLOW THE EXCLUSION OR +| LIMITATION OF INCIDENTAL OR CONSEQUENTIAL DAMAGES, SO THIS EXCLUSION +| AND LIMITATION MAY NOT APPLY TO YOU. +| +| 8. U.S. GOVERNMENT END USERS. +| +| The Covered Software is a "commercial item," as that term is defined +| in 48 C.F.R. 2.101 (Oct. 1995), consisting of "commercial computer +| software" (as that term is defined at 48 C.F.R. 252.227-7014(a)(1)) +| and "commercial computer software documentation" as such terms are +| used in 48 C.F.R. 12.212 (Sept. 1995). Consistent with 48 C.F.R. +| 12.212 and 48 C.F.R. 227.7202-1 through 227.7202-4 (June 1995), all +| U.S. Government End Users acquire Covered Software with only those +| rights set forth herein. This U.S. Government Rights clause is in +| lieu of, and supersedes, any other FAR, DFAR, or other clause or +| provision that addresses Government rights in computer software +| under this License. +| +| 9. MISCELLANEOUS. +| +| This License represents the complete agreement concerning subject +| matter hereof. 
If any provision of this License is held to be +| unenforceable, such provision shall be reformed only to the extent +| necessary to make it enforceable. This License shall be governed by +| the law of the jurisdiction specified in a notice contained within +| the Original Software (except to the extent applicable law, if any, +| provides otherwise), excluding such jurisdiction's conflict-of-law +| provisions. Any litigation relating to this License shall be subject +| to the jurisdiction of the courts located in the jurisdiction and +| venue specified in a notice contained within the Original Software, +| with the losing party responsible for costs, including, without +| limitation, court costs and reasonable attorneys' fees and expenses. +| The application of the United Nations Convention on Contracts for +| the International Sale of Goods is expressly excluded. Any law or +| regulation which provides that the language of a contract shall be +| construed against the drafter shall not apply to this License. You +| agree that You alone are responsible for compliance with the United +| States export administration regulations (and the export control +| laws and regulation of any other countries) when You use, distribute +| or otherwise make available any Covered Software. +| +| 10. RESPONSIBILITY FOR CLAIMS. +| +| As between Initial Developer and the Contributors, each party is +| responsible for claims and damages arising, directly or indirectly, +| out of its utilization of rights under this License and You agree to +| work with Initial Developer and Contributors to distribute such +| responsibility on an equitable basis. Nothing herein is intended or +| shall be deemed to constitute any admission of liability. -------------------------------------------------------------------------------- -This product bundles checkerframework checker-qual. +This product bundles checkerframework checker-qual and checker-compat-qual. 
Project URL: https://checkerframework.org/ License: MIT + | The annotations are licensed under the MIT License. (The text of this | license appears below.) More specifically, all the parts of the Checker | Framework that you might want to include with your own program use the @@ -563,17 +1441,17 @@ License: MIT | In addition, the cleanroom implementations of third-party annotations, | which the Checker Framework recognizes as aliases for its own | annotations, are licensed under the MIT License. -| +| | Permission is hereby granted, free of charge, to any person obtaining a copy | of this software and associated documentation files (the "Software"), to deal | in the Software without restriction, including without limitation the rights | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | copies of the Software, and to permit persons to whom the Software is | furnished to do so, subject to the following conditions: -| +| | The above copyright notice and this permission notice shall be included in | all copies or substantial portions of the Software. -| +| | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -586,21 +1464,21 @@ License: MIT This product bundles Animal Sniffer Annotations. +Project URL: https://github.com/mojohaus/animal-sniffer License: MIT -| The MIT License -| + | Copyright (c) 2009 codehaus.org. 
-| +| | Permission is hereby granted, free of charge, to any person obtaining a copy | of this software and associated documentation files (the "Software"), to deal | in the Software without restriction, including without limitation the rights | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | copies of the Software, and to permit persons to whom the Software is | furnished to do so, subject to the following conditions: -| +| | The above copyright notice and this permission notice shall be included in | all copies or substantial portions of the Software. -| +| | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -622,24 +1500,25 @@ This product bundles ThreeTen BP. Project URL: https://www.threeten.org/threetenbp License: BSD 3-Clause + | Copyright (c) 2007-present, Stephen Colebourne & Michael Nascimento Santos. -| +| | All rights reserved. -| +| | Redistribution and use in source and binary forms, with or without | modification, are permitted provided that the following conditions are met: -| +| | * Redistributions of source code must retain the above copyright notice, | this list of conditions and the following disclaimer. -| +| | * Redistributions in binary form must reproduce the above copyright notice, | this list of conditions and the following disclaimer in the documentation | and/or other materials provided with the distribution. -| +| | * Neither the name of JSR-310 nor the names of its contributors | may be used to endorse or promote products derived from this software | without specific prior written permission. 
-| +| | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -684,26 +1563,27 @@ License: Public Domain - https://github.com/stleary/JSON-java/blob/master/LICENS This product bundles ThreeTen Extra. -Project URL: https://www.threeten.org/threeten-extra -License: BSD 3-clause +Project URL: https://www.threeten.org/threeten-extra +License: BSD 3-Clause + | Copyright (c) 2007-present, Stephen Colebourne & Michael Nascimento Santos. -| +| | All rights reserved. -| +| | * Redistribution and use in source and binary forms, with or without | modification, are permitted provided that the following conditions are met: -| +| | * Redistributions of source code must retain the above copyright notice, | this list of conditions and the following disclaimer. -| +| | * Redistributions in binary form must reproduce the above copyright notice, | this list of conditions and the following disclaimer in the documentation | and/or other materials provided with the distribution. -| +| | * Neither the name of JSR-310 nor the names of its contributors | may be used to endorse or promote products derived from this software | without specific prior written permission. -| +| | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -725,6 +1605,27 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- +This product bundles the Common Expression Language (CEL) specification (shaded by gRPC-xds). 
+ +Project URL: https://github.com/google/cel-spec +License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +This product bundles xDS data plane API definitions (shaded by gRPC-xds). + +Project URL: https://github.com/cncf/xds +License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +This product bundles UDPA (Universal Data Plane API) definitions (shaded by gRPC-xds). + +Project URL: https://github.com/cncf/udpa +License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + This product bundles JSpecify. Project URL: https://github.com/jspecify/jspecify @@ -736,21 +1637,20 @@ This product bundles Stax2 API. Project URL: http://github.com/FasterXML/stax2-api License: BSD 2-Clause -| BSD 2-Clause License -| + | Copyright (c) 2008+, FasterXML, LLC | All rights reserved. -| +| | Redistribution and use in source and binary forms, with or without | modification, are permitted provided that the following conditions are met: -| +| | * Redistributions of source code must retain the above copyright notice, this | list of conditions and the following disclaimer. -| +| | * Redistributions in binary form must reproduce the above copyright notice, | this list of conditions and the following disclaimer in the documentation | and/or other materials provided with the distribution. 
-| +| | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE @@ -771,7 +1671,444 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles JCTools (via Netty). +This product bundles JCTools (via Netty and OpenTelemetry). Project URL: https://github.com/JCTools/JCTools License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +This product bundles WeakConcurrentMap (via OpenTelemetry). + +Copyright: 2014 Rafael Winterhalter +Project URL: https://github.com/raphw/weak-lock-free +License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +This product bundles MSV xsdlib (bundled by Woodstox). + +Project URL: https://github.com/xmlark/msv +License: BSD 3-Clause + +| Copyright 2001-2013 Oracle and/or its affiliates. All rights reserved. +| +| Redistribution and use in source and binary forms, with or without +| modification, are permitted provided that the following conditions are met: +| +| 1. Redistributions of source code must retain the above copyright notice, +| this list of conditions and the following disclaimer. +| +| 2. Redistributions in binary form must reproduce the above copyright notice, +| this list of conditions and the following disclaimer in the documentation +| and/or other materials provided with the distribution. +| +| 3. Neither the name of the copyright holder nor the names of its contributors +| may be used to endorse or promote products derived from this software +| without specific prior written permission. 
+| +| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +| SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +| INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +| CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +| ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +| POSSIBILITY OF SUCH DAMAGE. + +-------------------------------------------------------------------------------- + +This product bundles isorelax (bundled by Woodstox). + +Project URL: https://github.com/relaxng/jing-trang +License: CDDL 1.1 + +| COMMON DEVELOPMENT AND DISTRIBUTION LICENSE (CDDL) Version 1.1 +| +| 1. Definitions. +| +| 1.1. "Contributor" means each individual or entity that creates or +| contributes to the creation of Modifications. +| +| 1.2. "Contributor Version" means the combination of the Original +| Software, prior Modifications used by a Contributor (if any), and +| the Modifications made by that particular Contributor. +| +| 1.3. "Covered Software" means (a) the Original Software, or (b) +| Modifications, or (c) the combination of files containing Original +| Software with files containing Modifications, in each case including +| portions thereof. +| +| 1.4. "Executable" means the Covered Software in any form other than +| Source Code. +| +| 1.5. "Initial Developer" means the individual or entity that first +| makes Original Software available under this License. +| +| 1.6. 
"Larger Work" means a work which combines Covered Software or +| portions thereof with code not governed by the terms of this License. +| +| 1.7. "License" means this document. +| +| 1.8. "Licensable" means having the right to grant, to the maximum +| extent possible, whether at the time of the initial grant or +| subsequently acquired, any and all of the rights conveyed herein. +| +| 1.9. "Modifications" means the Source Code and Executable form of +| any of the following: +| +| A. Any file that results from an addition to, deletion from or +| modification of the contents of a file containing Original Software +| or previous Modifications; +| +| B. Any new file that contains any part of the Original Software or +| previous Modification; or +| +| C. Any new file that is contributed or otherwise made available +| under the terms of this License. +| +| 1.10. "Original Software" means the Source Code and Executable form +| of computer software code that is originally released under this +| License. +| +| 1.11. "Patent Claims" means any patent claim(s), now owned or +| hereafter acquired, including without limitation, method, process, +| and apparatus claims, in any patent Licensable by grantor. +| +| 1.12. "Source Code" means (a) the common form of computer software +| code in which modifications are made and (b) associated +| documentation included in or with such code. +| +| 1.13. "You" (or "Your") means an individual or a legal entity +| exercising rights under, and complying with all of the terms of, +| this License. For legal entities, "You" includes any entity which +| controls, is controlled by, or is under common control with You. +| For purposes of this definition, "control" means (a) the power, +| direct or indirect, to cause the direction or management of such +| entity, whether by contract or otherwise, or (b) ownership of more +| than fifty percent (50%) of the outstanding shares or beneficial +| ownership of such entity. +| +| 2. License Grants. 
+| +| 2.1. The Initial Developer Grant. +| +| Conditioned upon Your compliance with Section 3.1 below and subject +| to third party intellectual property claims, the Initial Developer +| hereby grants You a world-wide, royalty-free, non-exclusive license: +| +| (a) under intellectual property rights (other than patent or +| trademark) Licensable by Initial Developer, to use, reproduce, +| modify, display, perform, sublicense and distribute the Original +| Software (or portions thereof), with or without Modifications, +| and/or as part of a Larger Work; and +| +| (b) under Patent Claims infringed by the making, using or selling +| of Original Software, to make, have made, use, practice, sell, and +| offer for sale, and/or otherwise dispose of the Original Software +| (or portions thereof). +| +| (c) The licenses granted in Sections 2.1(a) and (b) are effective +| on the date Initial Developer first distributes or otherwise makes +| the Original Software available to a third party under the terms of +| this License. +| +| (d) Notwithstanding Section 2.1(b) above, no patent license is +| granted: (1) for code that You delete from the Original Software, +| or (2) for infringements caused by: (i) the modification of the +| Original Software, or (ii) the combination of the Original Software +| with other software or devices. +| +| 2.2. Contributor Grant. 
+| +| Conditioned upon Your compliance with Section 3.1 below and subject +| to third party intellectual property claims, each Contributor hereby +| grants You a world-wide, royalty-free, non-exclusive license: +| +| (a) under intellectual property rights (other than patent or +| trademark) Licensable by Contributor to use, reproduce, modify, +| display, perform, sublicense and distribute the Modifications +| created by such Contributor (or portions thereof), either on an +| unmodified basis, with other Modifications, as Covered Software +| and/or as part of a Larger Work; and +| +| (b) under Patent Claims infringed by the making, using, or selling +| of Modifications made by that Contributor either alone and/or in +| combination with its Contributor Version (or portions of such +| combination), to make, use, sell, offer for sale, have made, and/or +| otherwise dispose of: (1) Modifications made by that Contributor +| (or portions thereof); and (2) the combination of Modifications +| made by that Contributor with its Contributor Version (or portions +| of such combination). +| +| (c) The licenses granted in Sections 2.2(a) and 2.2(b) are +| effective on the date Contributor first makes Commercial Use of the +| Covered Software. +| +| (d) Notwithstanding Section 2.2(b) above, no patent license is +| granted: (1) for any code that Contributor has deleted from the +| Contributor Version; (2) for infringements caused by: (i) third +| party modifications of Contributor Version, or (ii) the combination +| of Modifications made by that Contributor with other software +| (except as part of the Contributor Version) or other devices; or +| (3) under Patent Claims infringed by Covered Software in the +| absence of Modifications made by that Contributor. +| +| 3. Distribution Obligations. +| +| 3.1. Availability of Source Code. 
+| +| Any Covered Software that You distribute or otherwise make available +| in Executable form must also be made available in Source Code form +| and that Source Code form must be distributed only under the terms +| of this License. You must include a copy of this License with every +| copy of the Source Code form of the Covered Software You distribute +| or otherwise make available. You must inform recipients of any such +| Covered Software in Executable form as to how they can obtain such +| Covered Software in Source Code form in a reasonable manner on or +| through a medium customarily used for software exchange. +| +| 3.2. Modifications. +| +| The Modifications that You create or to which You contribute are +| governed by the terms of this License. You represent that You +| believe Your Modifications are Your original creation(s) and/or You +| have sufficient rights to grant the rights conveyed by this License. +| +| 3.3. Required Notices. +| +| You must include a notice in each of Your Modifications that +| identifies You as the Contributor of the Modification. You may not +| remove or alter any copyright, patent or trademark notices contained +| within the Covered Software, or any notices of licensing or any +| descriptive text giving attribution to any Contributor or the +| Initial Developer. +| +| 3.4. Application of Additional Terms. +| +| You may not offer or impose any terms on any Covered Software in +| Source Code form that alters or restricts the applicable version of +| this License or the recipients' rights hereunder. You may choose to +| offer, and to charge a fee for, warranty, support, indemnity or +| liability obligations to one or more recipients of Covered Software. +| However, you may do so only on Your own behalf, and not on behalf of +| the Initial Developer or any Contributor. 
You must make it absolutely +| clear that any such warranty, support, indemnity or liability +| obligation is offered by You alone, and You hereby agree to indemnify +| the Initial Developer and every Contributor for any liability +| incurred by the Initial Developer or such Contributor as a result of +| warranty, support, indemnity or liability terms You offer. +| +| 3.5. Distribution of Executable Versions. +| +| You may distribute the Executable form of the Covered Software under +| the terms of this License or under the terms of a license of Your +| choice, which may contain terms different from this License, provided +| that You are in compliance with the terms of this License and that +| the license for the Executable form does not attempt to limit or +| alter the recipient's rights in the Source Code form from the rights +| set forth in this License. If You distribute the Covered Software in +| Executable form under a different license, You must make it +| absolutely clear that any terms which differ from this License are +| offered by You alone, not by the Initial Developer or Contributor. +| You hereby agree to indemnify the Initial Developer and every +| Contributor for any liability incurred by the Initial Developer or +| such Contributor as a result of any such terms You offer. +| +| 3.6. Larger Works. +| +| You may create a Larger Work by combining Covered Software with +| other code not governed by the terms of this License and distribute +| the Larger Work as a single product. In such a case, You must make +| sure the requirements of this License are fulfilled for the Covered +| Software. +| +| 4. Versions of the License. +| +| 4.1. New Versions. +| +| Oracle is the initial license steward and may publish revised and/or +| new versions of this License from time to time. Each version will be +| given a distinguishing version number. Except as provided in Section +| 4.3, no one other than the license steward has the right to modify +| this License. 
+| +| 4.2. Effect of New Versions. +| +| You may always continue to use, distribute or otherwise make the +| Covered Software available under the terms of the version of the +| License under which You originally received the Covered Software. If +| the Initial Developer includes a notice in the Original Software +| prohibiting it from being distributed or otherwise made available +| under any subsequent version of the License, You must distribute and +| make the Covered Software available under the terms of the version +| of the License under which You originally received the Covered +| Software. Otherwise, You may also choose to use, distribute or +| otherwise make the Covered Software available under the terms of any +| subsequent version of the License published by the license steward. +| +| 4.3. Modified Versions. +| +| When You are an Initial Developer and You want to create a new +| license for Your Original Software, You may create and use a +| modified version of this License if You: (a) rename the license and +| remove any references to the name of the license steward (except to +| note that the modified license differs from this License); and (b) +| otherwise make it clear that the license contains terms which differ +| from this License. +| +| 5. DISCLAIMER OF WARRANTY. +| +| COVERED SOFTWARE IS PROVIDED UNDER THIS LICENSE ON AN "AS IS" BASIS, +| WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, +| INCLUDING, WITHOUT LIMITATION, WARRANTIES THAT THE COVERED SOFTWARE +| IS FREE OF DEFECTS, MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE OR +| NON-INFRINGING. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE +| OF THE COVERED SOFTWARE IS WITH YOU. SHOULD ANY COVERED SOFTWARE +| PROVE DEFECTIVE IN ANY RESPECT, YOU (NOT THE INITIAL DEVELOPER OR +| ANY OTHER CONTRIBUTOR) ASSUME THE COST OF ANY NECESSARY SERVICING, +| REPAIR OR CORRECTION. THIS DISCLAIMER OF WARRANTY CONSTITUTES AN +| ESSENTIAL PART OF THIS LICENSE. 
NO USE OF ANY COVERED SOFTWARE IS +| AUTHORIZED HEREUNDER EXCEPT UNDER THIS DISCLAIMER. +| +| 6. TERMINATION. +| +| 6.1. This License and the rights granted hereunder will terminate +| automatically if You fail to comply with terms herein and fail to +| cure such breach within 30 days of becoming aware of the breach. +| Provisions which, by their nature, must remain in effect beyond the +| termination of this License shall survive. +| +| 6.2. If You assert a patent infringement claim (excluding +| declaratory judgment actions) against Initial Developer or a +| Contributor (the Initial Developer or Contributor against whom You +| assert such claim is referred to as "Participant") alleging that the +| Participant Software (meaning the Contributor Version where the +| Participant is a Contributor or the Original Software where the +| Participant is the Initial Developer) directly or indirectly +| infringes any patent, then any and all rights granted directly or +| indirectly to You by such Participant, the Initial Developer (if the +| Initial Developer is not the Participant) and all Contributors under +| Sections 2.1 and/or 2.2 of this License shall, upon 60 days notice +| from Participant terminate prospectively and automatically at the +| expiration of such 60 day notice period, unless if within such 60 +| day period You withdraw Your claim with respect to the Participant +| Software against such Participant either unilaterally or pursuant to +| a written agreement with Participant. +| +| 6.3. If You assert a patent infringement claim against Participant +| alleging that the Participant Software directly or indirectly +| infringes any patent where such claim is resolved (such as by +| license or settlement) prior to the initiation of patent +| infringement litigation, then the reasonable value of the licenses +| granted by such Participant under Sections 2.1 or 2.2 shall be +| taken into account in determining the amount or value of any payment +| or license. 
+| +| 6.4. In the event of termination under Sections 6.1 or 6.2 above, +| all end user licenses that have been validly granted by You or any +| distributor hereunder prior to termination (excluding licenses +| granted to You by any distributor) shall survive termination. +| +| 7. LIMITATION OF LIABILITY. +| +| UNDER NO CIRCUMSTANCES AND UNDER NO LEGAL THEORY, WHETHER TORT +| (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE, SHALL YOU, THE +| INITIAL DEVELOPER, ANY OTHER CONTRIBUTOR, OR ANY DISTRIBUTOR OF +| COVERED SOFTWARE, OR ANY SUPPLIER OF ANY OF SUCH PARTIES, BE LIABLE +| TO ANY PERSON FOR ANY INDIRECT, SPECIAL, INCIDENTAL, OR +| CONSEQUENTIAL DAMAGES OF ANY CHARACTER INCLUDING, WITHOUT +| LIMITATION, DAMAGES FOR LOSS OF GOODWILL, WORK STOPPAGE, COMPUTER +| FAILURE OR MALFUNCTION, OR ANY AND ALL OTHER COMMERCIAL DAMAGES OR +| LOSSES, EVEN IF SUCH PARTY SHALL HAVE BEEN INFORMED OF THE +| POSSIBILITY OF SUCH DAMAGES. THIS LIMITATION OF LIABILITY SHALL NOT +| APPLY TO LIABILITY FOR DEATH OR PERSONAL INJURY RESULTING FROM SUCH +| PARTY'S NEGLIGENCE TO THE EXTENT APPLICABLE LAW PROHIBITS SUCH +| LIMITATION. SOME JURISDICTIONS DO NOT ALLOW THE EXCLUSION OR +| LIMITATION OF INCIDENTAL OR CONSEQUENTIAL DAMAGES, SO THIS EXCLUSION +| AND LIMITATION MAY NOT APPLY TO YOU. +| +| 8. U.S. GOVERNMENT END USERS. +| +| The Covered Software is a "commercial item," as that term is defined +| in 48 C.F.R. 2.101 (Oct. 1995), consisting of "commercial computer +| software" (as that term is defined at 48 C.F.R. 252.227-7014(a)(1)) +| and "commercial computer software documentation" as such terms are +| used in 48 C.F.R. 12.212 (Sept. 1995). Consistent with 48 C.F.R. +| 12.212 and 48 C.F.R. 227.7202-1 through 227.7202-4 (June 1995), all +| U.S. Government End Users acquire Covered Software with only those +| rights set forth herein. This U.S. 
Government Rights clause is in +| lieu of, and supersedes, any other FAR, DFAR, or other clause or +| provision that addresses Government rights in computer software +| under this License. +| +| 9. MISCELLANEOUS. +| +| This License represents the complete agreement concerning subject +| matter hereof. If any provision of this License is held to be +| unenforceable, such provision shall be reformed only to the extent +| necessary to make it enforceable. This License shall be governed by +| the law of the jurisdiction specified in a notice contained within +| the Original Software (except to the extent applicable law, if any, +| provides otherwise), excluding such jurisdiction's conflict-of-law +| provisions. Any litigation relating to this License shall be subject +| to the jurisdiction of the courts located in the jurisdiction and +| venue specified in a notice contained within the Original Software, +| with the losing party responsible for costs, including, without +| limitation, court costs and reasonable attorneys' fees and expenses. +| The application of the United Nations Convention on Contracts for +| the International Sale of Goods is expressly excluded. Any law or +| regulation which provides that the language of a contract shall be +| construed against the drafter shall not apply to this License. You +| agree that You alone are responsible for compliance with the United +| States export administration regulations (and the export control +| laws and regulation of any other countries) when You use, distribute +| or otherwise make available any Covered Software. +| +| 10. RESPONSIBILITY FOR CLAIMS. +| +| As between Initial Developer and the Contributors, each party is +| responsible for claims and damages arising, directly or indirectly, +| out of its utilization of rights under this License and You agree to +| work with Initial Developer and Contributors to distribute such +| responsibility on an equitable basis. 
Nothing herein is intended or +| shall be deemed to constitute any admission of liability. + +-------------------------------------------------------------------------------- + +This product bundles RELAX NG Datatype API (bundled by Woodstox). + +Project URL: https://github.com/relaxng/relaxng-datatype-java +License: BSD 3-Clause + +| Copyright (c) 2018 Oracle and/or its affiliates. All rights reserved. +| +| Redistribution and use in source and binary forms, with or without +| modification, are permitted provided that the following conditions +| are met: +| +| - Redistributions of source code must retain the above copyright +| notice, this list of conditions and the following disclaimer. +| +| - Redistributions in binary form must reproduce the above copyright +| notice, this list of conditions and the following disclaimer in the +| documentation and/or other materials provided with the distribution. +| +| - Neither the name of the Eclipse Foundation, Inc. nor the names of its +| contributors may be used to endorse or promote products derived +| from this software without specific prior written permission. +| +| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +| IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +| THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +| PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +| CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +| EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +| PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +| PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +| LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +| NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +| SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/gcp-bundle/NOTICE b/gcp-bundle/NOTICE index 97eb794e3b72..98d13b8895d9 100644 --- a/gcp-bundle/NOTICE +++ b/gcp-bundle/NOTICE @@ -9,22 +9,37 @@ The Apache Software Foundation (http://www.apache.org/). This product bundles Jackson JSON Processor with the following in its NOTICE file: | # Jackson JSON processor -| +| | Jackson is a high-performance, Free/Open Source JSON processing library. | It was originally written by Tatu Saloranta (tatu.saloranta@iki.fi), and has | been in development since 2007. | It is currently developed by a community of developers. -| +| +| ## Copyright +| +| Copyright 2007-, Tatu Saloranta (tatu.saloranta@iki.fi) +| | ## Licensing -| +| | Jackson 2.x core and extension components are licensed under Apache License 2.0 | To find the details that apply to this artifact see the accompanying LICENSE file. -| +| | ## Credits -| +| | A list of contributors may be found from CREDITS(-2.x) file, which is included | in some artifacts (usually source distributions); but is always available | from the source code management (SCM) system project uses. +| +| ## FastDoubleParser +| +| jackson-core bundles a shaded copy of FastDoubleParser <https://github.com/wrandelshofer/FastDoubleParser>. +| That code is available under an MIT license <https://github.com/wrandelshofer/FastDoubleParser/blob/main/LICENSE> +| under the following copyright. +| +| Copyright © 2023 Werner Randelshofer, Switzerland. MIT License. +| +| See FastDoubleParser-NOTICE for details of other source code included in FastDoubleParser +| and the licenses and copyrights that apply to that code.
-------------------------------------------------------------------------------- @@ -32,51 +47,51 @@ This product bundles Netty with the following in its NOTICE file: | | The Netty Project | ================= -| +| | Please visit the Netty web site for more information: -| +| | * http://netty.io/ -| +| | Copyright 2016 The Netty Project -| +| | The Netty Project licenses this file to you under the Apache License, | version 2.0 (the "License"); you may not use this file except in compliance | with the License. You may obtain a copy of the License at: -| +| | http://www.apache.org/licenses/LICENSE-2.0 -| +| | Unless required by applicable law or agreed to in writing, software | distributed under the License is distributed on an "AS IS" BASIS, WITHOUT | WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the | License for the specific language governing permissions and limitations | under the License. -| +| | ------------------------------------------------------------------------------- | This product contains a forked and modified version of Tomcat Native -| +| | * LICENSE: | * license/LICENSE.tomcat-native.txt (Apache License 2.0) | * HOMEPAGE: | * http://tomcat.apache.org/native-doc/ | * https://svn.apache.org/repos/asf/tomcat/native/ -| +| | This product contains the Maven wrapper scripts from 'Maven Wrapper', that provides an easy way to ensure a user has everything necessary to run the Maven build. -| +| | * LICENSE: | * license/LICENSE.mvn-wrapper.txt (Apache License 2.0) | * HOMEPAGE: | * https://github.com/takari/maven-wrapper -| +| | This product contains small piece of code to support AIX, taken from netbsd. -| +| | * LICENSE: | * license/LICENSE.aix-netbsd.txt (OpenSSL License) | * HOMEPAGE: | * https://ftp.netbsd.org/pub/NetBSD/NetBSD-current/src/crypto/external/bsd/openssl/dist -| -| +| +| | This product contains code from boringssl. 
-| +| | * LICENSE (Combination ISC and OpenSSL license) | * license/LICENSE.boringssl.txt (Combination ISC and OpenSSL license) | * HOMEPAGE: @@ -86,36 +101,36 @@ This product bundles Netty with the following in its NOTICE file: This product bundles gRPC with the following in its NOTICE file: | Copyright 2014 The gRPC Authors -| +| | Licensed under the Apache License, Version 2.0 (the "License"); | you may not use this file except in compliance with the License. | You may obtain a copy of the License at -| +| | http://www.apache.org/licenses/LICENSE-2.0 -| +| | Unless required by applicable law or agreed to in writing, software | distributed under the License is distributed on an "AS IS" BASIS, | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | See the License for the specific language governing permissions and | limitations under the License. -| +| | ----------------------------------------------------------------------- -| +| | This product contains a modified portion of 'OkHttp', an open source | HTTP & SPDY client for Android and Java applications, which can be obtained | at: -| +| | * LICENSE: | * okhttp/third_party/okhttp/LICENSE (Apache License 2.0) | * HOMEPAGE: | * https://github.com/square/okhttp | * LOCATION_IN_GRPC: | * okhttp/third_party/okhttp -| +| | This product contains a modified portion of 'Envoy', an open source | cloud-native high-performance edge/middle/service proxy, which can be | obtained at: -| +| | * LICENSE: | * xds/third_party/envoy/LICENSE (Apache License 2.0) | * NOTICE: @@ -124,11 +139,11 @@ This product bundles gRPC with the following in its NOTICE file: | * https://www.envoyproxy.io | * LOCATION_IN_GRPC: | * xds/third_party/envoy -| +| | This product contains a modified portion of 'protoc-gen-validate (PGV)', | an open source protoc plugin to generate polyglot message validators, | which can be obtained at: -| +| | * LICENSE: | * xds/third_party/protoc-gen-validate/LICENSE (Apache License 2.0) | * NOTICE: 
@@ -137,10 +152,10 @@ This product bundles gRPC with the following in its NOTICE file: | * https://github.com/envoyproxy/protoc-gen-validate | * LOCATION_IN_GRPC: | * xds/third_party/protoc-gen-validate -| +| | This product contains a modified portion of 'udpa', | an open source universal data plane API, which can be obtained at: -| +| | * LICENSE: | * xds/third_party/udpa/LICENSE (Apache License 2.0) | * HOMEPAGE: @@ -152,41 +167,41 @@ This product bundles gRPC with the following in its NOTICE file: This product bundles Perfmark with the following in its NOTICE file: | Copyright 2019 Google LLC -| +| | Licensed under the Apache License, Version 2.0 (the "License"); | you may not use this file except in compliance with the License. | You may obtain a copy of the License at -| +| | http://www.apache.org/licenses/LICENSE-2.0 -| +| | Unless required by applicable law or agreed to in writing, software | distributed under the License is distributed on an "AS IS" BASIS, | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | See the License for the specific language governing permissions and | limitations under the License. 
-| +| | ----------------------------------------------------------------------- -| +| | This product contains a modified portion of 'Catapult', an open source -| Trace Event viewer for Chome, Linux, and Android applications, which can +| Trace Event viewer for Chome, Linux, and Android applications, which can | be obtained at: -| +| | * LICENSE: | * traceviewer/src/main/resources/io/perfmark/traceviewer/third_party/catapult/LICENSE (New BSD License) | * HOMEPAGE: | * https://github.com/catapult-project/catapult -| +| | This product contains a modified portion of 'Polymer', a library for Web | Components, which can be obtained at: | * LICENSE: | * traceviewer/src/main/resources/io/perfmark/traceviewer/third_party/polymer/LICENSE (New BSD License) | * HOMEPAGE: | * https://github.com/Polymer/polymer -| -| +| +| | This product contains a modified portion of 'ASM', an open source | Java Bytecode library, which can be obtained at: -| +| | * LICENSE: | * agent/src/main/resources/io/perfmark/agent/third_party/asm/LICENSE (BSD style License) | * HOMEPAGE: @@ -196,61 +211,39 @@ This product bundles Perfmark with the following in its NOTICE file: This product bundles Conscrypt (openjdk-uber) with the following in its NOTICE file: | Copyright 2016 The Android Open Source Project -| +| | Licensed under the Apache License, Version 2.0 (the "License"); | you may not use this file except in compliance with the License. | You may obtain a copy of the License at -| +| | http://www.apache.org/licenses/LICENSE-2.0 -| +| | Unless required by applicable law or agreed to in writing, software | distributed under the License is distributed on an "AS IS" BASIS, | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | See the License for the specific language governing permissions and | limitations under the License. 
-| +| | ----------------------------------------------------------------------- | This product contains a modified portion of `Netty`, a configurable network | stack in Java, which can be obtained at: -| +| | * LICENSE: | * licenses/LICENSE.netty.txt (Apache License 2.0) | * HOMEPAGE: | * http://netty.io/ -| +| | This product contains a modified portion of `Apache Harmony`, modular Java runtime, | which can be obtained at: -| +| | * LICENSE: | * licenses/LICENSE.harmony.txt (Apache License 2.0) | * HOMEPAGE: | * https://harmony.apache.org/ -------------------------------------------------------------------------------- -This product bundles GCS Analytics Core with the following in its NOTICE file: -| # GCS Analytics Core -| -| GCS Analytics Core is a Java library designed to optimize analytics workloads on -| Google Cloud Storage (GCS). -| -| Copyright Google LLC -| -| Licensed under the Apache License, Version 2.0 (the "License"); -| you may not use this file except in compliance with the License. -| You may obtain a copy of the License at -| -| http://www.apache.org/licenses/LICENSE-2.0 -| -| Unless required by applicable law or agreed to in writing, software -| distributed under the License is distributed on an "AS IS" BASIS, -| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -| See the License for the specific language governing permissions and -| limitations under the License. - --------------------------------------------------------------------------------- - This product bundles Envoy with the following in its NOTICE file: | Envoy | Copyright The Envoy Project Authors -| +| | Licensed under Apache License 2.0. See LICENSE for terms. 
diff --git a/gcp-bundle/build.gradle b/gcp-bundle/build.gradle index d48763b5e366..9c4907bcdaa5 100644 --- a/gcp-bundle/build.gradle +++ b/gcp-bundle/build.gradle @@ -26,6 +26,7 @@ project(":iceberg-gcp-bundle") { configurations { implementation { exclude group: 'com.google.code.findbugs', module: 'jsr305' + exclude group: 'org.slf4j' } } @@ -48,10 +49,6 @@ project(":iceberg-gcp-bundle") { include 'NOTICE' } - dependencies { - exclude(dependency('org.slf4j:slf4j-api')) - } - // relocate GCP-specific versions relocate 'com.fasterxml.jackson', 'org.apache.iceberg.gcp.shaded.com.fasterxml.jackson' relocate 'com.google.common', 'org.apache.iceberg.gcp.shaded.com.google.common' diff --git a/gcp-bundle/runtime-deps.txt b/gcp-bundle/runtime-deps.txt index 9e471a7841d4..a109d4fb5676 100644 --- a/gcp-bundle/runtime-deps.txt +++ b/gcp-bundle/runtime-deps.txt @@ -108,6 +108,5 @@ org.codehaus.woodstox:stax2-api:4.2.2 org.conscrypt:conscrypt-openjdk-uber:2.5.2 org.json:json:20250517 org.jspecify:jspecify:1.0.0 -org.slf4j:slf4j-api:2.0.17 org.threeten:threeten-extra:1.8.0 org.threeten:threetenbp:1.7.0 From 86823e5a51b53f6154e824f24f3fe128e53f301f Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Wed, 6 May 2026 17:13:29 -0400 Subject: [PATCH 172/197] Spark: Fix LICENSE/NOTICE compliance for all versions of spark-runtime (v3.4, v3.5, v4.0, v4.1) (#16215) --- spark/v3.4/spark-runtime/LICENSE | 542 ++++++++++++++++++++++++++++++- spark/v3.4/spark-runtime/NOTICE | 37 ++- spark/v3.5/spark-runtime/LICENSE | 542 ++++++++++++++++++++++++++++++- spark/v3.5/spark-runtime/NOTICE | 37 ++- spark/v4.0/spark-runtime/LICENSE | 542 ++++++++++++++++++++++++++++++- spark/v4.0/spark-runtime/NOTICE | 35 ++ spark/v4.1/spark-runtime/LICENSE | 542 ++++++++++++++++++++++++++++++- spark/v4.1/spark-runtime/NOTICE | 37 ++- 8 files changed, 2299 insertions(+), 15 deletions(-) diff --git a/spark/v3.4/spark-runtime/LICENSE b/spark/v3.4/spark-runtime/LICENSE index 24a9e3706d17..3aceb9b01aa9 100644 --- 
a/spark/v3.4/spark-runtime/LICENSE +++ b/spark/v3.4/spark-runtime/LICENSE @@ -219,6 +219,94 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- +This product bundles FastDoubleParser (via Jackson JSON Processor). + +Copyright: 2023 Werner Randelshofer, Switzerland +Project URL: https://github.com/wrandelshofer/FastDoubleParser +License: MIT + +| Copyright (c) 2023 Werner Randelshofer, Switzerland +| +| Permission is hereby granted, free of charge, to any person obtaining a copy +| of this software and associated documentation files (the "Software"), to deal +| in the Software without restriction, including without limitation the rights +| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +| copies of the Software, and to permit persons to whom the Software is +| furnished to do so, subject to the following conditions: +| +| The above copyright notice and this permission notice shall be included in all +| copies or substantial portions of the Software. +| +| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +| SOFTWARE. + +-------------------------------------------------------------------------------- + +This product bundles fast_float (bundled by FastDoubleParser). 
+ +Copyright: 2021 The fast_float authors +Project URL: https://github.com/fastfloat/fast_float +License: MIT + +| Copyright (c) 2021 The fast_float authors +| +| Permission is hereby granted, free of charge, to any person obtaining a copy +| of this software and associated documentation files (the "Software"), to deal +| in the Software without restriction, including without limitation the rights +| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +| copies of the Software, and to permit persons to whom the Software is +| furnished to do so, subject to the following conditions: +| +| The above copyright notice and this permission notice shall be included in all +| copies or substantial portions of the Software. +| +| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +| SOFTWARE. + +-------------------------------------------------------------------------------- + +This product bundles bigint (bundled by FastDoubleParser). + +Copyright: 2022 Tim Buktu +Project URL: https://github.com/tbuktu/bigint +License: BSD 2-Clause + +| Copyright 2022 Tim Buktu +| +| Redistribution and use in source and binary forms, with or without +| modification, are permitted provided that the following conditions +| are met: +| +| 1. Redistributions of source code must retain the above copyright notice, this +| list of conditions and the following disclaimer. +| +| 2. 
Redistributions in binary form must reproduce the above copyright notice, +| this list of conditions and the following disclaimer in the documentation +| and/or other materials provided with the distribution. +| +| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +| ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +| WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +| DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +| FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +-------------------------------------------------------------------------------- + This product bundles Apache Parquet. Copyright: 2014-2024 The Apache Software Foundation @@ -360,6 +448,7 @@ This product bundles checkerframework checker-qual. Copyright: 2004-2019 the Checker Framework developers Project URL: https://github.com/typetools/checker-framework License: MIT license + | The annotations are licensed under the MIT License. (The text of this | license appears below.) More specifically, all the parts of the Checker | Framework that you might want to include with your own program use the @@ -414,6 +503,13 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- +This product bundles JCTools (via Netty). 
+ +Project URL: https://github.com/JCTools/JCTools +License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + This product bundles Google FlatBuffers. Copyright: 2013-2020 Google Inc. @@ -427,6 +523,7 @@ This product bundles ThreeTen Extra. Copyright: 2007-present, Stephen Colebourne & Michael Nascimento Santos. Project URL: https://www.threeten.org/threeten-extra/ License: BSD 3-Clause + | All rights reserved. | | * Redistribution and use in source and binary forms, with or without @@ -508,6 +605,387 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- +This product bundles the Mozilla Public Suffix List, distributed by Apache HttpComponents. + +Project URL: https://publicsuffix.org/ +License: Mozilla Public License, Version 2.0 - https://mozilla.org/MPL/2.0/ + +| Mozilla Public License Version 2.0 +| ================================== +| +| 1. Definitions +| -------------- +| +| 1.1. "Contributor" +| means each individual or legal entity that creates, contributes to +| the creation of, or owns Covered Software. +| +| 1.2. "Contributor Version" +| means the combination of the Contributions of others (if any) used +| by a Contributor and that particular Contributor's Contribution. +| +| 1.3. "Contribution" +| means Covered Software of a particular Contributor. +| +| 1.4. "Covered Software" +| means Source Code Form to which the initial Contributor has attached +| the notice in Exhibit A, the Executable Form of such Source Code +| Form, and Modifications of such Source Code Form, in each case +| including portions thereof. +| +| 1.5. 
"Incompatible With Secondary Licenses" +| means +| +| (a) that the initial Contributor has attached the notice described +| in Exhibit B to the Covered Software; or +| +| (b) that the Covered Software was made available under the terms of +| version 1.1 or earlier of the License, but not also under the +| terms of a Secondary License. +| +| 1.6. "Executable Form" +| means any form of the work other than Source Code Form. +| +| 1.7. "Larger Work" +| means a work that combines Covered Software with other material, in +| a separate file or files, that is not Covered Software. +| +| 1.8. "License" +| means this document. +| +| 1.9. "Licensable" +| means having the right to grant, to the maximum extent possible, +| whether at the time of the initial grant or subsequently, any and +| all of the rights conveyed by this License. +| +| 1.10. "Modifications" +| means any of the following: +| +| (a) any file in Source Code Form that results from an addition to, +| deletion from, or modification of the contents of Covered +| Software; or +| +| (b) any new file in Source Code Form that contains any Covered +| Software. +| +| 1.11. "Patent Claims" of a Contributor +| means any patent claim(s), including without limitation, method, +| process, and apparatus claims, in any patent Licensable by such +| Contributor that would be infringed, but for the grant of the +| License, by the making, using, selling, offering for sale, having +| made, import, or transfer of either its Contributions or its +| Contributor Version. +| +| 1.12. "Secondary License" +| means either the GNU General Public License, Version 2.0, the GNU +| Lesser General Public License, Version 2.1, the GNU Affero General +| Public License, Version 3.0, or any later versions of those +| licenses. +| +| 1.13. "Source Code Form" +| means the form of the work preferred for making modifications. +| +| 1.14. "You" (or "Your") +| means an individual or a legal entity exercising rights under this +| License. 
For legal entities, "You" includes any entity that +| controls, is controlled by, or is under common control with You. For +| purposes of this definition, "control" means (a) the power, direct +| or indirect, to cause the direction or management of such entity, +| whether by contract or otherwise, or (b) ownership of more than +| fifty percent (50%) of the outstanding shares or beneficial +| ownership of such entity. +| +| 2. License Grants and Conditions +| -------------------------------- +| +| 2.1. Grants +| +| Each Contributor hereby grants You a world-wide, royalty-free, +| non-exclusive license: +| +| (a) under intellectual property rights (other than patent or trademark) +| Licensable by such Contributor to use, reproduce, make available, +| modify, display, perform, distribute, and otherwise exploit its +| Contributions, either on an unmodified basis, with Modifications, or +| as part of a Larger Work; and +| +| (b) under Patent Claims of such Contributor to make, use, sell, offer +| for sale, have made, import, and otherwise transfer either its +| Contributions or its Contributor Version. +| +| 2.2. Effective Date +| +| The licenses granted in Section 2.1 with respect to any Contribution +| become effective for each Contribution on the date the Contributor first +| distributes such Contribution. +| +| 2.3. Limitations on Grant Scope +| +| The licenses granted in this Section 2 are the only rights granted under +| this License. No additional rights or licenses will be implied from the +| distribution or licensing of Covered Software under this License. 
+| Notwithstanding Section 2.1(b) above, no patent license is granted by a +| Contributor: +| +| (a) for any code that a Contributor has removed from Covered Software; +| or +| +| (b) for infringements caused by: (i) Your and any other third party's +| modifications of Covered Software, or (ii) the combination of its +| Contributions with other software (except as part of its Contributor +| Version); or +| +| (c) under Patent Claims infringed by Covered Software in the absence of +| its Contributions. +| +| This License does not grant any rights in the trademarks, service marks, +| or logos of any Contributor (except as may be necessary to comply with +| the notice requirements in Section 3.4). +| +| 2.4. Subsequent Licenses +| +| No Contributor makes additional grants as a result of Your choice to +| distribute the Covered Software under a subsequent version of this +| License (see Section 10.2) or under the terms of a Secondary License (if +| permitted under the terms of Section 3.3). +| +| 2.5. Representation +| +| Each Contributor represents that the Contributor believes its +| Contributions are its original creation(s) or it has sufficient rights +| to grant the rights to its Contributions conveyed by this License. +| +| 2.6. Fair Use +| +| This License is not intended to limit any rights You have under +| applicable copyright doctrines of fair use, fair dealing, or other +| equivalents. +| +| 2.7. Conditions +| +| Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted +| in Section 2.1. +| +| 3. Responsibilities +| ------------------- +| +| 3.1. Distribution of Source Form +| +| All distribution of Covered Software in Source Code Form, including any +| Modifications that You create or to which You contribute, must be under +| the terms of this License. You must inform recipients that the Source +| Code Form of the Covered Software is governed by the terms of this +| License, and how they can obtain a copy of this License. 
You may not +| attempt to alter or restrict the recipients' rights in the Source Code +| Form. +| +| 3.2. Distribution of Executable Form +| +| If You distribute Covered Software in Executable Form then: +| +| (a) such Covered Software must also be made available in Source Code +| Form, as described in Section 3.1, and You must inform recipients of +| the Executable Form how they can obtain a copy of such Source Code +| Form by reasonable means in a timely manner, at a charge no more +| than the cost of distribution to the recipient; and +| +| (b) You may distribute such Executable Form under the terms of this +| License, or sublicense it under different terms, provided that the +| license for the Executable Form does not attempt to limit or alter +| the recipients' rights in the Source Code Form under this License. +| +| 3.3. Distribution of a Larger Work +| +| You may create and distribute a Larger Work under terms of Your choice, +| provided that You also comply with the requirements of this License for +| the Covered Software. If the Larger Work is a combination of Covered +| Software with a work governed by one or more Secondary Licenses, and the +| Covered Software is not Incompatible With Secondary Licenses, this +| License permits You to additionally distribute such Covered Software +| under the terms of such Secondary License(s), so that the recipient of +| the Larger Work may, at their option, further distribute the Covered +| Software under the terms of either this License or such Secondary +| License(s). +| +| 3.4. Notices +| +| You may not remove or alter the substance of any license notices +| (including copyright notices, patent notices, disclaimers of warranty, +| or limitations of liability) contained within the Source Code Form of +| the Covered Software, except that You may alter any license notices to +| the extent required to remedy known factual inaccuracies. +| +| 3.5. 
Application of Additional Terms +| +| You may choose to offer, and to charge a fee for, warranty, support, +| indemnity or liability obligations to one or more recipients of Covered +| Software. However, You may do so only on Your own behalf, and not on +| behalf of any Contributor. You must make it absolutely clear that any +| such warranty, support, indemnity, or liability obligation is offered by +| You alone, and You hereby agree to indemnify every Contributor for any +| liability incurred by such Contributor as a result of warranty, support, +| indemnity or liability terms You offer. You may include additional +| disclaimers of warranty and limitations of liability specific to any +| jurisdiction. +| +| 4. Inability to Comply Due to Statute or Regulation +| --------------------------------------------------- +| +| If it is impossible for You to comply with any of the terms of this +| License with respect to some or all of the Covered Software due to +| statute, judicial order, or regulation then You must: (a) comply with +| the terms of this License to the maximum extent possible; and (b) +| describe the limitations and the code they affect. Such description must +| be placed in a text file included with all distributions of the Covered +| Software under this License. Except to the extent prohibited by statute +| or regulation, such description must be sufficiently detailed for a +| recipient of ordinary skill to be able to understand it. +| +| 5. Termination +| -------------- +| +| 5.1. The rights granted under this License will terminate automatically +| if You fail to comply with any of its terms. 
However, if You become +| compliant, then the rights granted under this License from a particular +| Contributor are reinstated (a) provisionally, unless and until such +| Contributor explicitly and finally terminates Your grants, and (b) on an +| ongoing basis, if such Contributor fails to notify You of the +| non-compliance by some reasonable means prior to 60 days after You have +| come back into compliance. Moreover, Your grants from a particular +| Contributor are reinstated on an ongoing basis if such Contributor +| notifies You of the non-compliance by some reasonable means, this is the +| first time You have received notice of non-compliance with this License +| from such Contributor, and You become compliant prior to 30 days after +| Your receipt of the notice. +| +| 5.2. If You initiate litigation against any entity by asserting a patent +| infringement claim (excluding declaratory judgment actions, +| counter-claims, and cross-claims) alleging that a Contributor Version +| directly or indirectly infringes any patent, then the rights granted to +| You by any and all Contributors for the Covered Software under Section +| 2.1 of this License shall terminate. +| +| 5.3. In the event of termination under Sections 5.1 or 5.2 above, all +| end user license agreements (excluding distributors and resellers) which +| have been validly granted by You or Your distributors under this License +| prior to termination shall survive termination. +| +| ************************************************************************ +| * * +| * 6. Disclaimer of Warranty * +| * ------------------------- * +| * * +| * Covered Software is provided under this License on an "as is" * +| * basis, without warranty of any kind, either expressed, implied, or * +| * statutory, including, without limitation, warranties that the * +| * Covered Software is free of defects, merchantable, fit for a * +| * particular purpose or non-infringing. 
The entire risk as to the * +| * quality and performance of the Covered Software is with You. * +| * Should any Covered Software prove defective in any respect, You * +| * (not any Contributor) assume the cost of any necessary servicing, * +| * repair, or correction. This disclaimer of warranty constitutes an * +| * essential part of this License. No use of any Covered Software is * +| * authorized under this License except under this disclaimer. * +| * * +| ************************************************************************ +| +| ************************************************************************ +| * * +| * 7. Limitation of Liability * +| * -------------------------- * +| * * +| * Under no circumstances and under no legal theory, whether tort * +| * (including negligence), contract, or otherwise, shall any * +| * Contributor, or anyone who distributes Covered Software as * +| * permitted above, be liable to You for any direct, indirect, * +| * special, incidental, or consequential damages of any character * +| * including, without limitation, damages for lost profits, loss of * +| * goodwill, work stoppage, computer failure or malfunction, or any * +| * and all other commercial damages or losses, even if such party * +| * shall have been informed of the possibility of such damages. This * +| * limitation of liability shall not apply to liability for death or * +| * personal injury resulting from such party's negligence to the * +| * extent applicable law prohibits such limitation. Some * +| * jurisdictions do not allow the exclusion or limitation of * +| * incidental or consequential damages, so this exclusion and * +| * limitation may not apply to You. * +| * * +| ************************************************************************ +| +| 8. 
Litigation +| ------------- +| +| Any litigation relating to this License may be brought only in the +| courts of a jurisdiction where the defendant maintains its principal +| place of business and such litigation shall be governed by laws of that +| jurisdiction, without reference to its conflict-of-law provisions. +| Nothing in this Section shall prevent a party's ability to bring +| cross-claims or counter-claims. +| +| 9. Miscellaneous +| ---------------- +| +| This License represents the complete agreement concerning the subject +| matter hereof. If any provision of this License is held to be +| unenforceable, such provision shall be reformed only to the extent +| necessary to make it enforceable. Any law or regulation which provides +| that the language of a contract shall be construed against the drafter +| shall not be used to construe this License against a Contributor. +| +| 10. Versions of the License +| --------------------------- +| +| 10.1. New Versions +| +| Mozilla Foundation is the license steward. Except as provided in Section +| 10.3, no one other than the license steward has the right to modify or +| publish new versions of this License. Each version will be given a +| distinguishing version number. +| +| 10.2. Effect of New Versions +| +| You may distribute the Covered Software under the terms of the version +| of the License under which You originally received the Covered Software, +| or under the terms of any subsequent version published by the license +| steward. +| +| 10.3. Modified Versions +| +| If you create software not governed by this License, and you want to +| create a new license for such software, you may create and use a +| modified version of this License if you rename the license and remove +| any references to the name of the license steward (except to note that +| such modified license differs from this License). +| +| 10.4. 
Distributing Source Code Form that is Incompatible With Secondary +| Licenses +| +| If You choose to distribute Source Code Form that is Incompatible With +| Secondary Licenses under the terms of this version of the License, the +| notice described in Exhibit B of this License must be attached. +| +| Exhibit A - Source Code Form License Notice +| ------------------------------------------- +| +| This Source Code Form is subject to the terms of the Mozilla Public +| License, v. 2.0. If a copy of the MPL was not distributed with this +| file, You can obtain one at http://mozilla.org/MPL/2.0/. +| +| If it is not possible or desirable to put the notice in a particular +| file, then You may include the notice in a location (such as a LICENSE +| file in a relevant directory) where a recipient would be likely to look +| for such a notice. +| +| You may add additional accurate notices of copyright ownership. +| +| Exhibit B - "Incompatible With Secondary Licenses" Notice +| --------------------------------------------------------- +| +| This Source Code Form is "Incompatible With Secondary Licenses", as +| defined by the Mozilla Public License, v. 2.0. + +-------------------------------------------------------------------------------- + This product bundles failsafe. Copyright: Jonathan Halterman and friends @@ -526,7 +1004,37 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 This product bundles Eclipse Collections. Project URL: https://github.com/eclipse-collections/eclipse-collections -License: Eclipse Public License v. 1.0 - https://www.eclipse.org/legal/epl-v10.html +License: Eclipse Distribution License v. 1.0 - https://www.eclipse.org/org/documents/edl-v10.php + +| Eclipse Distribution License - v 1.0 +| +| Copyright (c) 2007, Eclipse Foundation, Inc. and its licensors. +| +| All rights reserved. 
+| +| Redistribution and use in source and binary forms, with or without +| modification, are permitted provided that the following conditions are met: +| +| * Redistributions of source code must retain the above copyright notice, +| this list of conditions and the following disclaimer. +| * Redistributions in binary form must reproduce the above copyright notice, +| this list of conditions and the following disclaimer in the documentation +| and/or other materials provided with the distribution. +| * Neither the name of the Eclipse Foundation, Inc. nor the names of its +| contributors may be used to endorse or promote products derived from this +| software without specific prior written permission. +| +| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +| SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +| INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +| CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +| ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +| POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- @@ -540,8 +1048,37 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 This product bundles JTS Topology Suite. Project URL: https://github.com/locationtech/jts -License: Eclipse Public License v. 2.0 - https://www.eclipse.org/org/documents/epl-2.0/EPL-2.0.txt +License: Eclipse Distribution License v. 
1.0 - https://www.eclipse.org/org/documents/edl-v10.php +| Eclipse Distribution License - v 1.0 +| +| Copyright (c) 2007, Eclipse Foundation, Inc. and its licensors. +| +| All rights reserved. +| +| Redistribution and use in source and binary forms, with or without +| modification, are permitted provided that the following conditions are met: +| +| * Redistributions of source code must retain the above copyright notice, +| this list of conditions and the following disclaimer. +| * Redistributions in binary form must reproduce the above copyright notice, +| this list of conditions and the following disclaimer in the documentation +| and/or other materials provided with the distribution. +| * Neither the name of the Eclipse Foundation, Inc. nor the names of its +| contributors may be used to endorse or promote products derived from this +| software without specific prior written permission. +| +| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +| SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +| INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +| CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +| ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +| POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- This product bundles RoaringBitmap. @@ -549,4 +1086,3 @@ This product bundles RoaringBitmap. Copyright: (c) 2013-... 
the RoaringBitmap authors Project URL: https://github.com/RoaringBitmap/RoaringBitmap License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - diff --git a/spark/v3.4/spark-runtime/NOTICE b/spark/v3.4/spark-runtime/NOTICE index 17989b43a371..c038e853af77 100644 --- a/spark/v3.4/spark-runtime/NOTICE +++ b/spark/v3.4/spark-runtime/NOTICE @@ -335,6 +335,42 @@ This product bundles Project Nessie with the following in its NOTICE file: -------------------------------------------------------------------------------- +This product bundles Jackson JSON Processor with the following in its NOTICE file: +| # Jackson JSON processor +| +| Jackson is a high-performance, Free/Open Source JSON processing library. +| It was originally written by Tatu Saloranta (tatu.saloranta@iki.fi), and has +| been in development since 2007. +| It is currently developed by a community of developers. +| +| ## Copyright +| +| Copyright 2007-, Tatu Saloranta (tatu.saloranta@iki.fi) +| +| ## Licensing +| +| Jackson 2.x core and extension components are licensed under Apache License 2.0 +| To find the details that apply to this artifact see the accompanying LICENSE file. +| +| ## Credits +| +| A list of contributors may be found from CREDITS(-2.x) file, which is included +| in some artifacts (usually source distributions); but is always available +| from the source code management (SCM) system project uses. +| +| ## FastDoubleParser +| +| jackson-core bundles a shaded copy of FastDoubleParser . +| That code is available under an MIT license +| under the following copyright. +| +| Copyright © 2023 Werner Randelshofer, Switzerland. MIT License. +| +| See FastDoubleParser-NOTICE for details of other source code included in FastDoubleParser +| and the licenses and copyrights that apply to that code. 
+ +-------------------------------------------------------------------------------- + This product bundles Eclipse MicroProfile OpenAPI with the following in its NOTICE file: | ========================================================================= | == NOTICE file corresponding to section 4(d) of the Apache License, == @@ -355,4 +391,3 @@ This product bundles Eclipse MicroProfile OpenAPI with the following in its NOTI | PackageCopyrightText: | Arthur De Magalhaes arthurdm@ca.ibm.com | - diff --git a/spark/v3.5/spark-runtime/LICENSE b/spark/v3.5/spark-runtime/LICENSE index 24a9e3706d17..50c91faf8edb 100644 --- a/spark/v3.5/spark-runtime/LICENSE +++ b/spark/v3.5/spark-runtime/LICENSE @@ -219,6 +219,94 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- +This product bundles FastDoubleParser (via Jackson JSON Processor). + +Copyright: 2023 Werner Randelshofer, Switzerland +Project URL: https://github.com/wrandelshofer/FastDoubleParser +License: MIT + +| Copyright (c) 2023 Werner Randelshofer, Switzerland +| +| Permission is hereby granted, free of charge, to any person obtaining a copy +| of this software and associated documentation files (the "Software"), to deal +| in the Software without restriction, including without limitation the rights +| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +| copies of the Software, and to permit persons to whom the Software is +| furnished to do so, subject to the following conditions: +| +| The above copyright notice and this permission notice shall be included in all +| copies or substantial portions of the Software. +| +| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +| SOFTWARE. + +-------------------------------------------------------------------------------- + +This product bundles fast_float (bundled by FastDoubleParser). + +Copyright: 2021 The fast_float authors +Project URL: https://github.com/fastfloat/fast_float +License: MIT + +| Copyright (c) 2021 The fast_float authors +| +| Permission is hereby granted, free of charge, to any person obtaining a copy +| of this software and associated documentation files (the "Software"), to deal +| in the Software without restriction, including without limitation the rights +| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +| copies of the Software, and to permit persons to whom the Software is +| furnished to do so, subject to the following conditions: +| +| The above copyright notice and this permission notice shall be included in all +| copies or substantial portions of the Software. +| +| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +| SOFTWARE. + +-------------------------------------------------------------------------------- + +This product bundles bigint (bundled by FastDoubleParser). 
+ +Copyright: 2022 Tim Buktu +Project URL: https://github.com/tbuktu/bigint +License: BSD 2-Clause + +| Copyright 2022 Tim Buktu +| +| Redistribution and use in source and binary forms, with or without +| modification, are permitted provided that the following conditions +| are met: +| +| 1. Redistributions of source code must retain the above copyright notice, this +| list of conditions and the following disclaimer. +| +| 2. Redistributions in binary form must reproduce the above copyright notice, +| this list of conditions and the following disclaimer in the documentation +| and/or other materials provided with the distribution. +| +| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +| ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +| WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +| DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +| FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +-------------------------------------------------------------------------------- + This product bundles Apache Parquet. Copyright: 2014-2024 The Apache Software Foundation @@ -347,6 +435,387 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- +This product bundles the Mozilla Public Suffix List, distributed by Apache HttpComponents. 
+ +Project URL: https://publicsuffix.org/ +License: Mozilla Public License, Version 2.0 - https://mozilla.org/MPL/2.0/ + +| Mozilla Public License Version 2.0 +| ================================== +| +| 1. Definitions +| -------------- +| +| 1.1. "Contributor" +| means each individual or legal entity that creates, contributes to +| the creation of, or owns Covered Software. +| +| 1.2. "Contributor Version" +| means the combination of the Contributions of others (if any) used +| by a Contributor and that particular Contributor's Contribution. +| +| 1.3. "Contribution" +| means Covered Software of a particular Contributor. +| +| 1.4. "Covered Software" +| means Source Code Form to which the initial Contributor has attached +| the notice in Exhibit A, the Executable Form of such Source Code +| Form, and Modifications of such Source Code Form, in each case +| including portions thereof. +| +| 1.5. "Incompatible With Secondary Licenses" +| means +| +| (a) that the initial Contributor has attached the notice described +| in Exhibit B to the Covered Software; or +| +| (b) that the Covered Software was made available under the terms of +| version 1.1 or earlier of the License, but not also under the +| terms of a Secondary License. +| +| 1.6. "Executable Form" +| means any form of the work other than Source Code Form. +| +| 1.7. "Larger Work" +| means a work that combines Covered Software with other material, in +| a separate file or files, that is not Covered Software. +| +| 1.8. "License" +| means this document. +| +| 1.9. "Licensable" +| means having the right to grant, to the maximum extent possible, +| whether at the time of the initial grant or subsequently, any and +| all of the rights conveyed by this License. +| +| 1.10. 
"Modifications" +| means any of the following: +| +| (a) any file in Source Code Form that results from an addition to, +| deletion from, or modification of the contents of Covered +| Software; or +| +| (b) any new file in Source Code Form that contains any Covered +| Software. +| +| 1.11. "Patent Claims" of a Contributor +| means any patent claim(s), including without limitation, method, +| process, and apparatus claims, in any patent Licensable by such +| Contributor that would be infringed, but for the grant of the +| License, by the making, using, selling, offering for sale, having +| made, import, or transfer of either its Contributions or its +| Contributor Version. +| +| 1.12. "Secondary License" +| means either the GNU General Public License, Version 2.0, the GNU +| Lesser General Public License, Version 2.1, the GNU Affero General +| Public License, Version 3.0, or any later versions of those +| licenses. +| +| 1.13. "Source Code Form" +| means the form of the work preferred for making modifications. +| +| 1.14. "You" (or "Your") +| means an individual or a legal entity exercising rights under this +| License. For legal entities, "You" includes any entity that +| controls, is controlled by, or is under common control with You. For +| purposes of this definition, "control" means (a) the power, direct +| or indirect, to cause the direction or management of such entity, +| whether by contract or otherwise, or (b) ownership of more than +| fifty percent (50%) of the outstanding shares or beneficial +| ownership of such entity. +| +| 2. License Grants and Conditions +| -------------------------------- +| +| 2.1. 
Grants +| +| Each Contributor hereby grants You a world-wide, royalty-free, +| non-exclusive license: +| +| (a) under intellectual property rights (other than patent or trademark) +| Licensable by such Contributor to use, reproduce, make available, +| modify, display, perform, distribute, and otherwise exploit its +| Contributions, either on an unmodified basis, with Modifications, or +| as part of a Larger Work; and +| +| (b) under Patent Claims of such Contributor to make, use, sell, offer +| for sale, have made, import, and otherwise transfer either its +| Contributions or its Contributor Version. +| +| 2.2. Effective Date +| +| The licenses granted in Section 2.1 with respect to any Contribution +| become effective for each Contribution on the date the Contributor first +| distributes such Contribution. +| +| 2.3. Limitations on Grant Scope +| +| The licenses granted in this Section 2 are the only rights granted under +| this License. No additional rights or licenses will be implied from the +| distribution or licensing of Covered Software under this License. +| Notwithstanding Section 2.1(b) above, no patent license is granted by a +| Contributor: +| +| (a) for any code that a Contributor has removed from Covered Software; +| or +| +| (b) for infringements caused by: (i) Your and any other third party's +| modifications of Covered Software, or (ii) the combination of its +| Contributions with other software (except as part of its Contributor +| Version); or +| +| (c) under Patent Claims infringed by Covered Software in the absence of +| its Contributions. +| +| This License does not grant any rights in the trademarks, service marks, +| or logos of any Contributor (except as may be necessary to comply with +| the notice requirements in Section 3.4). +| +| 2.4. 
Subsequent Licenses +| +| No Contributor makes additional grants as a result of Your choice to +| distribute the Covered Software under a subsequent version of this +| License (see Section 10.2) or under the terms of a Secondary License (if +| permitted under the terms of Section 3.3). +| +| 2.5. Representation +| +| Each Contributor represents that the Contributor believes its +| Contributions are its original creation(s) or it has sufficient rights +| to grant the rights to its Contributions conveyed by this License. +| +| 2.6. Fair Use +| +| This License is not intended to limit any rights You have under +| applicable copyright doctrines of fair use, fair dealing, or other +| equivalents. +| +| 2.7. Conditions +| +| Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted +| in Section 2.1. +| +| 3. Responsibilities +| ------------------- +| +| 3.1. Distribution of Source Form +| +| All distribution of Covered Software in Source Code Form, including any +| Modifications that You create or to which You contribute, must be under +| the terms of this License. You must inform recipients that the Source +| Code Form of the Covered Software is governed by the terms of this +| License, and how they can obtain a copy of this License. You may not +| attempt to alter or restrict the recipients' rights in the Source Code +| Form. +| +| 3.2. 
Distribution of Executable Form +| +| If You distribute Covered Software in Executable Form then: +| +| (a) such Covered Software must also be made available in Source Code +| Form, as described in Section 3.1, and You must inform recipients of +| the Executable Form how they can obtain a copy of such Source Code +| Form by reasonable means in a timely manner, at a charge no more +| than the cost of distribution to the recipient; and +| +| (b) You may distribute such Executable Form under the terms of this +| License, or sublicense it under different terms, provided that the +| license for the Executable Form does not attempt to limit or alter +| the recipients' rights in the Source Code Form under this License. +| +| 3.3. Distribution of a Larger Work +| +| You may create and distribute a Larger Work under terms of Your choice, +| provided that You also comply with the requirements of this License for +| the Covered Software. If the Larger Work is a combination of Covered +| Software with a work governed by one or more Secondary Licenses, and the +| Covered Software is not Incompatible With Secondary Licenses, this +| License permits You to additionally distribute such Covered Software +| under the terms of such Secondary License(s), so that the recipient of +| the Larger Work may, at their option, further distribute the Covered +| Software under the terms of either this License or such Secondary +| License(s). +| +| 3.4. Notices +| +| You may not remove or alter the substance of any license notices +| (including copyright notices, patent notices, disclaimers of warranty, +| or limitations of liability) contained within the Source Code Form of +| the Covered Software, except that You may alter any license notices to +| the extent required to remedy known factual inaccuracies. +| +| 3.5. 
Application of Additional Terms +| +| You may choose to offer, and to charge a fee for, warranty, support, +| indemnity or liability obligations to one or more recipients of Covered +| Software. However, You may do so only on Your own behalf, and not on +| behalf of any Contributor. You must make it absolutely clear that any +| such warranty, support, indemnity, or liability obligation is offered by +| You alone, and You hereby agree to indemnify every Contributor for any +| liability incurred by such Contributor as a result of warranty, support, +| indemnity or liability terms You offer. You may include additional +| disclaimers of warranty and limitations of liability specific to any +| jurisdiction. +| +| 4. Inability to Comply Due to Statute or Regulation +| --------------------------------------------------- +| +| If it is impossible for You to comply with any of the terms of this +| License with respect to some or all of the Covered Software due to +| statute, judicial order, or regulation then You must: (a) comply with +| the terms of this License to the maximum extent possible; and (b) +| describe the limitations and the code they affect. Such description must +| be placed in a text file included with all distributions of the Covered +| Software under this License. Except to the extent prohibited by statute +| or regulation, such description must be sufficiently detailed for a +| recipient of ordinary skill to be able to understand it. +| +| 5. Termination +| -------------- +| +| 5.1. The rights granted under this License will terminate automatically +| if You fail to comply with any of its terms. 
However, if You become +| compliant, then the rights granted under this License from a particular +| Contributor are reinstated (a) provisionally, unless and until such +| Contributor explicitly and finally terminates Your grants, and (b) on an +| ongoing basis, if such Contributor fails to notify You of the +| non-compliance by some reasonable means prior to 60 days after You have +| come back into compliance. Moreover, Your grants from a particular +| Contributor are reinstated on an ongoing basis if such Contributor +| notifies You of the non-compliance by some reasonable means, this is the +| first time You have received notice of non-compliance with this License +| from such Contributor, and You become compliant prior to 30 days after +| Your receipt of the notice. +| +| 5.2. If You initiate litigation against any entity by asserting a patent +| infringement claim (excluding declaratory judgment actions, +| counter-claims, and cross-claims) alleging that a Contributor Version +| directly or indirectly infringes any patent, then the rights granted to +| You by any and all Contributors for the Covered Software under Section +| 2.1 of this License shall terminate. +| +| 5.3. In the event of termination under Sections 5.1 or 5.2 above, all +| end user license agreements (excluding distributors and resellers) which +| have been validly granted by You or Your distributors under this License +| prior to termination shall survive termination. +| +| ************************************************************************ +| * * +| * 6. Disclaimer of Warranty * +| * ------------------------- * +| * * +| * Covered Software is provided under this License on an "as is" * +| * basis, without warranty of any kind, either expressed, implied, or * +| * statutory, including, without limitation, warranties that the * +| * Covered Software is free of defects, merchantable, fit for a * +| * particular purpose or non-infringing. 
The entire risk as to the * +| * quality and performance of the Covered Software is with You. * +| * Should any Covered Software prove defective in any respect, You * +| * (not any Contributor) assume the cost of any necessary servicing, * +| * repair, or correction. This disclaimer of warranty constitutes an * +| * essential part of this License. No use of any Covered Software is * +| * authorized under this License except under this disclaimer. * +| * * +| ************************************************************************ +| +| ************************************************************************ +| * * +| * 7. Limitation of Liability * +| * -------------------------- * +| * * +| * Under no circumstances and under no legal theory, whether tort * +| * (including negligence), contract, or otherwise, shall any * +| * Contributor, or anyone who distributes Covered Software as * +| * permitted above, be liable to You for any direct, indirect, * +| * special, incidental, or consequential damages of any character * +| * including, without limitation, damages for lost profits, loss of * +| * goodwill, work stoppage, computer failure or malfunction, or any * +| * and all other commercial damages or losses, even if such party * +| * shall have been informed of the possibility of such damages. This * +| * limitation of liability shall not apply to liability for death or * +| * personal injury resulting from such party's negligence to the * +| * extent applicable law prohibits such limitation. Some * +| * jurisdictions do not allow the exclusion or limitation of * +| * incidental or consequential damages, so this exclusion and * +| * limitation may not apply to You. * +| * * +| ************************************************************************ +| +| 8. 
Litigation +| ------------- +| +| Any litigation relating to this License may be brought only in the +| courts of a jurisdiction where the defendant maintains its principal +| place of business and such litigation shall be governed by laws of that +| jurisdiction, without reference to its conflict-of-law provisions. +| Nothing in this Section shall prevent a party's ability to bring +| cross-claims or counter-claims. +| +| 9. Miscellaneous +| ---------------- +| +| This License represents the complete agreement concerning the subject +| matter hereof. If any provision of this License is held to be +| unenforceable, such provision shall be reformed only to the extent +| necessary to make it enforceable. Any law or regulation which provides +| that the language of a contract shall be construed against the drafter +| shall not be used to construe this License against a Contributor. +| +| 10. Versions of the License +| --------------------------- +| +| 10.1. New Versions +| +| Mozilla Foundation is the license steward. Except as provided in Section +| 10.3, no one other than the license steward has the right to modify or +| publish new versions of this License. Each version will be given a +| distinguishing version number. +| +| 10.2. Effect of New Versions +| +| You may distribute the Covered Software under the terms of the version +| of the License under which You originally received the Covered Software, +| or under the terms of any subsequent version published by the license +| steward. +| +| 10.3. Modified Versions +| +| If you create software not governed by this License, and you want to +| create a new license for such software, you may create and use a +| modified version of this License if you rename the license and remove +| any references to the name of the license steward (except to note that +| such modified license differs from this License). +| +| 10.4. 
Distributing Source Code Form that is Incompatible With Secondary +| Licenses +| +| If You choose to distribute Source Code Form that is Incompatible With +| Secondary Licenses under the terms of this version of the License, the +| notice described in Exhibit B of this License must be attached. +| +| Exhibit A - Source Code Form License Notice +| ------------------------------------------- +| +| This Source Code Form is subject to the terms of the Mozilla Public +| License, v. 2.0. If a copy of the MPL was not distributed with this +| file, You can obtain one at http://mozilla.org/MPL/2.0/. +| +| If it is not possible or desirable to put the notice in a particular +| file, then You may include the notice in a location (such as a LICENSE +| file in a relevant directory) where a recipient would be likely to look +| for such a notice. +| +| You may add additional accurate notices of copyright ownership. +| +| Exhibit B - "Incompatible With Secondary Licenses" Notice +| --------------------------------------------------------- +| +| This Source Code Form is "Incompatible With Secondary Licenses", as +| defined by the Mozilla Public License, v. 2.0. + +-------------------------------------------------------------------------------- + This product bundles Google Error Prone Annotations. Copyright: Copyright 2011-2019 The Error Prone Authors @@ -360,6 +829,7 @@ This product bundles checkerframework checker-qual. Copyright: 2004-2019 the Checker Framework developers Project URL: https://github.com/typetools/checker-framework License: MIT license + | The annotations are licensed under the MIT License. (The text of this | license appears below.) 
More specifically, all the parts of the Checker | Framework that you might want to include with your own program use the @@ -414,6 +884,13 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- +This product bundles JCTools (via Netty). + +Project URL: https://github.com/JCTools/JCTools +License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + This product bundles Google FlatBuffers. Copyright: 2013-2020 Google Inc. @@ -427,6 +904,7 @@ This product bundles ThreeTen Extra. Copyright: 2007-present, Stephen Colebourne & Michael Nascimento Santos. Project URL: https://www.threeten.org/threeten-extra/ License: BSD 3-Clause + | All rights reserved. | | * Redistribution and use in source and binary forms, with or without @@ -526,7 +1004,37 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 This product bundles Eclipse Collections. Project URL: https://github.com/eclipse-collections/eclipse-collections -License: Eclipse Public License v. 1.0 - https://www.eclipse.org/legal/epl-v10.html +License: Eclipse Distribution License v. 1.0 - https://www.eclipse.org/org/documents/edl-v10.php + +| Eclipse Distribution License - v 1.0 +| +| Copyright (c) 2007, Eclipse Foundation, Inc. and its licensors. +| +| All rights reserved. +| +| Redistribution and use in source and binary forms, with or without +| modification, are permitted provided that the following conditions are met: +| +| * Redistributions of source code must retain the above copyright notice, +| this list of conditions and the following disclaimer. +| * Redistributions in binary form must reproduce the above copyright notice, +| this list of conditions and the following disclaimer in the documentation +| and/or other materials provided with the distribution. 
+| * Neither the name of the Eclipse Foundation, Inc. nor the names of its +| contributors may be used to endorse or promote products derived from this +| software without specific prior written permission. +| +| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +| SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +| INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +| CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +| ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +| POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- @@ -540,8 +1048,37 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 This product bundles JTS Topology Suite. Project URL: https://github.com/locationtech/jts -License: Eclipse Public License v. 2.0 - https://www.eclipse.org/org/documents/epl-2.0/EPL-2.0.txt +License: Eclipse Distribution License v. 1.0 - https://www.eclipse.org/org/documents/edl-v10.php +| Eclipse Distribution License - v 1.0 +| +| Copyright (c) 2007, Eclipse Foundation, Inc. and its licensors. +| +| All rights reserved. +| +| Redistribution and use in source and binary forms, with or without +| modification, are permitted provided that the following conditions are met: +| +| * Redistributions of source code must retain the above copyright notice, +| this list of conditions and the following disclaimer. 
+| * Redistributions in binary form must reproduce the above copyright notice, +| this list of conditions and the following disclaimer in the documentation +| and/or other materials provided with the distribution. +| * Neither the name of the Eclipse Foundation, Inc. nor the names of its +| contributors may be used to endorse or promote products derived from this +| software without specific prior written permission. +| +| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +| SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +| INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +| CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +| ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +| POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- This product bundles RoaringBitmap. @@ -549,4 +1086,3 @@ This product bundles RoaringBitmap. Copyright: (c) 2013-... 
the RoaringBitmap authors Project URL: https://github.com/RoaringBitmap/RoaringBitmap License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - diff --git a/spark/v3.5/spark-runtime/NOTICE b/spark/v3.5/spark-runtime/NOTICE index 17989b43a371..c038e853af77 100644 --- a/spark/v3.5/spark-runtime/NOTICE +++ b/spark/v3.5/spark-runtime/NOTICE @@ -335,6 +335,42 @@ This product bundles Project Nessie with the following in its NOTICE file: -------------------------------------------------------------------------------- +This product bundles Jackson JSON Processor with the following in its NOTICE file: +| # Jackson JSON processor +| +| Jackson is a high-performance, Free/Open Source JSON processing library. +| It was originally written by Tatu Saloranta (tatu.saloranta@iki.fi), and has +| been in development since 2007. +| It is currently developed by a community of developers. +| +| ## Copyright +| +| Copyright 2007-, Tatu Saloranta (tatu.saloranta@iki.fi) +| +| ## Licensing +| +| Jackson 2.x core and extension components are licensed under Apache License 2.0 +| To find the details that apply to this artifact see the accompanying LICENSE file. +| +| ## Credits +| +| A list of contributors may be found from CREDITS(-2.x) file, which is included +| in some artifacts (usually source distributions); but is always available +| from the source code management (SCM) system project uses. +| +| ## FastDoubleParser +| +| jackson-core bundles a shaded copy of FastDoubleParser . +| That code is available under an MIT license +| under the following copyright. +| +| Copyright © 2023 Werner Randelshofer, Switzerland. MIT License. +| +| See FastDoubleParser-NOTICE for details of other source code included in FastDoubleParser +| and the licenses and copyrights that apply to that code. 
+ +-------------------------------------------------------------------------------- + This product bundles Eclipse MicroProfile OpenAPI with the following in its NOTICE file: | ========================================================================= | == NOTICE file corresponding to section 4(d) of the Apache License, == @@ -355,4 +391,3 @@ This product bundles Eclipse MicroProfile OpenAPI with the following in its NOTI | PackageCopyrightText: | Arthur De Magalhaes arthurdm@ca.ibm.com | - diff --git a/spark/v4.0/spark-runtime/LICENSE b/spark/v4.0/spark-runtime/LICENSE index 24a9e3706d17..50c91faf8edb 100644 --- a/spark/v4.0/spark-runtime/LICENSE +++ b/spark/v4.0/spark-runtime/LICENSE @@ -219,6 +219,94 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- +This product bundles FastDoubleParser (via Jackson JSON Processor). + +Copyright: 2023 Werner Randelshofer, Switzerland +Project URL: https://github.com/wrandelshofer/FastDoubleParser +License: MIT + +| Copyright (c) 2023 Werner Randelshofer, Switzerland +| +| Permission is hereby granted, free of charge, to any person obtaining a copy +| of this software and associated documentation files (the "Software"), to deal +| in the Software without restriction, including without limitation the rights +| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +| copies of the Software, and to permit persons to whom the Software is +| furnished to do so, subject to the following conditions: +| +| The above copyright notice and this permission notice shall be included in all +| copies or substantial portions of the Software. +| +| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +| SOFTWARE. + +-------------------------------------------------------------------------------- + +This product bundles fast_float (bundled by FastDoubleParser). + +Copyright: 2021 The fast_float authors +Project URL: https://github.com/fastfloat/fast_float +License: MIT + +| Copyright (c) 2021 The fast_float authors +| +| Permission is hereby granted, free of charge, to any person obtaining a copy +| of this software and associated documentation files (the "Software"), to deal +| in the Software without restriction, including without limitation the rights +| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +| copies of the Software, and to permit persons to whom the Software is +| furnished to do so, subject to the following conditions: +| +| The above copyright notice and this permission notice shall be included in all +| copies or substantial portions of the Software. +| +| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +| SOFTWARE. + +-------------------------------------------------------------------------------- + +This product bundles bigint (bundled by FastDoubleParser). 
+ +Copyright: 2022 Tim Buktu +Project URL: https://github.com/tbuktu/bigint +License: BSD 2-Clause + +| Copyright 2022 Tim Buktu +| +| Redistribution and use in source and binary forms, with or without +| modification, are permitted provided that the following conditions +| are met: +| +| 1. Redistributions of source code must retain the above copyright notice, this +| list of conditions and the following disclaimer. +| +| 2. Redistributions in binary form must reproduce the above copyright notice, +| this list of conditions and the following disclaimer in the documentation +| and/or other materials provided with the distribution. +| +| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +| ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +| WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +| DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +| FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +-------------------------------------------------------------------------------- + This product bundles Apache Parquet. Copyright: 2014-2024 The Apache Software Foundation @@ -347,6 +435,387 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- +This product bundles the Mozilla Public Suffix List, distributed by Apache HttpComponents. 
+ +Project URL: https://publicsuffix.org/ +License: Mozilla Public License, Version 2.0 - https://mozilla.org/MPL/2.0/ + +| Mozilla Public License Version 2.0 +| ================================== +| +| 1. Definitions +| -------------- +| +| 1.1. "Contributor" +| means each individual or legal entity that creates, contributes to +| the creation of, or owns Covered Software. +| +| 1.2. "Contributor Version" +| means the combination of the Contributions of others (if any) used +| by a Contributor and that particular Contributor's Contribution. +| +| 1.3. "Contribution" +| means Covered Software of a particular Contributor. +| +| 1.4. "Covered Software" +| means Source Code Form to which the initial Contributor has attached +| the notice in Exhibit A, the Executable Form of such Source Code +| Form, and Modifications of such Source Code Form, in each case +| including portions thereof. +| +| 1.5. "Incompatible With Secondary Licenses" +| means +| +| (a) that the initial Contributor has attached the notice described +| in Exhibit B to the Covered Software; or +| +| (b) that the Covered Software was made available under the terms of +| version 1.1 or earlier of the License, but not also under the +| terms of a Secondary License. +| +| 1.6. "Executable Form" +| means any form of the work other than Source Code Form. +| +| 1.7. "Larger Work" +| means a work that combines Covered Software with other material, in +| a separate file or files, that is not Covered Software. +| +| 1.8. "License" +| means this document. +| +| 1.9. "Licensable" +| means having the right to grant, to the maximum extent possible, +| whether at the time of the initial grant or subsequently, any and +| all of the rights conveyed by this License. +| +| 1.10. 
"Modifications" +| means any of the following: +| +| (a) any file in Source Code Form that results from an addition to, +| deletion from, or modification of the contents of Covered +| Software; or +| +| (b) any new file in Source Code Form that contains any Covered +| Software. +| +| 1.11. "Patent Claims" of a Contributor +| means any patent claim(s), including without limitation, method, +| process, and apparatus claims, in any patent Licensable by such +| Contributor that would be infringed, but for the grant of the +| License, by the making, using, selling, offering for sale, having +| made, import, or transfer of either its Contributions or its +| Contributor Version. +| +| 1.12. "Secondary License" +| means either the GNU General Public License, Version 2.0, the GNU +| Lesser General Public License, Version 2.1, the GNU Affero General +| Public License, Version 3.0, or any later versions of those +| licenses. +| +| 1.13. "Source Code Form" +| means the form of the work preferred for making modifications. +| +| 1.14. "You" (or "Your") +| means an individual or a legal entity exercising rights under this +| License. For legal entities, "You" includes any entity that +| controls, is controlled by, or is under common control with You. For +| purposes of this definition, "control" means (a) the power, direct +| or indirect, to cause the direction or management of such entity, +| whether by contract or otherwise, or (b) ownership of more than +| fifty percent (50%) of the outstanding shares or beneficial +| ownership of such entity. +| +| 2. License Grants and Conditions +| -------------------------------- +| +| 2.1. 
Grants +| +| Each Contributor hereby grants You a world-wide, royalty-free, +| non-exclusive license: +| +| (a) under intellectual property rights (other than patent or trademark) +| Licensable by such Contributor to use, reproduce, make available, +| modify, display, perform, distribute, and otherwise exploit its +| Contributions, either on an unmodified basis, with Modifications, or +| as part of a Larger Work; and +| +| (b) under Patent Claims of such Contributor to make, use, sell, offer +| for sale, have made, import, and otherwise transfer either its +| Contributions or its Contributor Version. +| +| 2.2. Effective Date +| +| The licenses granted in Section 2.1 with respect to any Contribution +| become effective for each Contribution on the date the Contributor first +| distributes such Contribution. +| +| 2.3. Limitations on Grant Scope +| +| The licenses granted in this Section 2 are the only rights granted under +| this License. No additional rights or licenses will be implied from the +| distribution or licensing of Covered Software under this License. +| Notwithstanding Section 2.1(b) above, no patent license is granted by a +| Contributor: +| +| (a) for any code that a Contributor has removed from Covered Software; +| or +| +| (b) for infringements caused by: (i) Your and any other third party's +| modifications of Covered Software, or (ii) the combination of its +| Contributions with other software (except as part of its Contributor +| Version); or +| +| (c) under Patent Claims infringed by Covered Software in the absence of +| its Contributions. +| +| This License does not grant any rights in the trademarks, service marks, +| or logos of any Contributor (except as may be necessary to comply with +| the notice requirements in Section 3.4). +| +| 2.4. 
Subsequent Licenses +| +| No Contributor makes additional grants as a result of Your choice to +| distribute the Covered Software under a subsequent version of this +| License (see Section 10.2) or under the terms of a Secondary License (if +| permitted under the terms of Section 3.3). +| +| 2.5. Representation +| +| Each Contributor represents that the Contributor believes its +| Contributions are its original creation(s) or it has sufficient rights +| to grant the rights to its Contributions conveyed by this License. +| +| 2.6. Fair Use +| +| This License is not intended to limit any rights You have under +| applicable copyright doctrines of fair use, fair dealing, or other +| equivalents. +| +| 2.7. Conditions +| +| Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted +| in Section 2.1. +| +| 3. Responsibilities +| ------------------- +| +| 3.1. Distribution of Source Form +| +| All distribution of Covered Software in Source Code Form, including any +| Modifications that You create or to which You contribute, must be under +| the terms of this License. You must inform recipients that the Source +| Code Form of the Covered Software is governed by the terms of this +| License, and how they can obtain a copy of this License. You may not +| attempt to alter or restrict the recipients' rights in the Source Code +| Form. +| +| 3.2. 
Distribution of Executable Form +| +| If You distribute Covered Software in Executable Form then: +| +| (a) such Covered Software must also be made available in Source Code +| Form, as described in Section 3.1, and You must inform recipients of +| the Executable Form how they can obtain a copy of such Source Code +| Form by reasonable means in a timely manner, at a charge no more +| than the cost of distribution to the recipient; and +| +| (b) You may distribute such Executable Form under the terms of this +| License, or sublicense it under different terms, provided that the +| license for the Executable Form does not attempt to limit or alter +| the recipients' rights in the Source Code Form under this License. +| +| 3.3. Distribution of a Larger Work +| +| You may create and distribute a Larger Work under terms of Your choice, +| provided that You also comply with the requirements of this License for +| the Covered Software. If the Larger Work is a combination of Covered +| Software with a work governed by one or more Secondary Licenses, and the +| Covered Software is not Incompatible With Secondary Licenses, this +| License permits You to additionally distribute such Covered Software +| under the terms of such Secondary License(s), so that the recipient of +| the Larger Work may, at their option, further distribute the Covered +| Software under the terms of either this License or such Secondary +| License(s). +| +| 3.4. Notices +| +| You may not remove or alter the substance of any license notices +| (including copyright notices, patent notices, disclaimers of warranty, +| or limitations of liability) contained within the Source Code Form of +| the Covered Software, except that You may alter any license notices to +| the extent required to remedy known factual inaccuracies. +| +| 3.5. 
Application of Additional Terms +| +| You may choose to offer, and to charge a fee for, warranty, support, +| indemnity or liability obligations to one or more recipients of Covered +| Software. However, You may do so only on Your own behalf, and not on +| behalf of any Contributor. You must make it absolutely clear that any +| such warranty, support, indemnity, or liability obligation is offered by +| You alone, and You hereby agree to indemnify every Contributor for any +| liability incurred by such Contributor as a result of warranty, support, +| indemnity or liability terms You offer. You may include additional +| disclaimers of warranty and limitations of liability specific to any +| jurisdiction. +| +| 4. Inability to Comply Due to Statute or Regulation +| --------------------------------------------------- +| +| If it is impossible for You to comply with any of the terms of this +| License with respect to some or all of the Covered Software due to +| statute, judicial order, or regulation then You must: (a) comply with +| the terms of this License to the maximum extent possible; and (b) +| describe the limitations and the code they affect. Such description must +| be placed in a text file included with all distributions of the Covered +| Software under this License. Except to the extent prohibited by statute +| or regulation, such description must be sufficiently detailed for a +| recipient of ordinary skill to be able to understand it. +| +| 5. Termination +| -------------- +| +| 5.1. The rights granted under this License will terminate automatically +| if You fail to comply with any of its terms. 
However, if You become +| compliant, then the rights granted under this License from a particular +| Contributor are reinstated (a) provisionally, unless and until such +| Contributor explicitly and finally terminates Your grants, and (b) on an +| ongoing basis, if such Contributor fails to notify You of the +| non-compliance by some reasonable means prior to 60 days after You have +| come back into compliance. Moreover, Your grants from a particular +| Contributor are reinstated on an ongoing basis if such Contributor +| notifies You of the non-compliance by some reasonable means, this is the +| first time You have received notice of non-compliance with this License +| from such Contributor, and You become compliant prior to 30 days after +| Your receipt of the notice. +| +| 5.2. If You initiate litigation against any entity by asserting a patent +| infringement claim (excluding declaratory judgment actions, +| counter-claims, and cross-claims) alleging that a Contributor Version +| directly or indirectly infringes any patent, then the rights granted to +| You by any and all Contributors for the Covered Software under Section +| 2.1 of this License shall terminate. +| +| 5.3. In the event of termination under Sections 5.1 or 5.2 above, all +| end user license agreements (excluding distributors and resellers) which +| have been validly granted by You or Your distributors under this License +| prior to termination shall survive termination. +| +| ************************************************************************ +| * * +| * 6. Disclaimer of Warranty * +| * ------------------------- * +| * * +| * Covered Software is provided under this License on an "as is" * +| * basis, without warranty of any kind, either expressed, implied, or * +| * statutory, including, without limitation, warranties that the * +| * Covered Software is free of defects, merchantable, fit for a * +| * particular purpose or non-infringing. 
The entire risk as to the * +| * quality and performance of the Covered Software is with You. * +| * Should any Covered Software prove defective in any respect, You * +| * (not any Contributor) assume the cost of any necessary servicing, * +| * repair, or correction. This disclaimer of warranty constitutes an * +| * essential part of this License. No use of any Covered Software is * +| * authorized under this License except under this disclaimer. * +| * * +| ************************************************************************ +| +| ************************************************************************ +| * * +| * 7. Limitation of Liability * +| * -------------------------- * +| * * +| * Under no circumstances and under no legal theory, whether tort * +| * (including negligence), contract, or otherwise, shall any * +| * Contributor, or anyone who distributes Covered Software as * +| * permitted above, be liable to You for any direct, indirect, * +| * special, incidental, or consequential damages of any character * +| * including, without limitation, damages for lost profits, loss of * +| * goodwill, work stoppage, computer failure or malfunction, or any * +| * and all other commercial damages or losses, even if such party * +| * shall have been informed of the possibility of such damages. This * +| * limitation of liability shall not apply to liability for death or * +| * personal injury resulting from such party's negligence to the * +| * extent applicable law prohibits such limitation. Some * +| * jurisdictions do not allow the exclusion or limitation of * +| * incidental or consequential damages, so this exclusion and * +| * limitation may not apply to You. * +| * * +| ************************************************************************ +| +| 8. 
Litigation +| ------------- +| +| Any litigation relating to this License may be brought only in the +| courts of a jurisdiction where the defendant maintains its principal +| place of business and such litigation shall be governed by laws of that +| jurisdiction, without reference to its conflict-of-law provisions. +| Nothing in this Section shall prevent a party's ability to bring +| cross-claims or counter-claims. +| +| 9. Miscellaneous +| ---------------- +| +| This License represents the complete agreement concerning the subject +| matter hereof. If any provision of this License is held to be +| unenforceable, such provision shall be reformed only to the extent +| necessary to make it enforceable. Any law or regulation which provides +| that the language of a contract shall be construed against the drafter +| shall not be used to construe this License against a Contributor. +| +| 10. Versions of the License +| --------------------------- +| +| 10.1. New Versions +| +| Mozilla Foundation is the license steward. Except as provided in Section +| 10.3, no one other than the license steward has the right to modify or +| publish new versions of this License. Each version will be given a +| distinguishing version number. +| +| 10.2. Effect of New Versions +| +| You may distribute the Covered Software under the terms of the version +| of the License under which You originally received the Covered Software, +| or under the terms of any subsequent version published by the license +| steward. +| +| 10.3. Modified Versions +| +| If you create software not governed by this License, and you want to +| create a new license for such software, you may create and use a +| modified version of this License if you rename the license and remove +| any references to the name of the license steward (except to note that +| such modified license differs from this License). +| +| 10.4. 
Distributing Source Code Form that is Incompatible With Secondary +| Licenses +| +| If You choose to distribute Source Code Form that is Incompatible With +| Secondary Licenses under the terms of this version of the License, the +| notice described in Exhibit B of this License must be attached. +| +| Exhibit A - Source Code Form License Notice +| ------------------------------------------- +| +| This Source Code Form is subject to the terms of the Mozilla Public +| License, v. 2.0. If a copy of the MPL was not distributed with this +| file, You can obtain one at http://mozilla.org/MPL/2.0/. +| +| If it is not possible or desirable to put the notice in a particular +| file, then You may include the notice in a location (such as a LICENSE +| file in a relevant directory) where a recipient would be likely to look +| for such a notice. +| +| You may add additional accurate notices of copyright ownership. +| +| Exhibit B - "Incompatible With Secondary Licenses" Notice +| --------------------------------------------------------- +| +| This Source Code Form is "Incompatible With Secondary Licenses", as +| defined by the Mozilla Public License, v. 2.0. + +-------------------------------------------------------------------------------- + This product bundles Google Error Prone Annotations. Copyright: Copyright 2011-2019 The Error Prone Authors @@ -360,6 +829,7 @@ This product bundles checkerframework checker-qual. Copyright: 2004-2019 the Checker Framework developers Project URL: https://github.com/typetools/checker-framework License: MIT license + | The annotations are licensed under the MIT License. (The text of this | license appears below.) 
More specifically, all the parts of the Checker | Framework that you might want to include with your own program use the @@ -414,6 +884,13 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- +This product bundles JCTools (via Netty). + +Project URL: https://github.com/JCTools/JCTools +License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + This product bundles Google FlatBuffers. Copyright: 2013-2020 Google Inc. @@ -427,6 +904,7 @@ This product bundles ThreeTen Extra. Copyright: 2007-present, Stephen Colebourne & Michael Nascimento Santos. Project URL: https://www.threeten.org/threeten-extra/ License: BSD 3-Clause + | All rights reserved. | | * Redistribution and use in source and binary forms, with or without @@ -526,7 +1004,37 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 This product bundles Eclipse Collections. Project URL: https://github.com/eclipse-collections/eclipse-collections -License: Eclipse Public License v. 1.0 - https://www.eclipse.org/legal/epl-v10.html +License: Eclipse Distribution License v. 1.0 - https://www.eclipse.org/org/documents/edl-v10.php + +| Eclipse Distribution License - v 1.0 +| +| Copyright (c) 2007, Eclipse Foundation, Inc. and its licensors. +| +| All rights reserved. +| +| Redistribution and use in source and binary forms, with or without +| modification, are permitted provided that the following conditions are met: +| +| * Redistributions of source code must retain the above copyright notice, +| this list of conditions and the following disclaimer. +| * Redistributions in binary form must reproduce the above copyright notice, +| this list of conditions and the following disclaimer in the documentation +| and/or other materials provided with the distribution. 
+| * Neither the name of the Eclipse Foundation, Inc. nor the names of its +| contributors may be used to endorse or promote products derived from this +| software without specific prior written permission. +| +| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +| SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +| INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +| CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +| ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +| POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- @@ -540,8 +1048,37 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 This product bundles JTS Topology Suite. Project URL: https://github.com/locationtech/jts -License: Eclipse Public License v. 2.0 - https://www.eclipse.org/org/documents/epl-2.0/EPL-2.0.txt +License: Eclipse Distribution License v. 1.0 - https://www.eclipse.org/org/documents/edl-v10.php +| Eclipse Distribution License - v 1.0 +| +| Copyright (c) 2007, Eclipse Foundation, Inc. and its licensors. +| +| All rights reserved. +| +| Redistribution and use in source and binary forms, with or without +| modification, are permitted provided that the following conditions are met: +| +| * Redistributions of source code must retain the above copyright notice, +| this list of conditions and the following disclaimer. 
+| * Redistributions in binary form must reproduce the above copyright notice, +| this list of conditions and the following disclaimer in the documentation +| and/or other materials provided with the distribution. +| * Neither the name of the Eclipse Foundation, Inc. nor the names of its +| contributors may be used to endorse or promote products derived from this +| software without specific prior written permission. +| +| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +| SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +| INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +| CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +| ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +| POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- This product bundles RoaringBitmap. @@ -549,4 +1086,3 @@ This product bundles RoaringBitmap. Copyright: (c) 2013-... 
the RoaringBitmap authors Project URL: https://github.com/RoaringBitmap/RoaringBitmap License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - diff --git a/spark/v4.0/spark-runtime/NOTICE b/spark/v4.0/spark-runtime/NOTICE index 17989b43a371..797765628363 100644 --- a/spark/v4.0/spark-runtime/NOTICE +++ b/spark/v4.0/spark-runtime/NOTICE @@ -356,3 +356,38 @@ This product bundles Eclipse MicroProfile OpenAPI with the following in its NOTI | Arthur De Magalhaes arthurdm@ca.ibm.com | +-------------------------------------------------------------------------------- + +This product bundles Jackson JSON Processor with the following in its NOTICE file: +| # Jackson JSON processor +| +| Jackson is a high-performance, Free/Open Source JSON processing library. +| It was originally written by Tatu Saloranta (tatu.saloranta@iki.fi), and has +| been in development since 2007. +| It is currently developed by a community of developers. +| +| ## Copyright +| +| Copyright 2007-, Tatu Saloranta (tatu.saloranta@iki.fi) +| +| ## Licensing +| +| Jackson 2.x core and extension components are licensed under Apache License 2.0 +| To find the details that apply to this artifact see the accompanying LICENSE file. +| +| ## Credits +| +| A list of contributors may be found from CREDITS(-2.x) file, which is included +| in some artifacts (usually source distributions); but is always available +| from the source code management (SCM) system project uses. +| +| ## FastDoubleParser +| +| jackson-core bundles a shaded copy of FastDoubleParser <https://github.com/wrandelshofer/FastDoubleParser>. +| That code is available under an MIT license +| under the following copyright. +| +| Copyright © 2023 Werner Randelshofer, Switzerland. MIT License. +| +| See FastDoubleParser-NOTICE for details of other source code included in FastDoubleParser +| and the licenses and copyrights that apply to that code.
diff --git a/spark/v4.1/spark-runtime/LICENSE b/spark/v4.1/spark-runtime/LICENSE index 24a9e3706d17..50c91faf8edb 100644 --- a/spark/v4.1/spark-runtime/LICENSE +++ b/spark/v4.1/spark-runtime/LICENSE @@ -219,6 +219,94 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- +This product bundles FastDoubleParser (via Jackson JSON Processor). + +Copyright: 2023 Werner Randelshofer, Switzerland +Project URL: https://github.com/wrandelshofer/FastDoubleParser +License: MIT + +| Copyright (c) 2023 Werner Randelshofer, Switzerland +| +| Permission is hereby granted, free of charge, to any person obtaining a copy +| of this software and associated documentation files (the "Software"), to deal +| in the Software without restriction, including without limitation the rights +| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +| copies of the Software, and to permit persons to whom the Software is +| furnished to do so, subject to the following conditions: +| +| The above copyright notice and this permission notice shall be included in all +| copies or substantial portions of the Software. +| +| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +| SOFTWARE. + +-------------------------------------------------------------------------------- + +This product bundles fast_float (bundled by FastDoubleParser). 
+ +Copyright: 2021 The fast_float authors +Project URL: https://github.com/fastfloat/fast_float +License: MIT + +| Copyright (c) 2021 The fast_float authors +| +| Permission is hereby granted, free of charge, to any person obtaining a copy +| of this software and associated documentation files (the "Software"), to deal +| in the Software without restriction, including without limitation the rights +| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +| copies of the Software, and to permit persons to whom the Software is +| furnished to do so, subject to the following conditions: +| +| The above copyright notice and this permission notice shall be included in all +| copies or substantial portions of the Software. +| +| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +| SOFTWARE. + +-------------------------------------------------------------------------------- + +This product bundles bigint (bundled by FastDoubleParser). + +Copyright: 2022 Tim Buktu +Project URL: https://github.com/tbuktu/bigint +License: BSD 2-Clause + +| Copyright 2022 Tim Buktu +| +| Redistribution and use in source and binary forms, with or without +| modification, are permitted provided that the following conditions +| are met: +| +| 1. Redistributions of source code must retain the above copyright notice, this +| list of conditions and the following disclaimer. +| +| 2. 
Redistributions in binary form must reproduce the above copyright notice, +| this list of conditions and the following disclaimer in the documentation +| and/or other materials provided with the distribution. +| +| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +| ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +| WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +| DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +| FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +-------------------------------------------------------------------------------- + This product bundles Apache Parquet. Copyright: 2014-2024 The Apache Software Foundation @@ -347,6 +435,387 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- +This product bundles the Mozilla Public Suffix List, distributed by Apache HttpComponents. + +Project URL: https://publicsuffix.org/ +License: Mozilla Public License, Version 2.0 - https://mozilla.org/MPL/2.0/ + +| Mozilla Public License Version 2.0 +| ================================== +| +| 1. Definitions +| -------------- +| +| 1.1. "Contributor" +| means each individual or legal entity that creates, contributes to +| the creation of, or owns Covered Software. +| +| 1.2. "Contributor Version" +| means the combination of the Contributions of others (if any) used +| by a Contributor and that particular Contributor's Contribution. 
+| +| 1.3. "Contribution" +| means Covered Software of a particular Contributor. +| +| 1.4. "Covered Software" +| means Source Code Form to which the initial Contributor has attached +| the notice in Exhibit A, the Executable Form of such Source Code +| Form, and Modifications of such Source Code Form, in each case +| including portions thereof. +| +| 1.5. "Incompatible With Secondary Licenses" +| means +| +| (a) that the initial Contributor has attached the notice described +| in Exhibit B to the Covered Software; or +| +| (b) that the Covered Software was made available under the terms of +| version 1.1 or earlier of the License, but not also under the +| terms of a Secondary License. +| +| 1.6. "Executable Form" +| means any form of the work other than Source Code Form. +| +| 1.7. "Larger Work" +| means a work that combines Covered Software with other material, in +| a separate file or files, that is not Covered Software. +| +| 1.8. "License" +| means this document. +| +| 1.9. "Licensable" +| means having the right to grant, to the maximum extent possible, +| whether at the time of the initial grant or subsequently, any and +| all of the rights conveyed by this License. +| +| 1.10. "Modifications" +| means any of the following: +| +| (a) any file in Source Code Form that results from an addition to, +| deletion from, or modification of the contents of Covered +| Software; or +| +| (b) any new file in Source Code Form that contains any Covered +| Software. +| +| 1.11. "Patent Claims" of a Contributor +| means any patent claim(s), including without limitation, method, +| process, and apparatus claims, in any patent Licensable by such +| Contributor that would be infringed, but for the grant of the +| License, by the making, using, selling, offering for sale, having +| made, import, or transfer of either its Contributions or its +| Contributor Version. +| +| 1.12. 
"Secondary License" +| means either the GNU General Public License, Version 2.0, the GNU +| Lesser General Public License, Version 2.1, the GNU Affero General +| Public License, Version 3.0, or any later versions of those +| licenses. +| +| 1.13. "Source Code Form" +| means the form of the work preferred for making modifications. +| +| 1.14. "You" (or "Your") +| means an individual or a legal entity exercising rights under this +| License. For legal entities, "You" includes any entity that +| controls, is controlled by, or is under common control with You. For +| purposes of this definition, "control" means (a) the power, direct +| or indirect, to cause the direction or management of such entity, +| whether by contract or otherwise, or (b) ownership of more than +| fifty percent (50%) of the outstanding shares or beneficial +| ownership of such entity. +| +| 2. License Grants and Conditions +| -------------------------------- +| +| 2.1. Grants +| +| Each Contributor hereby grants You a world-wide, royalty-free, +| non-exclusive license: +| +| (a) under intellectual property rights (other than patent or trademark) +| Licensable by such Contributor to use, reproduce, make available, +| modify, display, perform, distribute, and otherwise exploit its +| Contributions, either on an unmodified basis, with Modifications, or +| as part of a Larger Work; and +| +| (b) under Patent Claims of such Contributor to make, use, sell, offer +| for sale, have made, import, and otherwise transfer either its +| Contributions or its Contributor Version. +| +| 2.2. Effective Date +| +| The licenses granted in Section 2.1 with respect to any Contribution +| become effective for each Contribution on the date the Contributor first +| distributes such Contribution. +| +| 2.3. Limitations on Grant Scope +| +| The licenses granted in this Section 2 are the only rights granted under +| this License. 
No additional rights or licenses will be implied from the +| distribution or licensing of Covered Software under this License. +| Notwithstanding Section 2.1(b) above, no patent license is granted by a +| Contributor: +| +| (a) for any code that a Contributor has removed from Covered Software; +| or +| +| (b) for infringements caused by: (i) Your and any other third party's +| modifications of Covered Software, or (ii) the combination of its +| Contributions with other software (except as part of its Contributor +| Version); or +| +| (c) under Patent Claims infringed by Covered Software in the absence of +| its Contributions. +| +| This License does not grant any rights in the trademarks, service marks, +| or logos of any Contributor (except as may be necessary to comply with +| the notice requirements in Section 3.4). +| +| 2.4. Subsequent Licenses +| +| No Contributor makes additional grants as a result of Your choice to +| distribute the Covered Software under a subsequent version of this +| License (see Section 10.2) or under the terms of a Secondary License (if +| permitted under the terms of Section 3.3). +| +| 2.5. Representation +| +| Each Contributor represents that the Contributor believes its +| Contributions are its original creation(s) or it has sufficient rights +| to grant the rights to its Contributions conveyed by this License. +| +| 2.6. Fair Use +| +| This License is not intended to limit any rights You have under +| applicable copyright doctrines of fair use, fair dealing, or other +| equivalents. +| +| 2.7. Conditions +| +| Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted +| in Section 2.1. +| +| 3. Responsibilities +| ------------------- +| +| 3.1. Distribution of Source Form +| +| All distribution of Covered Software in Source Code Form, including any +| Modifications that You create or to which You contribute, must be under +| the terms of this License. 
You must inform recipients that the Source +| Code Form of the Covered Software is governed by the terms of this +| License, and how they can obtain a copy of this License. You may not +| attempt to alter or restrict the recipients' rights in the Source Code +| Form. +| +| 3.2. Distribution of Executable Form +| +| If You distribute Covered Software in Executable Form then: +| +| (a) such Covered Software must also be made available in Source Code +| Form, as described in Section 3.1, and You must inform recipients of +| the Executable Form how they can obtain a copy of such Source Code +| Form by reasonable means in a timely manner, at a charge no more +| than the cost of distribution to the recipient; and +| +| (b) You may distribute such Executable Form under the terms of this +| License, or sublicense it under different terms, provided that the +| license for the Executable Form does not attempt to limit or alter +| the recipients' rights in the Source Code Form under this License. +| +| 3.3. Distribution of a Larger Work +| +| You may create and distribute a Larger Work under terms of Your choice, +| provided that You also comply with the requirements of this License for +| the Covered Software. If the Larger Work is a combination of Covered +| Software with a work governed by one or more Secondary Licenses, and the +| Covered Software is not Incompatible With Secondary Licenses, this +| License permits You to additionally distribute such Covered Software +| under the terms of such Secondary License(s), so that the recipient of +| the Larger Work may, at their option, further distribute the Covered +| Software under the terms of either this License or such Secondary +| License(s). +| +| 3.4. 
Notices +| +| You may not remove or alter the substance of any license notices +| (including copyright notices, patent notices, disclaimers of warranty, +| or limitations of liability) contained within the Source Code Form of +| the Covered Software, except that You may alter any license notices to +| the extent required to remedy known factual inaccuracies. +| +| 3.5. Application of Additional Terms +| +| You may choose to offer, and to charge a fee for, warranty, support, +| indemnity or liability obligations to one or more recipients of Covered +| Software. However, You may do so only on Your own behalf, and not on +| behalf of any Contributor. You must make it absolutely clear that any +| such warranty, support, indemnity, or liability obligation is offered by +| You alone, and You hereby agree to indemnify every Contributor for any +| liability incurred by such Contributor as a result of warranty, support, +| indemnity or liability terms You offer. You may include additional +| disclaimers of warranty and limitations of liability specific to any +| jurisdiction. +| +| 4. Inability to Comply Due to Statute or Regulation +| --------------------------------------------------- +| +| If it is impossible for You to comply with any of the terms of this +| License with respect to some or all of the Covered Software due to +| statute, judicial order, or regulation then You must: (a) comply with +| the terms of this License to the maximum extent possible; and (b) +| describe the limitations and the code they affect. Such description must +| be placed in a text file included with all distributions of the Covered +| Software under this License. Except to the extent prohibited by statute +| or regulation, such description must be sufficiently detailed for a +| recipient of ordinary skill to be able to understand it. +| +| 5. Termination +| -------------- +| +| 5.1. 
The rights granted under this License will terminate automatically +| if You fail to comply with any of its terms. However, if You become +| compliant, then the rights granted under this License from a particular +| Contributor are reinstated (a) provisionally, unless and until such +| Contributor explicitly and finally terminates Your grants, and (b) on an +| ongoing basis, if such Contributor fails to notify You of the +| non-compliance by some reasonable means prior to 60 days after You have +| come back into compliance. Moreover, Your grants from a particular +| Contributor are reinstated on an ongoing basis if such Contributor +| notifies You of the non-compliance by some reasonable means, this is the +| first time You have received notice of non-compliance with this License +| from such Contributor, and You become compliant prior to 30 days after +| Your receipt of the notice. +| +| 5.2. If You initiate litigation against any entity by asserting a patent +| infringement claim (excluding declaratory judgment actions, +| counter-claims, and cross-claims) alleging that a Contributor Version +| directly or indirectly infringes any patent, then the rights granted to +| You by any and all Contributors for the Covered Software under Section +| 2.1 of this License shall terminate. +| +| 5.3. In the event of termination under Sections 5.1 or 5.2 above, all +| end user license agreements (excluding distributors and resellers) which +| have been validly granted by You or Your distributors under this License +| prior to termination shall survive termination. +| +| ************************************************************************ +| * * +| * 6. 
Disclaimer of Warranty * +| * ------------------------- * +| * * +| * Covered Software is provided under this License on an "as is" * +| * basis, without warranty of any kind, either expressed, implied, or * +| * statutory, including, without limitation, warranties that the * +| * Covered Software is free of defects, merchantable, fit for a * +| * particular purpose or non-infringing. The entire risk as to the * +| * quality and performance of the Covered Software is with You. * +| * Should any Covered Software prove defective in any respect, You * +| * (not any Contributor) assume the cost of any necessary servicing, * +| * repair, or correction. This disclaimer of warranty constitutes an * +| * essential part of this License. No use of any Covered Software is * +| * authorized under this License except under this disclaimer. * +| * * +| ************************************************************************ +| +| ************************************************************************ +| * * +| * 7. Limitation of Liability * +| * -------------------------- * +| * * +| * Under no circumstances and under no legal theory, whether tort * +| * (including negligence), contract, or otherwise, shall any * +| * Contributor, or anyone who distributes Covered Software as * +| * permitted above, be liable to You for any direct, indirect, * +| * special, incidental, or consequential damages of any character * +| * including, without limitation, damages for lost profits, loss of * +| * goodwill, work stoppage, computer failure or malfunction, or any * +| * and all other commercial damages or losses, even if such party * +| * shall have been informed of the possibility of such damages. This * +| * limitation of liability shall not apply to liability for death or * +| * personal injury resulting from such party's negligence to the * +| * extent applicable law prohibits such limitation. 
Some * +| * jurisdictions do not allow the exclusion or limitation of * +| * incidental or consequential damages, so this exclusion and * +| * limitation may not apply to You. * +| * * +| ************************************************************************ +| +| 8. Litigation +| ------------- +| +| Any litigation relating to this License may be brought only in the +| courts of a jurisdiction where the defendant maintains its principal +| place of business and such litigation shall be governed by laws of that +| jurisdiction, without reference to its conflict-of-law provisions. +| Nothing in this Section shall prevent a party's ability to bring +| cross-claims or counter-claims. +| +| 9. Miscellaneous +| ---------------- +| +| This License represents the complete agreement concerning the subject +| matter hereof. If any provision of this License is held to be +| unenforceable, such provision shall be reformed only to the extent +| necessary to make it enforceable. Any law or regulation which provides +| that the language of a contract shall be construed against the drafter +| shall not be used to construe this License against a Contributor. +| +| 10. Versions of the License +| --------------------------- +| +| 10.1. New Versions +| +| Mozilla Foundation is the license steward. Except as provided in Section +| 10.3, no one other than the license steward has the right to modify or +| publish new versions of this License. Each version will be given a +| distinguishing version number. +| +| 10.2. Effect of New Versions +| +| You may distribute the Covered Software under the terms of the version +| of the License under which You originally received the Covered Software, +| or under the terms of any subsequent version published by the license +| steward. +| +| 10.3. 
Modified Versions +| +| If you create software not governed by this License, and you want to +| create a new license for such software, you may create and use a +| modified version of this License if you rename the license and remove +| any references to the name of the license steward (except to note that +| such modified license differs from this License). +| +| 10.4. Distributing Source Code Form that is Incompatible With Secondary +| Licenses +| +| If You choose to distribute Source Code Form that is Incompatible With +| Secondary Licenses under the terms of this version of the License, the +| notice described in Exhibit B of this License must be attached. +| +| Exhibit A - Source Code Form License Notice +| ------------------------------------------- +| +| This Source Code Form is subject to the terms of the Mozilla Public +| License, v. 2.0. If a copy of the MPL was not distributed with this +| file, You can obtain one at http://mozilla.org/MPL/2.0/. +| +| If it is not possible or desirable to put the notice in a particular +| file, then You may include the notice in a location (such as a LICENSE +| file in a relevant directory) where a recipient would be likely to look +| for such a notice. +| +| You may add additional accurate notices of copyright ownership. +| +| Exhibit B - "Incompatible With Secondary Licenses" Notice +| --------------------------------------------------------- +| +| This Source Code Form is "Incompatible With Secondary Licenses", as +| defined by the Mozilla Public License, v. 2.0. + +-------------------------------------------------------------------------------- + This product bundles Google Error Prone Annotations. Copyright: Copyright 2011-2019 The Error Prone Authors @@ -360,6 +829,7 @@ This product bundles checkerframework checker-qual. Copyright: 2004-2019 the Checker Framework developers Project URL: https://github.com/typetools/checker-framework License: MIT license + | The annotations are licensed under the MIT License. 
(The text of this | license appears below.) More specifically, all the parts of the Checker | Framework that you might want to include with your own program use the @@ -414,6 +884,13 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- +This product bundles JCTools (via Netty). + +Project URL: https://github.com/JCTools/JCTools +License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + This product bundles Google FlatBuffers. Copyright: 2013-2020 Google Inc. @@ -427,6 +904,7 @@ This product bundles ThreeTen Extra. Copyright: 2007-present, Stephen Colebourne & Michael Nascimento Santos. Project URL: https://www.threeten.org/threeten-extra/ License: BSD 3-Clause + | All rights reserved. | | * Redistribution and use in source and binary forms, with or without @@ -526,7 +1004,37 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 This product bundles Eclipse Collections. Project URL: https://github.com/eclipse-collections/eclipse-collections -License: Eclipse Public License v. 1.0 - https://www.eclipse.org/legal/epl-v10.html +License: Eclipse Distribution License v. 1.0 - https://www.eclipse.org/org/documents/edl-v10.php + +| Eclipse Distribution License - v 1.0 +| +| Copyright (c) 2007, Eclipse Foundation, Inc. and its licensors. +| +| All rights reserved. +| +| Redistribution and use in source and binary forms, with or without +| modification, are permitted provided that the following conditions are met: +| +| * Redistributions of source code must retain the above copyright notice, +| this list of conditions and the following disclaimer. 
+| * Redistributions in binary form must reproduce the above copyright notice, +| this list of conditions and the following disclaimer in the documentation +| and/or other materials provided with the distribution. +| * Neither the name of the Eclipse Foundation, Inc. nor the names of its +| contributors may be used to endorse or promote products derived from this +| software without specific prior written permission. +| +| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +| SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +| INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +| CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +| ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +| POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- @@ -540,8 +1048,37 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 This product bundles JTS Topology Suite. Project URL: https://github.com/locationtech/jts -License: Eclipse Public License v. 2.0 - https://www.eclipse.org/org/documents/epl-2.0/EPL-2.0.txt +License: Eclipse Distribution License v. 1.0 - https://www.eclipse.org/org/documents/edl-v10.php +| Eclipse Distribution License - v 1.0 +| +| Copyright (c) 2007, Eclipse Foundation, Inc. and its licensors. +| +| All rights reserved. 
+| +| Redistribution and use in source and binary forms, with or without +| modification, are permitted provided that the following conditions are met: +| +| * Redistributions of source code must retain the above copyright notice, +| this list of conditions and the following disclaimer. +| * Redistributions in binary form must reproduce the above copyright notice, +| this list of conditions and the following disclaimer in the documentation +| and/or other materials provided with the distribution. +| * Neither the name of the Eclipse Foundation, Inc. nor the names of its +| contributors may be used to endorse or promote products derived from this +| software without specific prior written permission. +| +| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +| SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +| INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +| CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +| ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +| POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- This product bundles RoaringBitmap. @@ -549,4 +1086,3 @@ This product bundles RoaringBitmap. Copyright: (c) 2013-... 
the RoaringBitmap authors Project URL: https://github.com/RoaringBitmap/RoaringBitmap License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0 - diff --git a/spark/v4.1/spark-runtime/NOTICE b/spark/v4.1/spark-runtime/NOTICE index 17989b43a371..551ef59f2010 100644 --- a/spark/v4.1/spark-runtime/NOTICE +++ b/spark/v4.1/spark-runtime/NOTICE @@ -317,6 +317,42 @@ This product bundles Netty with the following in its NOTICE file: -------------------------------------------------------------------------------- +This product bundles Jackson JSON Processor with the following in its NOTICE file: +| # Jackson JSON processor +| +| Jackson is a high-performance, Free/Open Source JSON processing library. +| It was originally written by Tatu Saloranta (tatu.saloranta@iki.fi), and has +| been in development since 2007. +| It is currently developed by a community of developers. +| +| ## Copyright +| +| Copyright 2007-, Tatu Saloranta (tatu.saloranta@iki.fi) +| +| ## Licensing +| +| Jackson 2.x core and extension components are licensed under Apache License 2.0 +| To find the details that apply to this artifact see the accompanying LICENSE file. +| +| ## Credits +| +| A list of contributors may be found from CREDITS(-2.x) file, which is included +| in some artifacts (usually source distributions); but is always available +| from the source code management (SCM) system project uses. +| +| ## FastDoubleParser +| +| jackson-core bundles a shaded copy of FastDoubleParser <https://github.com/wrandelshofer/FastDoubleParser>. +| That code is available under an MIT license <https://github.com/wrandelshofer/FastDoubleParser/blob/main/LICENSE> +| under the following copyright. +| +| Copyright © 2023 Werner Randelshofer, Switzerland. MIT License. +| +| See FastDoubleParser-NOTICE for details of other source code included in FastDoubleParser +| and the licenses and copyrights that apply to that code.
+ +-------------------------------------------------------------------------------- + This product bundles Project Nessie with the following in its NOTICE file: | Nessie | Copyright 2015-2025 Dremio Corporation @@ -355,4 +391,3 @@ This product bundles Eclipse MicroProfile OpenAPI with the following in its NOTI | PackageCopyrightText: | Arthur De Magalhaes arthurdm@ca.ibm.com | - From 0f657edf12dc29f8487a679bfdd4210e9588d014 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Wed, 6 May 2026 17:13:50 -0400 Subject: [PATCH 173/197] Flink: Fix LICENSE/NOTICE compliance for all versions of flink-runtime (1.20, 2.0, 2.1) (#16216) --- flink/v1.20/flink-runtime/LICENSE | 511 +++++++++++++++++++++++++++++- flink/v2.0/flink-runtime/LICENSE | 511 +++++++++++++++++++++++++++++- flink/v2.1/flink-runtime/LICENSE | 511 +++++++++++++++++++++++++++++- 3 files changed, 1524 insertions(+), 9 deletions(-) diff --git a/flink/v1.20/flink-runtime/LICENSE b/flink/v1.20/flink-runtime/LICENSE index 364652a5aca2..d73eda0104b9 100644 --- a/flink/v1.20/flink-runtime/LICENSE +++ b/flink/v1.20/flink-runtime/LICENSE @@ -219,6 +219,94 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- +This product bundles FastDoubleParser (via Jackson JSON Processor). 
+ +Copyright: 2023 Werner Randelshofer, Switzerland +Project URL: https://github.com/wrandelshofer/FastDoubleParser +License: MIT + +| Copyright (c) 2023 Werner Randelshofer, Switzerland +| +| Permission is hereby granted, free of charge, to any person obtaining a copy +| of this software and associated documentation files (the "Software"), to deal +| in the Software without restriction, including without limitation the rights +| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +| copies of the Software, and to permit persons to whom the Software is +| furnished to do so, subject to the following conditions: +| +| The above copyright notice and this permission notice shall be included in all +| copies or substantial portions of the Software. +| +| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +| SOFTWARE. + +-------------------------------------------------------------------------------- + +This product bundles fast_float (bundled by FastDoubleParser). 
+ +Copyright: 2021 The fast_float authors +Project URL: https://github.com/fastfloat/fast_float +License: MIT + +| Copyright (c) 2021 The fast_float authors +| +| Permission is hereby granted, free of charge, to any person obtaining a copy +| of this software and associated documentation files (the "Software"), to deal +| in the Software without restriction, including without limitation the rights +| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +| copies of the Software, and to permit persons to whom the Software is +| furnished to do so, subject to the following conditions: +| +| The above copyright notice and this permission notice shall be included in all +| copies or substantial portions of the Software. +| +| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +| SOFTWARE. + +-------------------------------------------------------------------------------- + +This product bundles bigint (bundled by FastDoubleParser). + +Copyright: 2022 Tim Buktu +Project URL: https://github.com/tbuktu/bigint +License: BSD 2-Clause + +| Copyright 2022 Tim Buktu +| +| Redistribution and use in source and binary forms, with or without +| modification, are permitted provided that the following conditions +| are met: +| +| 1. Redistributions of source code must retain the above copyright notice, this +| list of conditions and the following disclaimer. +| +| 2. 
Redistributions in binary form must reproduce the above copyright notice, +| this list of conditions and the following disclaimer in the documentation +| and/or other materials provided with the distribution. +| +| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +| ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +| WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +| DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +| FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +-------------------------------------------------------------------------------- + This product bundles Apache Parquet. Copyright: 2014-2020 The Apache Software Foundation. @@ -243,7 +331,7 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Fastutil (bundled by Parquet). +This product bundles fastutil (bundled by Parquet). Copyright: 2002-2014 Sebastiano Vigna Project URL: http://fastutil.di.unimi.it/ @@ -351,6 +439,7 @@ This product bundles checkerframework checker-qual. Copyright: 2004-2020 the Checker Framework developers Project URL: https://github.com/typetools/checker-framework License: MIT + | The annotations are licensed under the MIT License. (The text of this | license appears below.) 
More specifically, all the parts of the Checker | Framework that you might want to include with your own program use the @@ -394,6 +483,8 @@ This product bundles ThreeTen Extra. Copyright: 2007-present, Stephen Colebourne & Michael Nascimento Santos. Project URL: https://www.threeten.org/threeten-extra/ License: BSD 3-Clause + +| Copyright (c) 2007-present, Stephen Colebourne & Michael Nascimento Santos. | All rights reserved. | | * Redistribution and use in source and binary forms, with or without @@ -468,7 +559,7 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Eclipse Microprofile OpenAPI. +This product bundles Eclipse MicroProfile OpenAPI. Copyright: Copyright (c) 2017 Contributors to the Eclipse Foundation Project URL: https://github.com/microprofile/microprofile-open-api @@ -481,6 +572,7 @@ This product bundles Luben Zstd. Copyright: Copyright (c) 2015-present, Luben Karavelov/ All rights reserved. Project URL: https://github.com/luben/zstd-jni/ License: BSD 2-Clause + | Zstd-jni: JNI bindings to Zstd Library | | Copyright (c) 2015-present, Luben Karavelov/ All rights reserved. @@ -513,7 +605,420 @@ License: BSD 2-Clause This product bundles JTS Topology Suite. Project URL: https://github.com/locationtech/jts -License: Eclipse Public License v. 2.0 - https://www.eclipse.org/org/documents/epl-2.0/EPL-2.0.txt +License: Eclipse Distribution License v. 1.0 - https://www.eclipse.org/org/documents/edl-v10.php + +| Eclipse Distribution License - v 1.0 +| +| Copyright (c) 2007, Eclipse Foundation, Inc. and its licensors. +| +| All rights reserved. +| +| Redistribution and use in source and binary forms, with or without +| modification, are permitted provided that the following conditions are met: +| +| - Redistributions of source code must retain the above copyright notice, +| this list of conditions and the following disclaimer. 
+| +| - Redistributions in binary form must reproduce the above copyright notice, +| this list of conditions and the following disclaimer in the documentation +| and/or other materials provided with the distribution. +| +| - Neither the name of the Eclipse Foundation, Inc. nor the names of its +| contributors may be used to endorse or promote products derived from this +| software without specific prior written permission. +| +| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +| SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +| INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +| CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +| ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +| POSSIBILITY OF SUCH DAMAGE. + +-------------------------------------------------------------------------------- + +This product bundles the Mozilla Public Suffix List, distributed by Apache HttpComponents. + +Project URL: https://publicsuffix.org/ +License: Mozilla Public License, Version 2.0 - https://mozilla.org/MPL/2.0/ + +| Mozilla Public License Version 2.0 +| ================================== +| +| 1. Definitions +| -------------- +| +| 1.1. "Contributor" +| means each individual or legal entity that creates, contributes to +| the creation of, or owns Covered Software. +| +| 1.2. "Contributor Version" +| means the combination of the Contributions of others (if any) used +| by a Contributor and that particular Contributor's Contribution. +| +| 1.3. 
"Contribution" +| means Covered Software of a particular Contributor. +| +| 1.4. "Covered Software" +| means Source Code Form to which the initial Contributor has attached +| the notice in Exhibit A, the Executable Form of such Source Code +| Form, and Modifications of such Source Code Form, in each case +| including portions thereof. +| +| 1.5. "Incompatible With Secondary Licenses" +| means +| +| (a) that the initial Contributor has attached the notice described +| in Exhibit B to the Covered Software; or +| +| (b) that the Covered Software was made available under the terms of +| version 1.1 or earlier of the License, but not also under the +| terms of a Secondary License. +| +| 1.6. "Executable Form" +| means any form of the work other than Source Code Form. +| +| 1.7. "Larger Work" +| means a work that combines Covered Software with other material, in +| a separate file or files, that is not Covered Software. +| +| 1.8. "License" +| means this document. +| +| 1.9. "Licensable" +| means having the right to grant, to the maximum extent possible, +| whether at the time of the initial grant or subsequently, any and +| all of the rights conveyed by this License. +| +| 1.10. "Modifications" +| means any of the following: +| +| (a) any file in Source Code Form that results from an addition to, +| deletion from, or modification of the contents of Covered +| Software; or +| +| (b) any new file in Source Code Form that contains any Covered +| Software. +| +| 1.11. "Patent Claims" of a Contributor +| means any patent claim(s), including without limitation, method, +| process, and apparatus claims, in any patent Licensable by such +| Contributor that would be infringed, but for the grant of the +| License, by the making, using, selling, offering for sale, having +| made, import, or transfer of either its Contributions or its +| Contributor Version. +| +| 1.12. 
"Secondary License" +| means either the GNU General Public License, Version 2.0, the GNU +| Lesser General Public License, Version 2.1, the GNU Affero General +| Public License, Version 3.0, or any later versions of those +| licenses. +| +| 1.13. "Source Code Form" +| means the form of the work preferred for making modifications. +| +| 1.14. "You" (or "Your") +| means an individual or a legal entity exercising rights under this +| License. For legal entities, "You" includes any entity that +| controls, is controlled by, or is under common control with You. For +| purposes of this definition, "control" means (a) the power, direct +| or indirect, to cause the direction or management of such entity, +| whether by contract or otherwise, or (b) ownership of more than +| fifty percent (50%) of the outstanding shares or beneficial +| ownership of such entity. +| +| 2. License Grants and Conditions +| -------------------------------- +| +| 2.1. Grants +| +| Each Contributor hereby grants You a world-wide, royalty-free, +| non-exclusive license: +| +| (a) under intellectual property rights (other than patent or trademark) +| Licensable by such Contributor to use, reproduce, make available, +| modify, display, perform, distribute, and otherwise exploit its +| Contributions, either on an unmodified basis, with Modifications, or +| as part of a Larger Work; and +| +| (b) under Patent Claims of such Contributor to make, use, sell, offer +| for sale, have made, import, and otherwise transfer either its +| Contributions or its Contributor Version. +| +| 2.2. Effective Date +| +| The licenses granted in Section 2.1 with respect to any Contribution +| become effective for each Contribution on the date the Contributor first +| distributes such Contribution. +| +| 2.3. Limitations on Grant Scope +| +| The licenses granted in this Section 2 are the only rights granted under +| this License. 
No additional rights or licenses will be implied from the +| distribution or licensing of Covered Software under this License. +| Notwithstanding Section 2.1(b) above, no patent license is granted by a +| Contributor: +| +| (a) for any code that a Contributor has removed from Covered Software; +| or +| +| (b) for infringements caused by: (i) Your and any other third party's +| modifications of Covered Software, or (ii) the combination of its +| Contributions with other software (except as part of its Contributor +| Version); or +| +| (c) under Patent Claims infringed by Covered Software in the absence of +| its Contributions. +| +| This License does not grant any rights in the trademarks, service marks, +| or logos of any Contributor (except as may be necessary to comply with +| the notice requirements in Section 3.4). +| +| 2.4. Subsequent Licenses +| +| No Contributor makes additional grants as a result of Your choice to +| distribute the Covered Software under a subsequent version of this +| License (see Section 10.2) or under the terms of a Secondary License (if +| permitted under the terms of Section 3.3). +| +| 2.5. Representation +| +| Each Contributor represents that the Contributor believes its +| Contributions are its original creation(s) or it has sufficient rights +| to grant the rights to its Contributions conveyed by this License. +| +| 2.6. Fair Use +| +| This License is not intended to limit any rights You have under +| applicable copyright doctrines of fair use, fair dealing, or other +| equivalents. +| +| 2.7. Conditions +| +| Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted +| in Section 2.1. +| +| 3. Responsibilities +| ------------------- +| +| 3.1. Distribution of Source Form +| +| All distribution of Covered Software in Source Code Form, including any +| Modifications that You create or to which You contribute, must be under +| the terms of this License. 
You must inform recipients that the Source +| Code Form of the Covered Software is governed by the terms of this +| License, and how they can obtain a copy of this License. You may not +| attempt to alter or restrict the recipients' rights in the Source Code +| Form. +| +| 3.2. Distribution of Executable Form +| +| If You distribute Covered Software in Executable Form then: +| +| (a) such Covered Software must also be made available in Source Code +| Form, as described in Section 3.1, and You must inform recipients of +| the Executable Form how they can obtain a copy of such Source Code +| Form by reasonable means in a timely manner, at a charge no more +| than the cost of distribution to the recipient; and +| +| (b) You may distribute such Executable Form under the terms of this +| License, or sublicense it under different terms, provided that the +| license for the Executable Form does not attempt to limit or alter +| the recipients' rights in the Source Code Form under this License. +| +| 3.3. Distribution of a Larger Work +| +| You may create and distribute a Larger Work under terms of Your choice, +| provided that You also comply with the requirements of this License for +| the Covered Software. If the Larger Work is a combination of Covered +| Software with a work governed by one or more Secondary Licenses, and the +| Covered Software is not Incompatible With Secondary Licenses, this +| License permits You to additionally distribute such Covered Software +| under the terms of such Secondary License(s), so that the recipient of +| the Larger Work may, at their option, further distribute the Covered +| Software under the terms of either this License or such Secondary +| License(s). +| +| 3.4. 
Notices +| +| You may not remove or alter the substance of any license notices +| (including copyright notices, patent notices, disclaimers of warranty, +| or limitations of liability) contained within the Source Code Form of +| the Covered Software, except that You may alter any license notices to +| the extent required to remedy known factual inaccuracies. +| +| 3.5. Application of Additional Terms +| +| You may choose to offer, and to charge a fee for, warranty, support, +| indemnity or liability obligations to one or more recipients of Covered +| Software. However, You may do so only on Your own behalf, and not on +| behalf of any Contributor. You must make it absolutely clear that any +| such warranty, support, indemnity, or liability obligation is offered by +| You alone, and You hereby agree to indemnify every Contributor for any +| liability incurred by such Contributor as a result of warranty, support, +| indemnity or liability terms You offer. You may include additional +| disclaimers of warranty and limitations of liability specific to any +| jurisdiction. +| +| 4. Inability to Comply Due to Statute or Regulation +| --------------------------------------------------- +| +| If it is impossible for You to comply with any of the terms of this +| License with respect to some or all of the Covered Software due to +| statute, judicial order, or regulation then You must: (a) comply with +| the terms of this License to the maximum extent possible; and (b) +| describe the limitations and the code they affect. Such description must +| be placed in a text file included with all distributions of the Covered +| Software under this License. Except to the extent prohibited by statute +| or regulation, such description must be sufficiently detailed for a +| recipient of ordinary skill to be able to understand it. +| +| 5. Termination +| -------------- +| +| 5.1. 
The rights granted under this License will terminate automatically +| if You fail to comply with any of its terms. However, if You become +| compliant, then the rights granted under this License from a particular +| Contributor are reinstated (a) provisionally, unless and until such +| Contributor explicitly and finally terminates Your grants, and (b) on an +| ongoing basis, if such Contributor fails to notify You of the +| non-compliance by some reasonable means prior to 60 days after You have +| come back into compliance. Moreover, Your grants from a particular +| Contributor are reinstated on an ongoing basis if such Contributor +| notifies You of the non-compliance by some reasonable means, this is the +| first time You have received notice of non-compliance with this License +| from such Contributor, and You become compliant prior to 30 days after +| Your receipt of the notice. +| +| 5.2. If You initiate litigation against any entity by asserting a patent +| infringement claim (excluding declaratory judgment actions, +| counter-claims, and cross-claims) alleging that a Contributor Version +| directly or indirectly infringes any patent, then the rights granted to +| You by any and all Contributors for the Covered Software under Section +| 2.1 of this License shall terminate. +| +| 5.3. In the event of termination under Sections 5.1 or 5.2 above, all +| end user license agreements (excluding distributors and resellers) which +| have been validly granted by You or Your distributors under this License +| prior to termination shall survive termination. +| +| ************************************************************************ +| * * +| * 6. 
Disclaimer of Warranty * +| * ------------------------- * +| * * +| * Covered Software is provided under this License on an "as is" * +| * basis, without warranty of any kind, either expressed, implied, or * +| * statutory, including, without limitation, warranties that the * +| * Covered Software is free of defects, merchantable, fit for a * +| * particular purpose or non-infringing. The entire risk as to the * +| * quality and performance of the Covered Software is with You. * +| * Should any Covered Software prove defective in any respect, You * +| * (not any Contributor) assume the cost of any necessary servicing, * +| * repair, or correction. This disclaimer of warranty constitutes an * +| * essential part of this License. No use of any Covered Software is * +| * authorized under this License except under this disclaimer. * +| * * +| ************************************************************************ +| +| ************************************************************************ +| * * +| * 7. Limitation of Liability * +| * -------------------------- * +| * * +| * Under no circumstances and under no legal theory, whether tort * +| * (including negligence), contract, or otherwise, shall any * +| * Contributor, or anyone who distributes Covered Software as * +| * permitted above, be liable to You for any direct, indirect, * +| * special, incidental, or consequential damages of any character * +| * including, without limitation, damages for lost profits, loss of * +| * goodwill, work stoppage, computer failure or malfunction, or any * +| * and all other commercial damages or losses, even if such party * +| * shall have been informed of the possibility of such damages. This * +| * limitation of liability shall not apply to liability for death or * +| * personal injury resulting from such party's negligence to the * +| * extent applicable law prohibits such limitation. 
Some * +| * jurisdictions do not allow the exclusion or limitation of * +| * incidental or consequential damages, so this exclusion and * +| * limitation may not apply to You. * +| * * +| ************************************************************************ +| +| 8. Litigation +| ------------- +| +| Any litigation relating to this License may be brought only in the +| courts of a jurisdiction where the defendant maintains its principal +| place of business and such litigation shall be governed by laws of that +| jurisdiction, without reference to its conflict-of-law provisions. +| Nothing in this Section shall prevent a party's ability to bring +| cross-claims or counter-claims. +| +| 9. Miscellaneous +| ---------------- +| +| This License represents the complete agreement concerning the subject +| matter hereof. If any provision of this License is held to be +| unenforceable, such provision shall be reformed only to the extent +| necessary to make it enforceable. Any law or regulation which provides +| that the language of a contract shall be construed against the drafter +| shall not be used to construe this License against a Contributor. +| +| 10. Versions of the License +| --------------------------- +| +| 10.1. New Versions +| +| Mozilla Foundation is the license steward. Except as provided in Section +| 10.3, no one other than the license steward has the right to modify or +| publish new versions of this License. Each version will be given a +| distinguishing version number. +| +| 10.2. Effect of New Versions +| +| You may distribute the Covered Software under the terms of the version +| of the License under which You originally received the Covered Software, +| or under the terms of any subsequent version published by the license +| steward. +| +| 10.3. 
Modified Versions +| +| If you create software not governed by this License, and you want to +| create a new license for such software, you may create and use a +| modified version of this License if you rename the license and remove +| any references to the name of the license steward (except to note that +| such modified license differs from this License). +| +| 10.4. Distributing Source Code Form that is Incompatible With Secondary +| Licenses +| +| If You choose to distribute Source Code Form that is Incompatible With +| Secondary Licenses under the terms of this version of the License, the +| notice described in Exhibit B of this License must be attached. +| +| Exhibit A - Source Code Form License Notice +| ------------------------------------------- +| +| This Source Code Form is subject to the terms of the Mozilla Public +| License, v. 2.0. If a copy of the MPL was not distributed with this +| file, You can obtain one at http://mozilla.org/MPL/2.0/. +| +| If it is not possible or desirable to put the notice in a particular +| file, then You may include the notice in a location (such as a LICENSE +| file in a relevant directory) where a recipient would be likely to look +| for such a notice. +| +| You may add additional accurate notices of copyright ownership. +| +| Exhibit B - "Incompatible With Secondary Licenses" Notice +| --------------------------------------------------------- +| +| This Source Code Form is "Incompatible With Secondary Licenses", as +| defined by the Mozilla Public License, v. 2.0. 
-------------------------------------------------------------------------------- diff --git a/flink/v2.0/flink-runtime/LICENSE b/flink/v2.0/flink-runtime/LICENSE index 364652a5aca2..d73eda0104b9 100644 --- a/flink/v2.0/flink-runtime/LICENSE +++ b/flink/v2.0/flink-runtime/LICENSE @@ -219,6 +219,94 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- +This product bundles FastDoubleParser (via Jackson JSON Processor). + +Copyright: 2023 Werner Randelshofer, Switzerland +Project URL: https://github.com/wrandelshofer/FastDoubleParser +License: MIT + +| Copyright (c) 2023 Werner Randelshofer, Switzerland +| +| Permission is hereby granted, free of charge, to any person obtaining a copy +| of this software and associated documentation files (the "Software"), to deal +| in the Software without restriction, including without limitation the rights +| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +| copies of the Software, and to permit persons to whom the Software is +| furnished to do so, subject to the following conditions: +| +| The above copyright notice and this permission notice shall be included in all +| copies or substantial portions of the Software. +| +| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +| SOFTWARE. + +-------------------------------------------------------------------------------- + +This product bundles fast_float (bundled by FastDoubleParser). 
+ +Copyright: 2021 The fast_float authors +Project URL: https://github.com/fastfloat/fast_float +License: MIT + +| Copyright (c) 2021 The fast_float authors +| +| Permission is hereby granted, free of charge, to any person obtaining a copy +| of this software and associated documentation files (the "Software"), to deal +| in the Software without restriction, including without limitation the rights +| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +| copies of the Software, and to permit persons to whom the Software is +| furnished to do so, subject to the following conditions: +| +| The above copyright notice and this permission notice shall be included in all +| copies or substantial portions of the Software. +| +| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +| SOFTWARE. + +-------------------------------------------------------------------------------- + +This product bundles bigint (bundled by FastDoubleParser). + +Copyright: 2022 Tim Buktu +Project URL: https://github.com/tbuktu/bigint +License: BSD 2-Clause + +| Copyright 2022 Tim Buktu +| +| Redistribution and use in source and binary forms, with or without +| modification, are permitted provided that the following conditions +| are met: +| +| 1. Redistributions of source code must retain the above copyright notice, this +| list of conditions and the following disclaimer. +| +| 2. 
Redistributions in binary form must reproduce the above copyright notice, +| this list of conditions and the following disclaimer in the documentation +| and/or other materials provided with the distribution. +| +| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +| ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +| WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +| DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +| FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +-------------------------------------------------------------------------------- + This product bundles Apache Parquet. Copyright: 2014-2020 The Apache Software Foundation. @@ -243,7 +331,7 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Fastutil (bundled by Parquet). +This product bundles fastutil (bundled by Parquet). Copyright: 2002-2014 Sebastiano Vigna Project URL: http://fastutil.di.unimi.it/ @@ -351,6 +439,7 @@ This product bundles checkerframework checker-qual. Copyright: 2004-2020 the Checker Framework developers Project URL: https://github.com/typetools/checker-framework License: MIT + | The annotations are licensed under the MIT License. (The text of this | license appears below.) 
More specifically, all the parts of the Checker | Framework that you might want to include with your own program use the @@ -394,6 +483,8 @@ This product bundles ThreeTen Extra. Copyright: 2007-present, Stephen Colebourne & Michael Nascimento Santos. Project URL: https://www.threeten.org/threeten-extra/ License: BSD 3-Clause + +| Copyright (c) 2007-present, Stephen Colebourne & Michael Nascimento Santos. | All rights reserved. | | * Redistribution and use in source and binary forms, with or without @@ -468,7 +559,7 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Eclipse Microprofile OpenAPI. +This product bundles Eclipse MicroProfile OpenAPI. Copyright: Copyright (c) 2017 Contributors to the Eclipse Foundation Project URL: https://github.com/microprofile/microprofile-open-api @@ -481,6 +572,7 @@ This product bundles Luben Zstd. Copyright: Copyright (c) 2015-present, Luben Karavelov/ All rights reserved. Project URL: https://github.com/luben/zstd-jni/ License: BSD 2-Clause + | Zstd-jni: JNI bindings to Zstd Library | | Copyright (c) 2015-present, Luben Karavelov/ All rights reserved. @@ -513,7 +605,420 @@ License: BSD 2-Clause This product bundles JTS Topology Suite. Project URL: https://github.com/locationtech/jts -License: Eclipse Public License v. 2.0 - https://www.eclipse.org/org/documents/epl-2.0/EPL-2.0.txt +License: Eclipse Distribution License v. 1.0 - https://www.eclipse.org/org/documents/edl-v10.php + +| Eclipse Distribution License - v 1.0 +| +| Copyright (c) 2007, Eclipse Foundation, Inc. and its licensors. +| +| All rights reserved. +| +| Redistribution and use in source and binary forms, with or without +| modification, are permitted provided that the following conditions are met: +| +| - Redistributions of source code must retain the above copyright notice, +| this list of conditions and the following disclaimer. 
+| +| - Redistributions in binary form must reproduce the above copyright notice, +| this list of conditions and the following disclaimer in the documentation +| and/or other materials provided with the distribution. +| +| - Neither the name of the Eclipse Foundation, Inc. nor the names of its +| contributors may be used to endorse or promote products derived from this +| software without specific prior written permission. +| +| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +| SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +| INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +| CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +| ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +| POSSIBILITY OF SUCH DAMAGE. + +-------------------------------------------------------------------------------- + +This product bundles the Mozilla Public Suffix List, distributed by Apache HttpComponents. + +Project URL: https://publicsuffix.org/ +License: Mozilla Public License, Version 2.0 - https://mozilla.org/MPL/2.0/ + +| Mozilla Public License Version 2.0 +| ================================== +| +| 1. Definitions +| -------------- +| +| 1.1. "Contributor" +| means each individual or legal entity that creates, contributes to +| the creation of, or owns Covered Software. +| +| 1.2. "Contributor Version" +| means the combination of the Contributions of others (if any) used +| by a Contributor and that particular Contributor's Contribution. +| +| 1.3. 
"Contribution" +| means Covered Software of a particular Contributor. +| +| 1.4. "Covered Software" +| means Source Code Form to which the initial Contributor has attached +| the notice in Exhibit A, the Executable Form of such Source Code +| Form, and Modifications of such Source Code Form, in each case +| including portions thereof. +| +| 1.5. "Incompatible With Secondary Licenses" +| means +| +| (a) that the initial Contributor has attached the notice described +| in Exhibit B to the Covered Software; or +| +| (b) that the Covered Software was made available under the terms of +| version 1.1 or earlier of the License, but not also under the +| terms of a Secondary License. +| +| 1.6. "Executable Form" +| means any form of the work other than Source Code Form. +| +| 1.7. "Larger Work" +| means a work that combines Covered Software with other material, in +| a separate file or files, that is not Covered Software. +| +| 1.8. "License" +| means this document. +| +| 1.9. "Licensable" +| means having the right to grant, to the maximum extent possible, +| whether at the time of the initial grant or subsequently, any and +| all of the rights conveyed by this License. +| +| 1.10. "Modifications" +| means any of the following: +| +| (a) any file in Source Code Form that results from an addition to, +| deletion from, or modification of the contents of Covered +| Software; or +| +| (b) any new file in Source Code Form that contains any Covered +| Software. +| +| 1.11. "Patent Claims" of a Contributor +| means any patent claim(s), including without limitation, method, +| process, and apparatus claims, in any patent Licensable by such +| Contributor that would be infringed, but for the grant of the +| License, by the making, using, selling, offering for sale, having +| made, import, or transfer of either its Contributions or its +| Contributor Version. +| +| 1.12. 
"Secondary License" +| means either the GNU General Public License, Version 2.0, the GNU +| Lesser General Public License, Version 2.1, the GNU Affero General +| Public License, Version 3.0, or any later versions of those +| licenses. +| +| 1.13. "Source Code Form" +| means the form of the work preferred for making modifications. +| +| 1.14. "You" (or "Your") +| means an individual or a legal entity exercising rights under this +| License. For legal entities, "You" includes any entity that +| controls, is controlled by, or is under common control with You. For +| purposes of this definition, "control" means (a) the power, direct +| or indirect, to cause the direction or management of such entity, +| whether by contract or otherwise, or (b) ownership of more than +| fifty percent (50%) of the outstanding shares or beneficial +| ownership of such entity. +| +| 2. License Grants and Conditions +| -------------------------------- +| +| 2.1. Grants +| +| Each Contributor hereby grants You a world-wide, royalty-free, +| non-exclusive license: +| +| (a) under intellectual property rights (other than patent or trademark) +| Licensable by such Contributor to use, reproduce, make available, +| modify, display, perform, distribute, and otherwise exploit its +| Contributions, either on an unmodified basis, with Modifications, or +| as part of a Larger Work; and +| +| (b) under Patent Claims of such Contributor to make, use, sell, offer +| for sale, have made, import, and otherwise transfer either its +| Contributions or its Contributor Version. +| +| 2.2. Effective Date +| +| The licenses granted in Section 2.1 with respect to any Contribution +| become effective for each Contribution on the date the Contributor first +| distributes such Contribution. +| +| 2.3. Limitations on Grant Scope +| +| The licenses granted in this Section 2 are the only rights granted under +| this License. 
No additional rights or licenses will be implied from the +| distribution or licensing of Covered Software under this License. +| Notwithstanding Section 2.1(b) above, no patent license is granted by a +| Contributor: +| +| (a) for any code that a Contributor has removed from Covered Software; +| or +| +| (b) for infringements caused by: (i) Your and any other third party's +| modifications of Covered Software, or (ii) the combination of its +| Contributions with other software (except as part of its Contributor +| Version); or +| +| (c) under Patent Claims infringed by Covered Software in the absence of +| its Contributions. +| +| This License does not grant any rights in the trademarks, service marks, +| or logos of any Contributor (except as may be necessary to comply with +| the notice requirements in Section 3.4). +| +| 2.4. Subsequent Licenses +| +| No Contributor makes additional grants as a result of Your choice to +| distribute the Covered Software under a subsequent version of this +| License (see Section 10.2) or under the terms of a Secondary License (if +| permitted under the terms of Section 3.3). +| +| 2.5. Representation +| +| Each Contributor represents that the Contributor believes its +| Contributions are its original creation(s) or it has sufficient rights +| to grant the rights to its Contributions conveyed by this License. +| +| 2.6. Fair Use +| +| This License is not intended to limit any rights You have under +| applicable copyright doctrines of fair use, fair dealing, or other +| equivalents. +| +| 2.7. Conditions +| +| Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted +| in Section 2.1. +| +| 3. Responsibilities +| ------------------- +| +| 3.1. Distribution of Source Form +| +| All distribution of Covered Software in Source Code Form, including any +| Modifications that You create or to which You contribute, must be under +| the terms of this License. 
You must inform recipients that the Source +| Code Form of the Covered Software is governed by the terms of this +| License, and how they can obtain a copy of this License. You may not +| attempt to alter or restrict the recipients' rights in the Source Code +| Form. +| +| 3.2. Distribution of Executable Form +| +| If You distribute Covered Software in Executable Form then: +| +| (a) such Covered Software must also be made available in Source Code +| Form, as described in Section 3.1, and You must inform recipients of +| the Executable Form how they can obtain a copy of such Source Code +| Form by reasonable means in a timely manner, at a charge no more +| than the cost of distribution to the recipient; and +| +| (b) You may distribute such Executable Form under the terms of this +| License, or sublicense it under different terms, provided that the +| license for the Executable Form does not attempt to limit or alter +| the recipients' rights in the Source Code Form under this License. +| +| 3.3. Distribution of a Larger Work +| +| You may create and distribute a Larger Work under terms of Your choice, +| provided that You also comply with the requirements of this License for +| the Covered Software. If the Larger Work is a combination of Covered +| Software with a work governed by one or more Secondary Licenses, and the +| Covered Software is not Incompatible With Secondary Licenses, this +| License permits You to additionally distribute such Covered Software +| under the terms of such Secondary License(s), so that the recipient of +| the Larger Work may, at their option, further distribute the Covered +| Software under the terms of either this License or such Secondary +| License(s). +| +| 3.4. 
Notices +| +| You may not remove or alter the substance of any license notices +| (including copyright notices, patent notices, disclaimers of warranty, +| or limitations of liability) contained within the Source Code Form of +| the Covered Software, except that You may alter any license notices to +| the extent required to remedy known factual inaccuracies. +| +| 3.5. Application of Additional Terms +| +| You may choose to offer, and to charge a fee for, warranty, support, +| indemnity or liability obligations to one or more recipients of Covered +| Software. However, You may do so only on Your own behalf, and not on +| behalf of any Contributor. You must make it absolutely clear that any +| such warranty, support, indemnity, or liability obligation is offered by +| You alone, and You hereby agree to indemnify every Contributor for any +| liability incurred by such Contributor as a result of warranty, support, +| indemnity or liability terms You offer. You may include additional +| disclaimers of warranty and limitations of liability specific to any +| jurisdiction. +| +| 4. Inability to Comply Due to Statute or Regulation +| --------------------------------------------------- +| +| If it is impossible for You to comply with any of the terms of this +| License with respect to some or all of the Covered Software due to +| statute, judicial order, or regulation then You must: (a) comply with +| the terms of this License to the maximum extent possible; and (b) +| describe the limitations and the code they affect. Such description must +| be placed in a text file included with all distributions of the Covered +| Software under this License. Except to the extent prohibited by statute +| or regulation, such description must be sufficiently detailed for a +| recipient of ordinary skill to be able to understand it. +| +| 5. Termination +| -------------- +| +| 5.1. 
The rights granted under this License will terminate automatically +| if You fail to comply with any of its terms. However, if You become +| compliant, then the rights granted under this License from a particular +| Contributor are reinstated (a) provisionally, unless and until such +| Contributor explicitly and finally terminates Your grants, and (b) on an +| ongoing basis, if such Contributor fails to notify You of the +| non-compliance by some reasonable means prior to 60 days after You have +| come back into compliance. Moreover, Your grants from a particular +| Contributor are reinstated on an ongoing basis if such Contributor +| notifies You of the non-compliance by some reasonable means, this is the +| first time You have received notice of non-compliance with this License +| from such Contributor, and You become compliant prior to 30 days after +| Your receipt of the notice. +| +| 5.2. If You initiate litigation against any entity by asserting a patent +| infringement claim (excluding declaratory judgment actions, +| counter-claims, and cross-claims) alleging that a Contributor Version +| directly or indirectly infringes any patent, then the rights granted to +| You by any and all Contributors for the Covered Software under Section +| 2.1 of this License shall terminate. +| +| 5.3. In the event of termination under Sections 5.1 or 5.2 above, all +| end user license agreements (excluding distributors and resellers) which +| have been validly granted by You or Your distributors under this License +| prior to termination shall survive termination. +| +| ************************************************************************ +| * * +| * 6. 
Disclaimer of Warranty * +| * ------------------------- * +| * * +| * Covered Software is provided under this License on an "as is" * +| * basis, without warranty of any kind, either expressed, implied, or * +| * statutory, including, without limitation, warranties that the * +| * Covered Software is free of defects, merchantable, fit for a * +| * particular purpose or non-infringing. The entire risk as to the * +| * quality and performance of the Covered Software is with You. * +| * Should any Covered Software prove defective in any respect, You * +| * (not any Contributor) assume the cost of any necessary servicing, * +| * repair, or correction. This disclaimer of warranty constitutes an * +| * essential part of this License. No use of any Covered Software is * +| * authorized under this License except under this disclaimer. * +| * * +| ************************************************************************ +| +| ************************************************************************ +| * * +| * 7. Limitation of Liability * +| * -------------------------- * +| * * +| * Under no circumstances and under no legal theory, whether tort * +| * (including negligence), contract, or otherwise, shall any * +| * Contributor, or anyone who distributes Covered Software as * +| * permitted above, be liable to You for any direct, indirect, * +| * special, incidental, or consequential damages of any character * +| * including, without limitation, damages for lost profits, loss of * +| * goodwill, work stoppage, computer failure or malfunction, or any * +| * and all other commercial damages or losses, even if such party * +| * shall have been informed of the possibility of such damages. This * +| * limitation of liability shall not apply to liability for death or * +| * personal injury resulting from such party's negligence to the * +| * extent applicable law prohibits such limitation. 
Some * +| * jurisdictions do not allow the exclusion or limitation of * +| * incidental or consequential damages, so this exclusion and * +| * limitation may not apply to You. * +| * * +| ************************************************************************ +| +| 8. Litigation +| ------------- +| +| Any litigation relating to this License may be brought only in the +| courts of a jurisdiction where the defendant maintains its principal +| place of business and such litigation shall be governed by laws of that +| jurisdiction, without reference to its conflict-of-law provisions. +| Nothing in this Section shall prevent a party's ability to bring +| cross-claims or counter-claims. +| +| 9. Miscellaneous +| ---------------- +| +| This License represents the complete agreement concerning the subject +| matter hereof. If any provision of this License is held to be +| unenforceable, such provision shall be reformed only to the extent +| necessary to make it enforceable. Any law or regulation which provides +| that the language of a contract shall be construed against the drafter +| shall not be used to construe this License against a Contributor. +| +| 10. Versions of the License +| --------------------------- +| +| 10.1. New Versions +| +| Mozilla Foundation is the license steward. Except as provided in Section +| 10.3, no one other than the license steward has the right to modify or +| publish new versions of this License. Each version will be given a +| distinguishing version number. +| +| 10.2. Effect of New Versions +| +| You may distribute the Covered Software under the terms of the version +| of the License under which You originally received the Covered Software, +| or under the terms of any subsequent version published by the license +| steward. +| +| 10.3. 
Modified Versions +| +| If you create software not governed by this License, and you want to +| create a new license for such software, you may create and use a +| modified version of this License if you rename the license and remove +| any references to the name of the license steward (except to note that +| such modified license differs from this License). +| +| 10.4. Distributing Source Code Form that is Incompatible With Secondary +| Licenses +| +| If You choose to distribute Source Code Form that is Incompatible With +| Secondary Licenses under the terms of this version of the License, the +| notice described in Exhibit B of this License must be attached. +| +| Exhibit A - Source Code Form License Notice +| ------------------------------------------- +| +| This Source Code Form is subject to the terms of the Mozilla Public +| License, v. 2.0. If a copy of the MPL was not distributed with this +| file, You can obtain one at http://mozilla.org/MPL/2.0/. +| +| If it is not possible or desirable to put the notice in a particular +| file, then You may include the notice in a location (such as a LICENSE +| file in a relevant directory) where a recipient would be likely to look +| for such a notice. +| +| You may add additional accurate notices of copyright ownership. +| +| Exhibit B - "Incompatible With Secondary Licenses" Notice +| --------------------------------------------------------- +| +| This Source Code Form is "Incompatible With Secondary Licenses", as +| defined by the Mozilla Public License, v. 2.0. 
-------------------------------------------------------------------------------- diff --git a/flink/v2.1/flink-runtime/LICENSE b/flink/v2.1/flink-runtime/LICENSE index 364652a5aca2..d73eda0104b9 100644 --- a/flink/v2.1/flink-runtime/LICENSE +++ b/flink/v2.1/flink-runtime/LICENSE @@ -219,6 +219,94 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- +This product bundles FastDoubleParser (via Jackson JSON Processor). + +Copyright: 2023 Werner Randelshofer, Switzerland +Project URL: https://github.com/wrandelshofer/FastDoubleParser +License: MIT + +| Copyright (c) 2023 Werner Randelshofer, Switzerland +| +| Permission is hereby granted, free of charge, to any person obtaining a copy +| of this software and associated documentation files (the "Software"), to deal +| in the Software without restriction, including without limitation the rights +| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +| copies of the Software, and to permit persons to whom the Software is +| furnished to do so, subject to the following conditions: +| +| The above copyright notice and this permission notice shall be included in all +| copies or substantial portions of the Software. +| +| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +| SOFTWARE. + +-------------------------------------------------------------------------------- + +This product bundles fast_float (bundled by FastDoubleParser). 
+ +Copyright: 2021 The fast_float authors +Project URL: https://github.com/fastfloat/fast_float +License: MIT + +| Copyright (c) 2021 The fast_float authors +| +| Permission is hereby granted, free of charge, to any person obtaining a copy +| of this software and associated documentation files (the "Software"), to deal +| in the Software without restriction, including without limitation the rights +| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +| copies of the Software, and to permit persons to whom the Software is +| furnished to do so, subject to the following conditions: +| +| The above copyright notice and this permission notice shall be included in all +| copies or substantial portions of the Software. +| +| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +| SOFTWARE. + +-------------------------------------------------------------------------------- + +This product bundles bigint (bundled by FastDoubleParser). + +Copyright: 2022 Tim Buktu +Project URL: https://github.com/tbuktu/bigint +License: BSD 2-Clause + +| Copyright 2022 Tim Buktu +| +| Redistribution and use in source and binary forms, with or without +| modification, are permitted provided that the following conditions +| are met: +| +| 1. Redistributions of source code must retain the above copyright notice, this +| list of conditions and the following disclaimer. +| +| 2. 
Redistributions in binary form must reproduce the above copyright notice, +| this list of conditions and the following disclaimer in the documentation +| and/or other materials provided with the distribution. +| +| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +| ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +| WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +| DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +| FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +| OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +-------------------------------------------------------------------------------- + This product bundles Apache Parquet. Copyright: 2014-2020 The Apache Software Foundation. @@ -243,7 +331,7 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Fastutil (bundled by Parquet). +This product bundles fastutil (bundled by Parquet). Copyright: 2002-2014 Sebastiano Vigna Project URL: http://fastutil.di.unimi.it/ @@ -351,6 +439,7 @@ This product bundles checkerframework checker-qual. Copyright: 2004-2020 the Checker Framework developers Project URL: https://github.com/typetools/checker-framework License: MIT + | The annotations are licensed under the MIT License. (The text of this | license appears below.) 
More specifically, all the parts of the Checker | Framework that you might want to include with your own program use the @@ -394,6 +483,8 @@ This product bundles ThreeTen Extra. Copyright: 2007-present, Stephen Colebourne & Michael Nascimento Santos. Project URL: https://www.threeten.org/threeten-extra/ License: BSD 3-Clause + +| Copyright (c) 2007-present, Stephen Colebourne & Michael Nascimento Santos. | All rights reserved. | | * Redistribution and use in source and binary forms, with or without @@ -468,7 +559,7 @@ License: Apache License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2 -------------------------------------------------------------------------------- -This product bundles Eclipse Microprofile OpenAPI. +This product bundles Eclipse MicroProfile OpenAPI. Copyright: Copyright (c) 2017 Contributors to the Eclipse Foundation Project URL: https://github.com/microprofile/microprofile-open-api @@ -481,6 +572,7 @@ This product bundles Luben Zstd. Copyright: Copyright (c) 2015-present, Luben Karavelov/ All rights reserved. Project URL: https://github.com/luben/zstd-jni/ License: BSD 2-Clause + | Zstd-jni: JNI bindings to Zstd Library | | Copyright (c) 2015-present, Luben Karavelov/ All rights reserved. @@ -513,7 +605,420 @@ License: BSD 2-Clause This product bundles JTS Topology Suite. Project URL: https://github.com/locationtech/jts -License: Eclipse Public License v. 2.0 - https://www.eclipse.org/org/documents/epl-2.0/EPL-2.0.txt +License: Eclipse Distribution License v. 1.0 - https://www.eclipse.org/org/documents/edl-v10.php + +| Eclipse Distribution License - v 1.0 +| +| Copyright (c) 2007, Eclipse Foundation, Inc. and its licensors. +| +| All rights reserved. +| +| Redistribution and use in source and binary forms, with or without +| modification, are permitted provided that the following conditions are met: +| +| - Redistributions of source code must retain the above copyright notice, +| this list of conditions and the following disclaimer. 
+| +| - Redistributions in binary form must reproduce the above copyright notice, +| this list of conditions and the following disclaimer in the documentation +| and/or other materials provided with the distribution. +| +| - Neither the name of the Eclipse Foundation, Inc. nor the names of its +| contributors may be used to endorse or promote products derived from this +| software without specific prior written permission. +| +| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +| SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +| INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +| CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +| ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +| POSSIBILITY OF SUCH DAMAGE. + +-------------------------------------------------------------------------------- + +This product bundles the Mozilla Public Suffix List, distributed by Apache HttpComponents. + +Project URL: https://publicsuffix.org/ +License: Mozilla Public License, Version 2.0 - https://mozilla.org/MPL/2.0/ + +| Mozilla Public License Version 2.0 +| ================================== +| +| 1. Definitions +| -------------- +| +| 1.1. "Contributor" +| means each individual or legal entity that creates, contributes to +| the creation of, or owns Covered Software. +| +| 1.2. "Contributor Version" +| means the combination of the Contributions of others (if any) used +| by a Contributor and that particular Contributor's Contribution. +| +| 1.3. 
"Contribution" +| means Covered Software of a particular Contributor. +| +| 1.4. "Covered Software" +| means Source Code Form to which the initial Contributor has attached +| the notice in Exhibit A, the Executable Form of such Source Code +| Form, and Modifications of such Source Code Form, in each case +| including portions thereof. +| +| 1.5. "Incompatible With Secondary Licenses" +| means +| +| (a) that the initial Contributor has attached the notice described +| in Exhibit B to the Covered Software; or +| +| (b) that the Covered Software was made available under the terms of +| version 1.1 or earlier of the License, but not also under the +| terms of a Secondary License. +| +| 1.6. "Executable Form" +| means any form of the work other than Source Code Form. +| +| 1.7. "Larger Work" +| means a work that combines Covered Software with other material, in +| a separate file or files, that is not Covered Software. +| +| 1.8. "License" +| means this document. +| +| 1.9. "Licensable" +| means having the right to grant, to the maximum extent possible, +| whether at the time of the initial grant or subsequently, any and +| all of the rights conveyed by this License. +| +| 1.10. "Modifications" +| means any of the following: +| +| (a) any file in Source Code Form that results from an addition to, +| deletion from, or modification of the contents of Covered +| Software; or +| +| (b) any new file in Source Code Form that contains any Covered +| Software. +| +| 1.11. "Patent Claims" of a Contributor +| means any patent claim(s), including without limitation, method, +| process, and apparatus claims, in any patent Licensable by such +| Contributor that would be infringed, but for the grant of the +| License, by the making, using, selling, offering for sale, having +| made, import, or transfer of either its Contributions or its +| Contributor Version. +| +| 1.12. 
"Secondary License" +| means either the GNU General Public License, Version 2.0, the GNU +| Lesser General Public License, Version 2.1, the GNU Affero General +| Public License, Version 3.0, or any later versions of those +| licenses. +| +| 1.13. "Source Code Form" +| means the form of the work preferred for making modifications. +| +| 1.14. "You" (or "Your") +| means an individual or a legal entity exercising rights under this +| License. For legal entities, "You" includes any entity that +| controls, is controlled by, or is under common control with You. For +| purposes of this definition, "control" means (a) the power, direct +| or indirect, to cause the direction or management of such entity, +| whether by contract or otherwise, or (b) ownership of more than +| fifty percent (50%) of the outstanding shares or beneficial +| ownership of such entity. +| +| 2. License Grants and Conditions +| -------------------------------- +| +| 2.1. Grants +| +| Each Contributor hereby grants You a world-wide, royalty-free, +| non-exclusive license: +| +| (a) under intellectual property rights (other than patent or trademark) +| Licensable by such Contributor to use, reproduce, make available, +| modify, display, perform, distribute, and otherwise exploit its +| Contributions, either on an unmodified basis, with Modifications, or +| as part of a Larger Work; and +| +| (b) under Patent Claims of such Contributor to make, use, sell, offer +| for sale, have made, import, and otherwise transfer either its +| Contributions or its Contributor Version. +| +| 2.2. Effective Date +| +| The licenses granted in Section 2.1 with respect to any Contribution +| become effective for each Contribution on the date the Contributor first +| distributes such Contribution. +| +| 2.3. Limitations on Grant Scope +| +| The licenses granted in this Section 2 are the only rights granted under +| this License. 
No additional rights or licenses will be implied from the +| distribution or licensing of Covered Software under this License. +| Notwithstanding Section 2.1(b) above, no patent license is granted by a +| Contributor: +| +| (a) for any code that a Contributor has removed from Covered Software; +| or +| +| (b) for infringements caused by: (i) Your and any other third party's +| modifications of Covered Software, or (ii) the combination of its +| Contributions with other software (except as part of its Contributor +| Version); or +| +| (c) under Patent Claims infringed by Covered Software in the absence of +| its Contributions. +| +| This License does not grant any rights in the trademarks, service marks, +| or logos of any Contributor (except as may be necessary to comply with +| the notice requirements in Section 3.4). +| +| 2.4. Subsequent Licenses +| +| No Contributor makes additional grants as a result of Your choice to +| distribute the Covered Software under a subsequent version of this +| License (see Section 10.2) or under the terms of a Secondary License (if +| permitted under the terms of Section 3.3). +| +| 2.5. Representation +| +| Each Contributor represents that the Contributor believes its +| Contributions are its original creation(s) or it has sufficient rights +| to grant the rights to its Contributions conveyed by this License. +| +| 2.6. Fair Use +| +| This License is not intended to limit any rights You have under +| applicable copyright doctrines of fair use, fair dealing, or other +| equivalents. +| +| 2.7. Conditions +| +| Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted +| in Section 2.1. +| +| 3. Responsibilities +| ------------------- +| +| 3.1. Distribution of Source Form +| +| All distribution of Covered Software in Source Code Form, including any +| Modifications that You create or to which You contribute, must be under +| the terms of this License. 
You must inform recipients that the Source +| Code Form of the Covered Software is governed by the terms of this +| License, and how they can obtain a copy of this License. You may not +| attempt to alter or restrict the recipients' rights in the Source Code +| Form. +| +| 3.2. Distribution of Executable Form +| +| If You distribute Covered Software in Executable Form then: +| +| (a) such Covered Software must also be made available in Source Code +| Form, as described in Section 3.1, and You must inform recipients of +| the Executable Form how they can obtain a copy of such Source Code +| Form by reasonable means in a timely manner, at a charge no more +| than the cost of distribution to the recipient; and +| +| (b) You may distribute such Executable Form under the terms of this +| License, or sublicense it under different terms, provided that the +| license for the Executable Form does not attempt to limit or alter +| the recipients' rights in the Source Code Form under this License. +| +| 3.3. Distribution of a Larger Work +| +| You may create and distribute a Larger Work under terms of Your choice, +| provided that You also comply with the requirements of this License for +| the Covered Software. If the Larger Work is a combination of Covered +| Software with a work governed by one or more Secondary Licenses, and the +| Covered Software is not Incompatible With Secondary Licenses, this +| License permits You to additionally distribute such Covered Software +| under the terms of such Secondary License(s), so that the recipient of +| the Larger Work may, at their option, further distribute the Covered +| Software under the terms of either this License or such Secondary +| License(s). +| +| 3.4. 
Notices +| +| You may not remove or alter the substance of any license notices +| (including copyright notices, patent notices, disclaimers of warranty, +| or limitations of liability) contained within the Source Code Form of +| the Covered Software, except that You may alter any license notices to +| the extent required to remedy known factual inaccuracies. +| +| 3.5. Application of Additional Terms +| +| You may choose to offer, and to charge a fee for, warranty, support, +| indemnity or liability obligations to one or more recipients of Covered +| Software. However, You may do so only on Your own behalf, and not on +| behalf of any Contributor. You must make it absolutely clear that any +| such warranty, support, indemnity, or liability obligation is offered by +| You alone, and You hereby agree to indemnify every Contributor for any +| liability incurred by such Contributor as a result of warranty, support, +| indemnity or liability terms You offer. You may include additional +| disclaimers of warranty and limitations of liability specific to any +| jurisdiction. +| +| 4. Inability to Comply Due to Statute or Regulation +| --------------------------------------------------- +| +| If it is impossible for You to comply with any of the terms of this +| License with respect to some or all of the Covered Software due to +| statute, judicial order, or regulation then You must: (a) comply with +| the terms of this License to the maximum extent possible; and (b) +| describe the limitations and the code they affect. Such description must +| be placed in a text file included with all distributions of the Covered +| Software under this License. Except to the extent prohibited by statute +| or regulation, such description must be sufficiently detailed for a +| recipient of ordinary skill to be able to understand it. +| +| 5. Termination +| -------------- +| +| 5.1. 
The rights granted under this License will terminate automatically +| if You fail to comply with any of its terms. However, if You become +| compliant, then the rights granted under this License from a particular +| Contributor are reinstated (a) provisionally, unless and until such +| Contributor explicitly and finally terminates Your grants, and (b) on an +| ongoing basis, if such Contributor fails to notify You of the +| non-compliance by some reasonable means prior to 60 days after You have +| come back into compliance. Moreover, Your grants from a particular +| Contributor are reinstated on an ongoing basis if such Contributor +| notifies You of the non-compliance by some reasonable means, this is the +| first time You have received notice of non-compliance with this License +| from such Contributor, and You become compliant prior to 30 days after +| Your receipt of the notice. +| +| 5.2. If You initiate litigation against any entity by asserting a patent +| infringement claim (excluding declaratory judgment actions, +| counter-claims, and cross-claims) alleging that a Contributor Version +| directly or indirectly infringes any patent, then the rights granted to +| You by any and all Contributors for the Covered Software under Section +| 2.1 of this License shall terminate. +| +| 5.3. In the event of termination under Sections 5.1 or 5.2 above, all +| end user license agreements (excluding distributors and resellers) which +| have been validly granted by You or Your distributors under this License +| prior to termination shall survive termination. +| +| ************************************************************************ +| * * +| * 6. 
Disclaimer of Warranty * +| * ------------------------- * +| * * +| * Covered Software is provided under this License on an "as is" * +| * basis, without warranty of any kind, either expressed, implied, or * +| * statutory, including, without limitation, warranties that the * +| * Covered Software is free of defects, merchantable, fit for a * +| * particular purpose or non-infringing. The entire risk as to the * +| * quality and performance of the Covered Software is with You. * +| * Should any Covered Software prove defective in any respect, You * +| * (not any Contributor) assume the cost of any necessary servicing, * +| * repair, or correction. This disclaimer of warranty constitutes an * +| * essential part of this License. No use of any Covered Software is * +| * authorized under this License except under this disclaimer. * +| * * +| ************************************************************************ +| +| ************************************************************************ +| * * +| * 7. Limitation of Liability * +| * -------------------------- * +| * * +| * Under no circumstances and under no legal theory, whether tort * +| * (including negligence), contract, or otherwise, shall any * +| * Contributor, or anyone who distributes Covered Software as * +| * permitted above, be liable to You for any direct, indirect, * +| * special, incidental, or consequential damages of any character * +| * including, without limitation, damages for lost profits, loss of * +| * goodwill, work stoppage, computer failure or malfunction, or any * +| * and all other commercial damages or losses, even if such party * +| * shall have been informed of the possibility of such damages. This * +| * limitation of liability shall not apply to liability for death or * +| * personal injury resulting from such party's negligence to the * +| * extent applicable law prohibits such limitation. 
Some * +| * jurisdictions do not allow the exclusion or limitation of * +| * incidental or consequential damages, so this exclusion and * +| * limitation may not apply to You. * +| * * +| ************************************************************************ +| +| 8. Litigation +| ------------- +| +| Any litigation relating to this License may be brought only in the +| courts of a jurisdiction where the defendant maintains its principal +| place of business and such litigation shall be governed by laws of that +| jurisdiction, without reference to its conflict-of-law provisions. +| Nothing in this Section shall prevent a party's ability to bring +| cross-claims or counter-claims. +| +| 9. Miscellaneous +| ---------------- +| +| This License represents the complete agreement concerning the subject +| matter hereof. If any provision of this License is held to be +| unenforceable, such provision shall be reformed only to the extent +| necessary to make it enforceable. Any law or regulation which provides +| that the language of a contract shall be construed against the drafter +| shall not be used to construe this License against a Contributor. +| +| 10. Versions of the License +| --------------------------- +| +| 10.1. New Versions +| +| Mozilla Foundation is the license steward. Except as provided in Section +| 10.3, no one other than the license steward has the right to modify or +| publish new versions of this License. Each version will be given a +| distinguishing version number. +| +| 10.2. Effect of New Versions +| +| You may distribute the Covered Software under the terms of the version +| of the License under which You originally received the Covered Software, +| or under the terms of any subsequent version published by the license +| steward. +| +| 10.3. 
Modified Versions +| +| If you create software not governed by this License, and you want to +| create a new license for such software, you may create and use a +| modified version of this License if you rename the license and remove +| any references to the name of the license steward (except to note that +| such modified license differs from this License). +| +| 10.4. Distributing Source Code Form that is Incompatible With Secondary +| Licenses +| +| If You choose to distribute Source Code Form that is Incompatible With +| Secondary Licenses under the terms of this version of the License, the +| notice described in Exhibit B of this License must be attached. +| +| Exhibit A - Source Code Form License Notice +| ------------------------------------------- +| +| This Source Code Form is subject to the terms of the Mozilla Public +| License, v. 2.0. If a copy of the MPL was not distributed with this +| file, You can obtain one at http://mozilla.org/MPL/2.0/. +| +| If it is not possible or desirable to put the notice in a particular +| file, then You may include the notice in a location (such as a LICENSE +| file in a relevant directory) where a recipient would be likely to look +| for such a notice. +| +| You may add additional accurate notices of copyright ownership. +| +| Exhibit B - "Incompatible With Secondary Licenses" Notice +| --------------------------------------------------------- +| +| This Source Code Form is "Incompatible With Secondary Licenses", as +| defined by the Mozilla Public License, v. 2.0. 
-------------------------------------------------------------------------------- From 05b2df1bd234c95d8edb51f1b78ee9b60ff59021 Mon Sep 17 00:00:00 2001 From: Talat UYARER Date: Thu, 7 May 2026 03:02:23 -0700 Subject: [PATCH 174/197] Flink: Backport add Nanosecond Precision Support for Flink-Iceberg Integration (#16183) backports #15475 --- flink/v2.0/build.gradle | 1 + .../apache/iceberg/flink/FlinkTypeToType.java | 6 + .../apache/iceberg/flink/RowDataWrapper.java | 36 +- .../iceberg/flink/data/FlinkOrcReader.java | 7 + .../iceberg/flink/data/FlinkOrcWriter.java | 7 + .../iceberg/flink/data/FlinkOrcWriters.java | 37 ++ .../iceberg/flink/data/RowDataUtil.java | 2 + .../iceberg/flink/data/StructRowData.java | 48 +- .../formats/avro/AvroToRowDataConverters.java | 303 +++++++++ .../flink/formats/avro/JodaConverter.java | 69 ++ .../formats/avro/RowDataToAvroConverters.java | 394 +++++++++++ .../avro/typeutils/AvroSchemaConverter.java | 625 ++++++++++++++++++ .../AvroGenericRecordToRowDataMapper.java | 4 +- .../RowDataToAvroGenericRecordConverter.java | 4 +- .../reader/AvroGenericRecordConverter.java | 4 +- .../apache/iceberg/flink/DataGenerators.java | 105 ++- .../iceberg/flink/TestRowDataWrapper.java | 13 - .../flink/data/TestFlinkOrcReaderWriter.java | 5 + .../flink/data/TestRowDataProjection.java | 21 +- 19 files changed, 1640 insertions(+), 51 deletions(-) create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/formats/avro/AvroToRowDataConverters.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/formats/avro/JodaConverter.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/formats/avro/RowDataToAvroConverters.java create mode 100644 flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/formats/avro/typeutils/AvroSchemaConverter.java diff --git a/flink/v2.0/build.gradle b/flink/v2.0/build.gradle index 94f851e03221..f80a31242112 100644 --- a/flink/v2.0/build.gradle +++ 
b/flink/v2.0/build.gradle @@ -33,6 +33,7 @@ project(":iceberg-flink:iceberg-flink-${flinkMajorVersion}") { implementation project(':iceberg-hive-metastore') compileOnly libs.flink20.avro + compileOnly libs.joda.time // dropwizard histogram metrics (optional in Flink) compileOnly libs.flink20.metrics.dropwizard compileOnly libs.flink20.streaming.java diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java index 408065f06057..8f106da8d56b 100644 --- a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java @@ -137,11 +137,17 @@ public Type visit(TimeType timeType) { @Override public Type visit(TimestampType timestampType) { + if (timestampType.getPrecision() > 6) { + return Types.TimestampNanoType.withoutZone(); + } return Types.TimestampType.withoutZone(); } @Override public Type visit(LocalZonedTimestampType localZonedTimestampType) { + if (localZonedTimestampType.getPrecision() > 6) { + return Types.TimestampNanoType.withZone(); + } return Types.TimestampType.withZone(); } diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/RowDataWrapper.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/RowDataWrapper.java index 3ef611f2ded5..920e44b24b31 100644 --- a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/RowDataWrapper.java +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/RowDataWrapper.java @@ -114,19 +114,35 @@ private static PositionalGetter buildGetter(LogicalType logicalType, Type typ case TIMESTAMP_WITHOUT_TIME_ZONE: TimestampType timestampType = (TimestampType) logicalType; - return (row, pos) -> { - LocalDateTime localDateTime = - row.getTimestamp(pos, timestampType.getPrecision()).toLocalDateTime(); - return DateTimeUtil.microsFromTimestamp(localDateTime); - }; + if (type.typeId() == 
Type.TypeID.TIMESTAMP_NANO) { + return (row, pos) -> { + LocalDateTime localDateTime = + row.getTimestamp(pos, timestampType.getPrecision()).toLocalDateTime(); + return DateTimeUtil.nanosFromTimestamp(localDateTime); + }; + } else { + return (row, pos) -> { + LocalDateTime localDateTime = + row.getTimestamp(pos, timestampType.getPrecision()).toLocalDateTime(); + return DateTimeUtil.microsFromTimestamp(localDateTime); + }; + } case TIMESTAMP_WITH_LOCAL_TIME_ZONE: LocalZonedTimestampType lzTs = (LocalZonedTimestampType) logicalType; - return (row, pos) -> { - TimestampData timestampData = row.getTimestamp(pos, lzTs.getPrecision()); - return timestampData.getMillisecond() * 1000 - + timestampData.getNanoOfMillisecond() / 1000; - }; + if (type.typeId() == Type.TypeID.TIMESTAMP_NANO) { + return (row, pos) -> { + TimestampData timestampData = row.getTimestamp(pos, lzTs.getPrecision()); + return timestampData.getMillisecond() * 1_000_000L + + timestampData.getNanoOfMillisecond(); + }; + } else { + return (row, pos) -> { + TimestampData timestampData = row.getTimestamp(pos, lzTs.getPrecision()); + return timestampData.getMillisecond() * 1000L + + timestampData.getNanoOfMillisecond() / 1000; + }; + } case ROW: RowType rowType = (RowType) logicalType; diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java index 65b9d44ad4b8..3e3a29112cf4 100644 --- a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java @@ -112,6 +112,13 @@ public OrcValueReader primitive(Type.PrimitiveType iPrimitive, TypeDescriptio } else { return FlinkOrcReaders.timestamps(); } + case TIMESTAMP_NANO: + Types.TimestampNanoType timestampNanoType = (Types.TimestampNanoType) iPrimitive; + if (timestampNanoType.shouldAdjustToUTC()) { + return FlinkOrcReaders.timestampTzs(); + } 
else { + return FlinkOrcReaders.timestamps(); + } case STRING: return FlinkOrcReaders.strings(); case UUID: diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java index a467d848337d..c1b46252e18a 100644 --- a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java @@ -145,6 +145,13 @@ public OrcValueWriter primitive(Type.PrimitiveType iPrimitive, LogicalType fl } else { return FlinkOrcWriters.timestamps(); } + case TIMESTAMP_NANO: + Types.TimestampNanoType timestampNanoType = (Types.TimestampNanoType) iPrimitive; + if (timestampNanoType.shouldAdjustToUTC()) { + return FlinkOrcWriters.timestampNanoTzs(); + } else { + return FlinkOrcWriters.timestampNanos(); + } case STRING: return FlinkOrcWriters.strings(); case UUID: diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java index 684842aa099c..bf19a46c05fb 100644 --- a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java @@ -70,6 +70,14 @@ static OrcValueWriter timestampTzs() { return TimestampTzWriter.INSTANCE; } + static OrcValueWriter timestampNanos() { + return TimestampNanoWriter.INSTANCE; + } + + static OrcValueWriter timestampNanoTzs() { + return TimestampNanoTzWriter.INSTANCE; + } + static OrcValueWriter decimals(int precision, int scale) { if (precision <= 18) { return new Decimal18Writer(precision, scale); @@ -170,6 +178,35 @@ public void nonNullWrite(int rowId, TimestampData data, ColumnVector output) { } } + private static class TimestampNanoWriter implements OrcValueWriter { + private static final TimestampNanoWriter INSTANCE = new 
TimestampNanoWriter(); + + @Override + public void nonNullWrite(int rowId, TimestampData data, ColumnVector output) { + TimestampColumnVector cv = (TimestampColumnVector) output; + cv.setIsUTC(true); + // millis + OffsetDateTime offsetDateTime = data.toInstant().atOffset(ZoneOffset.UTC); + cv.time[rowId] = + offsetDateTime.toEpochSecond() * 1_000 + offsetDateTime.getNano() / 1_000_000; + cv.nanos[rowId] = offsetDateTime.getNano(); + } + } + + private static class TimestampNanoTzWriter implements OrcValueWriter { + private static final TimestampNanoTzWriter INSTANCE = new TimestampNanoTzWriter(); + + @SuppressWarnings("JavaInstantGetSecondsGetNano") + @Override + public void nonNullWrite(int rowId, TimestampData data, ColumnVector output) { + TimestampColumnVector cv = (TimestampColumnVector) output; + // millis + Instant instant = data.toInstant(); + cv.time[rowId] = instant.toEpochMilli(); + cv.nanos[rowId] = instant.getNano(); + } + } + private static class Decimal18Writer implements OrcValueWriter { private final int precision; private final int scale; diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java index f23a7ee3d0d3..81bb55967992 100644 --- a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java @@ -69,6 +69,8 @@ public static Object convertConstant(Type type, Object value) { return (int) ((Long) value / 1000); case TIMESTAMP: // TimestampData return TimestampData.fromLocalDateTime(DateTimeUtil.timestampFromMicros((Long) value)); + case TIMESTAMP_NANO: + return TimestampData.fromLocalDateTime(DateTimeUtil.timestampFromNanos((Long) value)); case UUID: return UUIDUtil.convert((UUID) value); default: diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/StructRowData.java 
b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/StructRowData.java index 34576a1e5c0b..f2a20be331b6 100644 --- a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/StructRowData.java +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/StructRowData.java @@ -48,6 +48,7 @@ import org.apache.iceberg.types.Type; import org.apache.iceberg.types.Types; import org.apache.iceberg.util.ByteBuffers; +import org.apache.iceberg.util.DateTimeUtil; @Internal public class StructRowData implements RowData { @@ -185,8 +186,27 @@ private BigDecimal getDecimalInternal(int pos) { @Override public TimestampData getTimestamp(int pos, int precision) { + if (precision > 6) { + Object timeVal = struct.get(pos, Object.class); + if (timeVal instanceof OffsetDateTime) { + OffsetDateTime odt = (OffsetDateTime) timeVal; + return TimestampData.fromEpochMillis( + odt.toInstant().toEpochMilli(), odt.getNano() % 1_000_000); + } else if (timeVal instanceof LocalDateTime) { + LocalDateTime ldt = (LocalDateTime) timeVal; + return TimestampData.fromEpochMillis( + ldt.toInstant(ZoneOffset.UTC).toEpochMilli(), ldt.getNano() % 1_000_000); + } else if (timeVal instanceof Long) { + long timeLong = (Long) timeVal; + return TimestampData.fromEpochMillis( + Math.floorDiv(timeLong, 1_000_000L), (int) Math.floorMod(timeLong, 1_000_000L)); + } else { + throw new IllegalStateException("Unknown type for timestamp_ns: " + timeVal.getClass()); + } + } long timeLong = getLong(pos); - return TimestampData.fromEpochMillis(timeLong / 1000, (int) (timeLong % 1000) * 1000); + return TimestampData.fromEpochMillis( + Math.floorDiv(timeLong, 1000L), (int) Math.floorMod(timeLong, 1000L) * 1000); } @Override @@ -257,9 +277,29 @@ private Object convertValue(Type elementType, Object value) { case DECIMAL: return value; case TIMESTAMP: - long millisecond = (long) value / 1000; - int nanoOfMillisecond = (int) ((Long) value % 1000) * 1000; - return TimestampData.fromEpochMillis(millisecond, 
nanoOfMillisecond); + long timeMillis; + if (value instanceof LocalDateTime) { + timeMillis = DateTimeUtil.microsFromTimestamp((LocalDateTime) value) / 1000L; + } else if (value instanceof OffsetDateTime) { + timeMillis = DateTimeUtil.microsFromTimestamptz((OffsetDateTime) value) / 1000L; + } else { + timeMillis = Math.floorDiv((Long) value, 1000L); + } + return TimestampData.fromEpochMillis( + timeMillis, + (int) Math.floorMod(value instanceof Long ? (Long) value : timeMillis * 1000L, 1000L) + * 1000); + case TIMESTAMP_NANO: + long nanoLong; + if (value instanceof LocalDateTime) { + nanoLong = DateTimeUtil.nanosFromTimestamp((LocalDateTime) value); + } else if (value instanceof OffsetDateTime) { + nanoLong = DateTimeUtil.nanosFromTimestamptz((OffsetDateTime) value); + } else { + nanoLong = (Long) value; + } + return TimestampData.fromEpochMillis( + Math.floorDiv(nanoLong, 1_000_000L), (int) Math.floorMod(nanoLong, 1_000_000L)); case STRING: return StringData.fromString(value.toString()); case FIXED: diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/formats/avro/AvroToRowDataConverters.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/formats/avro/AvroToRowDataConverters.java new file mode 100644 index 000000000000..0f70e60a1b9f --- /dev/null +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/formats/avro/AvroToRowDataConverters.java @@ -0,0 +1,303 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.formats.avro; + +import java.io.Serializable; +import java.lang.reflect.Array; +import java.nio.ByteBuffer; +import java.time.Instant; +import java.time.LocalDate; +import java.time.LocalDateTime; +import java.time.LocalTime; +import java.time.temporal.ChronoField; +import java.util.List; +import java.util.Map; +import org.apache.avro.generic.GenericFixed; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; +import org.apache.flink.annotation.Internal; +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.data.DecimalData; +import org.apache.flink.table.data.GenericArrayData; +import org.apache.flink.table.data.GenericMapData; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.table.types.logical.ArrayType; +import org.apache.flink.table.types.logical.DecimalType; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.table.types.logical.utils.LogicalTypeUtils; +import org.apache.iceberg.flink.formats.avro.typeutils.AvroSchemaConverter; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; + +/** + * Tool class used to convert from Avro {@link GenericRecord} to {@link RowData}. + * + *

      This class is adapted in Iceberg to add support for nanosecond precision timestamps
+ * (FLINK-39251). Once that ticket is resolved in Flink, this custom converter may be removed.
+ */
+@Internal
+public class AvroToRowDataConverters {
+
+  private AvroToRowDataConverters() {}
+
+  /**
+   * Runtime converter that converts Avro data structures into objects of Flink Table & SQL
+   * internal data structures.
+   */
+  @FunctionalInterface
+  public interface AvroToRowDataConverter extends Serializable {
+    Object convert(Object object);
+  }
+
+  // -------------------------------------------------------------------------------------
+  // Runtime Converters
+  // -------------------------------------------------------------------------------------
+
+  /**
+   * Creates a converter for Avro records of the given row type, using the legacy timestamp
+   * mapping.
+   */
+  public static AvroToRowDataConverter createRowConverter(RowType rowType) {
+    return createRowConverter(rowType, true);
+  }
+
+  /**
+   * Creates a converter that turns an Avro {@link IndexedRecord} into a {@link GenericRowData},
+   * converting every field positionally with a null-safe per-field converter.
+   *
+   * @param rowType row type describing the fields to convert
+   * @param legacyTimestampMapping whether the legacy timestamp mapping is in effect
+   */
+  public static AvroToRowDataConverter createRowConverter(
+      RowType rowType, boolean legacyTimestampMapping) {
+    final AvroToRowDataConverter[] fieldConverters =
+        rowType.getFields().stream()
+            .map(RowType.RowField::getType)
+            .map(type -> createNullableConverter(type, legacyTimestampMapping))
+            .toArray(AvroToRowDataConverter[]::new);
+    final int arity = rowType.getFieldCount();
+
+    return avroObject -> {
+      IndexedRecord record = (IndexedRecord) avroObject;
+      GenericRowData row = new GenericRowData(arity);
+      for (int i = 0; i < arity; ++i) {
+        // avro always deserialize successfully even though the type isn't matched
+        // so no need to throw exception about which field can't be deserialized
+        row.setField(i, fieldConverters[i].convert(record.get(i)));
+      }
+      return row;
+    };
+  }
+
+  /** Creates a runtime converter which is null safe. */
+  private static AvroToRowDataConverter createNullableConverter(
+      LogicalType type, boolean legacyTimestampMapping) {
+    final AvroToRowDataConverter converter = createConverter(type, legacyTimestampMapping);
+    return avroObject -> {
+      if (avroObject == null) {
+        return null;
+      }
+      return converter.convert(avroObject);
+    };
+  }
+
+  /** Creates a runtime converter which assuming input object is not null. */
+  private static AvroToRowDataConverter createConverter(
+      LogicalType type, boolean legacyTimestampMapping) {
+    switch (type.getTypeRoot()) {
+      case NULL:
+        return avroObject -> null;
+      case TINYINT:
+        return avroObject -> ((Integer) avroObject).byteValue();
+      case SMALLINT:
+        return avroObject -> ((Integer) avroObject).shortValue();
+      case BOOLEAN: // boolean
+      case INTEGER: // int
+      case INTERVAL_YEAR_MONTH: // long
+      case BIGINT: // long
+      case INTERVAL_DAY_TIME: // long
+      case FLOAT: // float
+      case DOUBLE: // double
+        return avroObject -> avroObject;
+      case DATE:
+        return AvroToRowDataConverters::convertToDate;
+      case TIME_WITHOUT_TIME_ZONE:
+        return AvroToRowDataConverters::convertToTime;
+      case TIMESTAMP_WITHOUT_TIME_ZONE:
+        // Iceberg: Added support for nanoseconds precision (FLINK-39251)
+        return avroObject -> convertToTimestamp(avroObject, type);
+      case TIMESTAMP_WITH_LOCAL_TIME_ZONE:
+        if (legacyTimestampMapping) {
+          throw new UnsupportedOperationException("Unsupported type: " + type);
+        } else {
+          // Iceberg: Added support for nanoseconds precision (FLINK-39251)
+          return avroObject -> convertToTimestamp(avroObject, type);
+        }
+      case CHAR:
+      case VARCHAR:
+        return avroObject -> StringData.fromString(avroObject.toString());
+      case BINARY:
+      case VARBINARY:
+        return AvroToRowDataConverters::convertToBytes;
+      case DECIMAL:
+        return createDecimalConverter((DecimalType) type);
+      case ARRAY:
+        return createArrayConverter((ArrayType) type, legacyTimestampMapping);
+      case ROW:
+        // Forward the flag: the single-argument overload would reset it to true for nested rows.
+        return createRowConverter((RowType) type, legacyTimestampMapping);
+      case MAP:
+      case MULTISET:
+        return createMapConverter(type, legacyTimestampMapping);
+      case RAW:
+      default:
+        throw new UnsupportedOperationException("Unsupported type: " + type);
+    }
+  }
+
+  /**
+   * Creates a converter that reads the unscaled decimal bytes from a {@link GenericFixed}, a
+   * {@link ByteBuffer} or a {@code byte[]} and builds a {@link DecimalData} from them.
+   */
+  private static AvroToRowDataConverter createDecimalConverter(DecimalType decimalType) {
+    final int precision = decimalType.getPrecision();
+    final int scale = decimalType.getScale();
+    return avroObject -> {
+      final byte[] bytes;
+      if (avroObject instanceof GenericFixed) {
+        bytes = ((GenericFixed) avroObject).bytes();
+      } else if (avroObject instanceof ByteBuffer) {
+        ByteBuffer byteBuffer = (ByteBuffer) avroObject;
+        bytes = new byte[byteBuffer.remaining()];
+        byteBuffer.get(bytes);
+      } else {
+        bytes = (byte[]) avroObject;
+      }
+      return DecimalData.fromUnscaledBytes(bytes, precision, scale);
+    };
+  }
+
+  /** Creates a converter that turns an Avro list into a {@link GenericArrayData}. */
+  private static AvroToRowDataConverter createArrayConverter(
+      ArrayType arrayType, boolean legacyTimestampMapping) {
+    final AvroToRowDataConverter elementConverter =
+        createNullableConverter(arrayType.getElementType(), legacyTimestampMapping);
+    final Class<?> elementClass =
+        LogicalTypeUtils.toInternalConversionClass(arrayType.getElementType());
+
+    return avroObject -> {
+      final List<?> list = (List<?>) avroObject;
+      final int length = list.size();
+      final Object[] array = (Object[]) Array.newInstance(elementClass, length);
+      for (int i = 0; i < length; ++i) {
+        array[i] = elementConverter.convert(list.get(i));
+      }
+      return new GenericArrayData(array);
+    };
+  }
+
+  /**
+   * Creates a converter that turns an Avro map into a {@link GenericMapData}. Keys are always
+   * converted as strings; values use a null-safe converter for the map's value type.
+   */
+  private static AvroToRowDataConverter createMapConverter(
+      LogicalType type, boolean legacyTimestampMapping) {
+    final AvroToRowDataConverter keyConverter =
+        createConverter(DataTypes.STRING().getLogicalType(), legacyTimestampMapping);
+    final AvroToRowDataConverter valueConverter =
+        createNullableConverter(
+            AvroSchemaConverter.extractValueTypeToAvroMap(type), legacyTimestampMapping);
+
+    return avroObject -> {
+      final Map<?, ?> map = (Map<?, ?>) avroObject;
+      Map<Object, Object> result = Maps.newHashMap();
+      for (Map.Entry<?, ?> entry : map.entrySet()) {
+        Object key = keyConverter.convert(entry.getKey());
+        Object value = valueConverter.convert(entry.getValue());
+        result.put(key, value);
+      }
+      return new GenericMapData(result);
+    };
+  }
+
+  /**
+   * Converts an Avro timestamp value into {@link TimestampData}.
+   *
+   * <p>A {@code Long} is interpreted according to the precision of the Flink type: epoch
+   * milliseconds for precision up to 3, epoch microseconds for precision up to 6 and epoch
+   * nanoseconds otherwise. {@link Instant}, {@link LocalDateTime} and (when Joda is on the
+   * classpath) Joda values are converted directly.
+   *
+   * @param object the non-null Avro value
+   * @param type the Flink timestamp type; precision defaults to 3 for any other type
+   * @throws IllegalArgumentException if the value has an unsupported class and Joda is unavailable
+   */
+  private static TimestampData convertToTimestamp(Object object, LogicalType type) {
+    int precision = 3;
+    if (type instanceof org.apache.flink.table.types.logical.TimestampType) {
+      precision = ((org.apache.flink.table.types.logical.TimestampType) type).getPrecision();
+    } else if (type instanceof org.apache.flink.table.types.logical.LocalZonedTimestampType) {
+      precision =
+          ((org.apache.flink.table.types.logical.LocalZonedTimestampType) type).getPrecision();
+    }
+
+    if (object instanceof Long) {
+      long timeLong = (Long) object;
+      if (precision <= 3) {
+        return TimestampData.fromEpochMillis(timeLong);
+      } else if (precision <= 6) {
+        // Microseconds: the remainder is micros-within-the-millisecond, so scale by 1000 to get
+        // nanos-of-millisecond (which must stay below 1_000_000).
+        return TimestampData.fromEpochMillis(
+            Math.floorDiv(timeLong, 1000L), (int) Math.floorMod(timeLong, 1000L) * 1000);
+      } else {
+        // Iceberg: Added support for nanoseconds precision (FLINK-39251)
+        return TimestampData.fromEpochMillis(
+            Math.floorDiv(timeLong, 1_000_000L), (int) Math.floorMod(timeLong, 1_000_000L));
+      }
+    } else if (object instanceof Instant) {
+      return TimestampData.fromInstant((Instant) object);
+    } else if (object instanceof LocalDateTime) {
+      return TimestampData.fromLocalDateTime((LocalDateTime) object);
+    } else {
+      JodaConverter jodaConverter = JodaConverter.getConverter();
+      if (jodaConverter != null) {
+        return TimestampData.fromEpochMillis(jodaConverter.convertTimestamp(object));
+      } else {
+        throw new IllegalArgumentException(
+            "Unexpected object type for TIMESTAMP logical type. Received: " + object);
+      }
+    }
+  }
+
+  /**
+   * Converts an Avro date value ({@code Integer}, {@link LocalDate} or a Joda value) into the
+   * {@code int} used for Flink's DATE type.
+   */
+  private static int convertToDate(Object object) {
+    if (object instanceof Integer) {
+      return (Integer) object;
+    } else if (object instanceof LocalDate) {
+      return (int) ((LocalDate) object).toEpochDay();
+    } else {
+      JodaConverter jodaConverter = JodaConverter.getConverter();
+      if (jodaConverter != null) {
+        return (int) jodaConverter.convertDate(object);
+      } else {
+        throw new IllegalArgumentException(
+            "Unexpected object type for DATE logical type. Received: " + object);
+      }
+    }
+  }
+
+  /**
+   * Converts an Avro time value ({@code Integer}, {@link LocalTime} or a Joda value) into
+   * milliseconds of the day.
+   */
+  private static int convertToTime(Object object) {
+    final int millis;
+    if (object instanceof Integer) {
+      millis = (Integer) object;
+    } else if (object instanceof LocalTime) {
+      millis = ((LocalTime) object).get(ChronoField.MILLI_OF_DAY);
+    } else {
+      JodaConverter jodaConverter = JodaConverter.getConverter();
+      if (jodaConverter != null) {
+        millis = jodaConverter.convertTime(object);
+      } else {
+        throw new IllegalArgumentException(
+            "Unexpected object type for TIME logical type. Received: " + object);
+      }
+    }
+    return millis;
+  }
+
+  /** Extracts the bytes of a {@link GenericFixed}, a {@link ByteBuffer} or a {@code byte[]}. */
+  private static byte[] convertToBytes(Object object) {
+    if (object instanceof GenericFixed) {
+      return ((GenericFixed) object).bytes();
+    } else if (object instanceof ByteBuffer) {
+      ByteBuffer byteBuffer = (ByteBuffer) object;
+      byte[] bytes = new byte[byteBuffer.remaining()];
+      byteBuffer.get(bytes);
+      return bytes;
+    } else {
+      return (byte[]) object;
+    }
+  }
+}
diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/formats/avro/JodaConverter.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/formats/avro/JodaConverter.java
new file mode 100644
index 000000000000..c30b78023345
--- /dev/null
+++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/formats/avro/JodaConverter.java
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.
See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.flink.formats.avro;
+
+import org.joda.time.DateTime;
+import org.joda.time.DateTimeFieldType;
+import org.joda.time.LocalDate;
+import org.joda.time.LocalTime;
+
+/**
+ * Wraps the optional Joda-Time dependency. An instance is only created when
+ * {@code org.joda.time.DateTime} can be found on the classpath.
+ */
+@SuppressWarnings("JavaUtilDate")
+class JodaConverter {
+
+  private static JodaConverter instance;
+  private static boolean instantiated = false;
+
+  /**
+   * Returns the shared converter, or {@code null} when Joda-Time is not available. The classpath
+   * lookup is attempted on the first call only; later calls return the remembered result.
+   */
+  public static JodaConverter getConverter() {
+    if (!instantiated) {
+      try {
+        ClassLoader contextLoader = Thread.currentThread().getContextClassLoader();
+        // Probe without initializing the class; only its presence matters here.
+        Class.forName("org.joda.time.DateTime", false, contextLoader);
+        instance = new JodaConverter();
+      } catch (ClassNotFoundException e) {
+        instance = null;
+      } finally {
+        instantiated = true;
+      }
+    }
+    return instance;
+  }
+
+  /** Returns the epoch-millisecond time of the given Joda {@link LocalDate}. */
+  public long convertDate(Object object) {
+    return ((LocalDate) object).toDate().getTime();
+  }
+
+  /** Returns the millisecond-of-day of the given Joda {@link LocalTime}. */
+  public int convertTime(Object object) {
+    LocalTime time = (LocalTime) object;
+    return time.get(DateTimeFieldType.millisOfDay());
+  }
+
+  /** Returns the epoch-millisecond time of the given Joda {@link DateTime}. */
+  public long convertTimestamp(Object object) {
+    return ((DateTime) object).toDate().getTime();
+  }
+
+  private JodaConverter() {}
+}
diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/formats/avro/RowDataToAvroConverters.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/formats/avro/RowDataToAvroConverters.java
new file mode 100644
index 000000000000..d4c7e4282d6e
--- /dev/null
+++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/formats/avro/RowDataToAvroConverters.java
@@ -0,0 +1,394 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.
You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.flink.formats.avro;
+
+import java.io.Serializable;
+import java.nio.ByteBuffer;
+import java.time.ZoneOffset;
+import java.util.List;
+import java.util.Map;
+import org.apache.avro.Schema;
+import org.apache.avro.generic.GenericData;
+import org.apache.avro.generic.GenericRecord;
+import org.apache.avro.util.Utf8;
+import org.apache.flink.annotation.Internal;
+import org.apache.flink.table.data.ArrayData;
+import org.apache.flink.table.data.DecimalData;
+import org.apache.flink.table.data.MapData;
+import org.apache.flink.table.data.RowData;
+import org.apache.flink.table.data.TimestampData;
+import org.apache.flink.table.types.logical.ArrayType;
+import org.apache.flink.table.types.logical.LogicalType;
+import org.apache.flink.table.types.logical.RowType;
+import org.apache.flink.util.CollectionUtil;
+import org.apache.iceberg.flink.formats.avro.typeutils.AvroSchemaConverter;
+import org.apache.iceberg.relocated.com.google.common.collect.Lists;
+
+/**
+ * Tool class used to convert from {@link RowData} to Avro {@link GenericRecord}.
+ *
+ * <p>This class is adapted in Iceberg to add support for nanosecond precision timestamps
+ * (FLINK-39251). Once that ticket is resolved in Flink, this custom converter may be removed.
+ */
+@Internal
+public class RowDataToAvroConverters {
+
+  private RowDataToAvroConverters() {}
+
+  // --------------------------------------------------------------------------------
+  // Runtime Converters
+  // --------------------------------------------------------------------------------
+
+  /**
+   * Runtime converter that converts objects of Flink Table & SQL internal data structures to
+   * corresponding Avro data structures.
+   */
+  @FunctionalInterface
+  public interface RowDataToAvroConverter extends Serializable {
+    Object convert(Schema schema, Object object);
+  }
+
+  // --------------------------------------------------------------------------------
+  // IMPORTANT! We use anonymous classes instead of lambdas for a reason here. It is
+  // necessary because the maven shade plugin cannot relocate classes in
+  // SerializedLambdas (MSHADE-260). On the other hand we want to relocate Avro for
+  // sql-client uber jars.
+  // --------------------------------------------------------------------------------
+
+  /**
+   * Creates a runtime converter according to the given logical type that converts objects of Flink
+   * Table & SQL internal data structures to corresponding Avro data structures.
+   */
+  public static RowDataToAvroConverter createConverter(LogicalType type) {
+    return createConverter(type, true);
+  }
+
+  /**
+   * Creates a null-safe runtime converter for the given logical type.
+   *
+   * <p>The returned converter passes {@code null} through unchanged and, when handed a two-branch
+   * nullable union schema, resolves it to the non-null branch before delegating to the
+   * type-specific converter.
+   *
+   * @param type the Flink logical type to convert from
+   * @param legacyTimestampMapping whether the legacy timestamp mapping is in effect
+   * @throws UnsupportedOperationException for unsupported types, including
+   *     TIMESTAMP_WITH_LOCAL_TIME_ZONE under the legacy mapping
+   */
+  @SuppressWarnings("checkstyle:MethodLength")
+  public static RowDataToAvroConverter createConverter(
+      LogicalType type, boolean legacyTimestampMapping) {
+    final RowDataToAvroConverter converter;
+    switch (type.getTypeRoot()) {
+      case NULL:
+        converter =
+            new RowDataToAvroConverter() {
+              private static final long serialVersionUID = 1L;
+
+              @Override
+              public Object convert(Schema schema, Object object) {
+                return null;
+              }
+            };
+        break;
+      case TINYINT:
+        converter =
+            new RowDataToAvroConverter() {
+              private static final long serialVersionUID = 1L;
+
+              @Override
+              public Object convert(Schema schema, Object object) {
+                return ((Byte) object).intValue();
+              }
+            };
+        break;
+      case SMALLINT:
+        converter =
+            new RowDataToAvroConverter() {
+              private static final long serialVersionUID = 1L;
+
+              @Override
+              public Object convert(Schema schema, Object object) {
+                return ((Short) object).intValue();
+              }
+            };
+        break;
+      case BOOLEAN: // boolean
+      case INTEGER: // int
+      case INTERVAL_YEAR_MONTH: // long
+      case BIGINT: // long
+      case INTERVAL_DAY_TIME: // long
+      case FLOAT: // float
+      case DOUBLE: // double
+      case TIME_WITHOUT_TIME_ZONE: // int
+      case DATE: // int
+        converter =
+            new RowDataToAvroConverter() {
+              private static final long serialVersionUID = 1L;
+
+              @Override
+              public Object convert(Schema schema, Object object) {
+                return object;
+              }
+            };
+        break;
+      case CHAR:
+      case VARCHAR:
+        converter =
+            new RowDataToAvroConverter() {
+              private static final long serialVersionUID = 1L;
+
+              @Override
+              public Object convert(Schema schema, Object object) {
+                return new Utf8(object.toString());
+              }
+            };
+        break;
+      case BINARY:
+      case VARBINARY:
+        converter =
+            new RowDataToAvroConverter() {
+              private static final long serialVersionUID = 1L;
+
+              @Override
+              public Object convert(Schema schema, Object object) {
+                return ByteBuffer.wrap((byte[]) object);
+              }
+            };
+        break;
+        // Iceberg: Added support for nanoseconds precision (FLINK-39251)
+      case TIMESTAMP_WITHOUT_TIME_ZONE:
+        final int tzPrecision;
+        if (type instanceof org.apache.flink.table.types.logical.TimestampType) {
+          tzPrecision = ((org.apache.flink.table.types.logical.TimestampType) type).getPrecision();
+        } else {
+          tzPrecision = 3;
+        }
+        if (legacyTimestampMapping) {
+          converter =
+              new RowDataToAvroConverter() {
+                private static final long serialVersionUID = 1L;
+
+                @Override
+                public Object convert(Schema schema, Object object) {
+                  TimestampData timestampData = (TimestampData) object;
+                  if (tzPrecision <= 3) {
+                    return timestampData.getMillisecond();
+                  } else if (tzPrecision <= 6) {
+                    return timestampData.getMillisecond() * 1000L
+                        + timestampData.getNanoOfMillisecond() / 1000;
+                  } else {
+                    // Iceberg: Added support for nanoseconds precision (FLINK-39251)
+                    return timestampData.getMillisecond() * 1_000_000L
+                        + timestampData.getNanoOfMillisecond();
+                  }
+                }
+              };
+        } else {
+          converter =
+              new RowDataToAvroConverter() {
+                private static final long serialVersionUID = 1L;
+
+                @Override
+                public Object convert(Schema schema, Object object) {
+                  TimestampData timestampData = (TimestampData) object;
+                  java.time.Instant instant =
+                      timestampData.toLocalDateTime().toInstant(ZoneOffset.UTC);
+                  if (tzPrecision <= 3) {
+                    return instant.toEpochMilli();
+                  } else if (tzPrecision <= 6) {
+                    return instant.getEpochSecond() * 1_000_000L + instant.getNano() / 1000;
+                  } else {
+                    // Iceberg: Added support for nanoseconds precision (FLINK-39251)
+                    return instant.getEpochSecond() * 1_000_000_000L + instant.getNano();
+                  }
+                }
+              };
+        }
+        break;
+        // Iceberg: Added support for nanoseconds precision (FLINK-39251)
+      case TIMESTAMP_WITH_LOCAL_TIME_ZONE:
+        final int ltzPrecision;
+        if (type instanceof org.apache.flink.table.types.logical.LocalZonedTimestampType) {
+          ltzPrecision =
+              ((org.apache.flink.table.types.logical.LocalZonedTimestampType) type).getPrecision();
+        } else {
+          ltzPrecision = 3;
+        }
+        if (legacyTimestampMapping) {
+          throw new UnsupportedOperationException("Unsupported type: " + type);
+        } else {
+          converter =
+              new RowDataToAvroConverter() {
+                private static final long serialVersionUID = 1L;
+
+                @Override
+                public Object convert(Schema schema, Object object) {
+                  TimestampData timestampData = (TimestampData) object;
+                  if (ltzPrecision <= 3) {
+                    return timestampData.getMillisecond();
+                  } else if (ltzPrecision <= 6) {
+                    return timestampData.getMillisecond() * 1000L
+                        + timestampData.getNanoOfMillisecond() / 1000;
+                  } else {
+                    // Iceberg: Added support for nanoseconds precision (FLINK-39251)
+                    return timestampData.getMillisecond() * 1_000_000L
+                        + timestampData.getNanoOfMillisecond();
+                  }
+                }
+              };
+        }
+        break;
+      case DECIMAL:
+        converter =
+            new RowDataToAvroConverter() {
+              private static final long serialVersionUID = 1L;
+
+              @Override
+              public Object convert(Schema schema, Object object) {
+                return ByteBuffer.wrap(((DecimalData) object).toUnscaledBytes());
+              }
+            };
+        break;
+      case ARRAY:
+        converter = createArrayConverter((ArrayType) type, legacyTimestampMapping);
+        break;
+      case ROW:
+        converter = createRowConverter((RowType) type, legacyTimestampMapping);
+        break;
+      case MAP:
+      case MULTISET:
+        converter = createMapConverter(type, legacyTimestampMapping);
+        break;
+      case RAW:
+      default:
+        throw new UnsupportedOperationException("Unsupported type: " + type);
+    }
+
+    // wrap into nullable converter
+    return new RowDataToAvroConverter() {
+      private static final long serialVersionUID = 1L;
+
+      @Override
+      public Object convert(Schema schema, Object object) {
+        if (object == null) {
+          return null;
+        }
+
+        // get actual schema if it is a nullable schema
+        Schema actualSchema;
+        if (schema.getType() == Schema.Type.UNION) {
+          List<Schema> types = schema.getTypes();
+          int size = types.size();
+          if (size == 2 && types.get(1).getType() == Schema.Type.NULL) {
+            actualSchema = types.get(0);
+          } else if (size == 2 && types.get(0).getType() == Schema.Type.NULL) {
+            actualSchema = types.get(1);
+          } else {
+            throw new IllegalArgumentException(
+                "The Avro schema is not a nullable type: " + schema.toString());
+          }
+        } else {
+          actualSchema = schema;
+        }
+        return converter.convert(actualSchema, object);
+      }
+    };
+  }
+
+  /**
+   * Creates a converter that turns a {@link RowData} into an Avro {@link GenericRecord}, converting
+   * fields positionally. A failure on any field is rethrown as a {@link RuntimeException} naming
+   * that field.
+   */
+  private static RowDataToAvroConverter createRowConverter(
+      RowType rowType, boolean legacyTimestampMapping) {
+    final RowDataToAvroConverter[] fieldConverters =
+        rowType.getChildren().stream()
+            .map(legacyType -> createConverter(legacyType, legacyTimestampMapping))
+            .toArray(RowDataToAvroConverter[]::new);
+    final LogicalType[] fieldTypes =
+        rowType.getFields().stream().map(RowType.RowField::getType).toArray(LogicalType[]::new);
+    final RowData.FieldGetter[] fieldGetters = new RowData.FieldGetter[fieldTypes.length];
+    for (int i = 0; i < fieldTypes.length; i++) {
+      fieldGetters[i] = RowData.createFieldGetter(fieldTypes[i], i);
+    }
+    final int length = rowType.getFieldCount();
+
+    return new RowDataToAvroConverter() {
+      private static final long serialVersionUID = 1L;
+
+      @Override
+      public Object convert(Schema schema, Object object) {
+        final RowData row = (RowData) object;
+        final List<Schema.Field> fields = schema.getFields();
+        final GenericRecord record = new GenericData.Record(schema);
+        for (int i = 0; i < length; ++i) {
+          final Schema.Field schemaField = fields.get(i);
+          try {
+            Object avroObject =
+                fieldConverters[i].convert(
+                    schemaField.schema(), fieldGetters[i].getFieldOrNull(row));
+            record.put(i, avroObject);
+          } catch (Throwable t) {
+            throw new RuntimeException(
+                String.format("Fail to serialize at field: %s.", schemaField.name()), t);
+          }
+        }
+        return record;
+      }
+    };
+  }
+
+  /** Creates a converter that turns an {@link ArrayData} into a list of converted elements. */
+  private static RowDataToAvroConverter createArrayConverter(
+      ArrayType arrayType, boolean legacyTimestampMapping) {
+    LogicalType elementType = arrayType.getElementType();
+    final ArrayData.ElementGetter elementGetter = ArrayData.createElementGetter(elementType);
+    final RowDataToAvroConverter elementConverter =
+        createConverter(arrayType.getElementType(), legacyTimestampMapping);
+
+    return new RowDataToAvroConverter() {
+      private static final long serialVersionUID = 1L;
+
+      @Override
+      public Object convert(Schema schema, Object object) {
+        final Schema elementSchema = schema.getElementType();
+        ArrayData arrayData = (ArrayData) object;
+        List<Object> list = Lists.newArrayList();
+        for (int i = 0; i < arrayData.size(); ++i) {
+          list.add(
+              elementConverter.convert(
+                  elementSchema, elementGetter.getElementOrNull(arrayData, i)));
+        }
+        return list;
+      }
+    };
+  }
+
+  /**
+   * Creates a converter that turns a {@link MapData} into a {@link Map} with string keys and
+   * converted values.
+   */
+  private static RowDataToAvroConverter createMapConverter(
+      LogicalType type, boolean legacyTimestampMapping) {
+    LogicalType valueType = AvroSchemaConverter.extractValueTypeToAvroMap(type);
+    final ArrayData.ElementGetter valueGetter = ArrayData.createElementGetter(valueType);
+    final RowDataToAvroConverter valueConverter =
+        createConverter(valueType, legacyTimestampMapping);
+
+    return new RowDataToAvroConverter() {
+      private static final long serialVersionUID = 1L;
+
+      @Override
+      public Object convert(Schema schema, Object object) {
+        final Schema valueSchema = schema.getValueType();
+        final MapData mapData = (MapData) object;
+        final ArrayData keyArray = mapData.keyArray();
+        final ArrayData valueArray = mapData.valueArray();
+        final Map<Object, Object> map = CollectionUtil.newHashMapWithExpectedSize(mapData.size());
+        for (int i = 0; i < mapData.size(); ++i) {
+          final String key = keyArray.getString(i).toString();
+          final Object value =
+              valueConverter.convert(valueSchema, valueGetter.getElementOrNull(valueArray, i));
+          map.put(key, value);
+        }
+        return map;
+      }
+    };
+  }
+}
diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/formats/avro/typeutils/AvroSchemaConverter.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/formats/avro/typeutils/AvroSchemaConverter.java
new file mode 100644
index 000000000000..fb77c124e504
--- /dev/null
+++ 
b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/formats/avro/typeutils/AvroSchemaConverter.java @@ -0,0 +1,625 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.formats.avro.typeutils; + +import java.util.List; +import org.apache.avro.LogicalTypes; +import org.apache.avro.Schema; +import org.apache.avro.SchemaBuilder; +import org.apache.avro.SchemaParseException; +import org.apache.avro.specific.SpecificData; +import org.apache.avro.specific.SpecificRecord; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.common.typeinfo.Types; +import org.apache.flink.api.java.typeutils.RowTypeInfo; +import org.apache.flink.formats.avro.AvroRowDataDeserializationSchema; +import org.apache.flink.formats.avro.AvroRowDataSerializationSchema; +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.legacy.types.logical.TypeInformationRawType; +import org.apache.flink.table.types.AtomicDataType; +import org.apache.flink.table.types.DataType; +import org.apache.flink.table.types.logical.ArrayType; +import org.apache.flink.table.types.logical.DecimalType; +import org.apache.flink.table.types.logical.IntType; +import 
org.apache.flink.table.types.logical.LocalZonedTimestampType; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.LogicalTypeFamily; +import org.apache.flink.table.types.logical.MapType; +import org.apache.flink.table.types.logical.MultisetType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.table.types.logical.TimeType; +import org.apache.flink.table.types.logical.TimestampType; +import org.apache.flink.types.Row; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; + +/** + * Converts an Avro schema into Flink's type information. It uses {@link RowTypeInfo} for + * representing objects and converts Avro types into types that are compatible with Flink's Table + * & SQL API. + * + *

      Note: Changes in this class need to be kept in sync with the corresponding runtime classes + * {@link AvroRowDataDeserializationSchema} and {@link AvroRowDataSerializationSchema}. + * + *

      This class is adapted in Iceberg to support custom 'timestamp-nanos' and
+ * 'local-timestamp-nanos' logical types (FLINK-39251). Once that ticket is resolved in Flink, these
+ * custom types may be removed.
+ */
+public class AvroSchemaConverter {
+
+  private AvroSchemaConverter() {
+    // private
+  }
+
+  /**
+   * Converts an Avro class into a nested row structure with deterministic field order and data
+   * types that are compatible with Flink's Table & SQL API. Uses the legacy timestamp mapping.
+   *
+   * @param avroClass Avro specific record that contains schema information
+   * @return type information matching the schema
+   */
+  @SuppressWarnings("unchecked")
+  public static TypeInformation convertToTypeInfo(
+      Class avroClass) {
+    return convertToTypeInfo(avroClass, true);
+  }
+
+  /**
+   * Converts an Avro class into a nested row structure with deterministic field order and data
+   * types that are compatible with Flink's Table & SQL API.
+   *
+   * @param avroClass Avro specific record that contains schema information
+   * @param legacyTimestampMapping legacy mapping of timestamp types
+   * @return type information matching the schema
+   */
+  @SuppressWarnings("unchecked")
+  public static TypeInformation convertToTypeInfo(
+      Class avroClass, boolean legacyTimestampMapping) {
+    Preconditions.checkNotNull(avroClass, "Avro specific record class must not be null.");
+    // determine schema to retrieve deterministic field order
+    final Schema schema = SpecificData.get().getSchema(avroClass);
+    // Honor the caller's choice instead of always forcing the legacy mapping.
+    return (TypeInformation) convertToTypeInfo(schema, legacyTimestampMapping);
+  }
+
+  /**
+   * Converts an Avro schema string into a nested row structure with deterministic field order and
+   * data types that are compatible with Flink's Table & SQL API.
+   *
+   * @param avroSchemaString Avro schema definition string
+   * @return type information matching the schema
+   */
+  @SuppressWarnings("unchecked")
+  public static TypeInformation convertToTypeInfo(String avroSchemaString) {
+    return convertToTypeInfo(avroSchemaString, true);
+  }
+
+  /**
+   * Converts an Avro schema string into a nested row structure with deterministic field order and
+   * data types that are compatible with Flink's Table & SQL API.
+   *
+   * @param avroSchemaString Avro schema definition string
+   * @param legacyTimestampMapping legacy mapping of timestamp types
+   * @return type information matching the schema
+   */
+  @SuppressWarnings("unchecked")
+  public static TypeInformation convertToTypeInfo(
+      String avroSchemaString, boolean legacyTimestampMapping) {
+    Preconditions.checkNotNull(avroSchemaString, "Avro schema must not be null.");
+    final Schema schema;
+    try {
+      schema = new Schema.Parser().parse(avroSchemaString);
+    } catch (SchemaParseException e) {
+      throw new IllegalArgumentException("Could not parse Avro schema string.", e);
+    }
+    return (TypeInformation) convertToTypeInfo(schema, legacyTimestampMapping);
+  }
+
+  /**
+   * Recursively maps an Avro schema to Flink type information.
+   *
+   * <p>Unions of a type with null (or single-branch unions) resolve to the non-null branch; any
+   * other union falls back to a generic type. Long logical timestamp types map to SQL timestamp
+   * under the legacy mapping, and to instant / local date-time otherwise.
+   *
+   * @throws IllegalArgumentException if the Avro type is not handled by the switch
+   */
+  private static TypeInformation convertToTypeInfo(
+      Schema schema, boolean legacyTimestampMapping) {
+    switch (schema.getType()) {
+      case RECORD:
+        final List<Schema.Field> fields = schema.getFields();
+
+        final TypeInformation[] types = new TypeInformation[fields.size()];
+        final String[] names = new String[fields.size()];
+        for (int i = 0; i < fields.size(); i++) {
+          final Schema.Field field = fields.get(i);
+          types[i] = convertToTypeInfo(field.schema(), legacyTimestampMapping);
+          names[i] = field.name();
+        }
+        return Types.ROW_NAMED(names, types);
+      case ENUM:
+        return Types.STRING;
+      case ARRAY:
+        // result type might either be ObjectArrayTypeInfo or BasicArrayTypeInfo for Strings
+        return Types.OBJECT_ARRAY(
+            convertToTypeInfo(schema.getElementType(), legacyTimestampMapping));
+      case MAP:
+        return Types.MAP(
+            Types.STRING, convertToTypeInfo(schema.getValueType(), legacyTimestampMapping));
+      case UNION:
+        final Schema actualSchema;
+        if (schema.getTypes().size() == 2
+            && schema.getTypes().get(0).getType() == Schema.Type.NULL) {
+          actualSchema = schema.getTypes().get(1);
+        } else if (schema.getTypes().size() == 2
+            && schema.getTypes().get(1).getType() == Schema.Type.NULL) {
+          actualSchema = schema.getTypes().get(0);
+        } else if (schema.getTypes().size() == 1) {
+          actualSchema = schema.getTypes().get(0);
+        } else {
+          // use Kryo for serialization
+          return Types.GENERIC(Object.class);
+        }
+        return convertToTypeInfo(actualSchema, legacyTimestampMapping);
+      case FIXED:
+        // logical decimal type
+        if (schema.getLogicalType() instanceof LogicalTypes.Decimal) {
+          return Types.BIG_DEC;
+        }
+        // convert fixed size binary data to primitive byte arrays
+        return Types.PRIMITIVE_ARRAY(Types.BYTE);
+      case STRING:
+        // convert Avro's Utf8/CharSequence to String
+        return Types.STRING;
+      case BYTES:
+        // logical decimal type
+        if (schema.getLogicalType() instanceof LogicalTypes.Decimal) {
+          return Types.BIG_DEC;
+        }
+        return Types.PRIMITIVE_ARRAY(Types.BYTE);
+      case INT:
+        // logical date and time type
+        final org.apache.avro.LogicalType logicalType = schema.getLogicalType();
+        if (logicalType == LogicalTypes.date()) {
+          return Types.SQL_DATE;
+        } else if (logicalType == LogicalTypes.timeMillis()) {
+          return Types.SQL_TIME;
+        }
+        return Types.INT;
+      case LONG:
+        if (legacyTimestampMapping) {
+          if (schema.getLogicalType() == LogicalTypes.timestampMillis()
+              || schema.getLogicalType() == LogicalTypes.timestampMicros()
+              // Iceberg: Added support for custom nanosecond logical type (FLINK-39251)
+              || (schema.getLogicalType() != null
+                  && schema.getLogicalType().getName().equals("timestamp-nanos"))) {
+            return Types.SQL_TIMESTAMP;
+          } else if (schema.getLogicalType() == LogicalTypes.timeMicros()
+              || schema.getLogicalType() == LogicalTypes.timeMillis()) {
+            return Types.SQL_TIME;
+          }
+        } else {
+          // Avro logical timestamp types to Flink DataStream timestamp types
+          if (schema.getLogicalType() == LogicalTypes.timestampMillis()
+              || schema.getLogicalType() == LogicalTypes.timestampMicros()
+              // Iceberg: Added support for custom nanosecond logical type (FLINK-39251)
+              || (schema.getLogicalType() != null
+                  && schema.getLogicalType().getName().equals("timestamp-nanos"))) {
+            return Types.INSTANT;
+          } else if (schema.getLogicalType() == LogicalTypes.localTimestampMillis()
+              || schema.getLogicalType() == LogicalTypes.localTimestampMicros()
+              // Iceberg: Added support for custom nanosecond logical type (FLINK-39251)
+              || (schema.getLogicalType() != null
+                  && schema.getLogicalType().getName().equals("local-timestamp-nanos"))) {
+            return Types.LOCAL_DATE_TIME;
+          } else if (schema.getLogicalType() == LogicalTypes.timeMicros()
+              || schema.getLogicalType() == LogicalTypes.timeMillis()) {
+            return Types.SQL_TIME;
+          }
+        }
+        // No recognized logical type: plain long.
+        return Types.LONG;
+      case FLOAT:
+        return Types.FLOAT;
+      case DOUBLE:
+        return Types.DOUBLE;
+      case BOOLEAN:
+        return Types.BOOLEAN;
+      case NULL:
+        return Types.VOID;
+    }
+    throw new IllegalArgumentException("Unsupported Avro type '" + schema.getType() + "'.");
+  }
+
+  /**
+   * Converts an Avro schema string into a nested row structure with deterministic field order and
+   * data types that are compatible with Flink's Table & SQL API.
+   *
+   * @param avroSchemaString Avro schema definition string
+   * @return data type matching the schema
+   */
+  public static DataType convertToDataType(String avroSchemaString) {
+    return convertToDataType(avroSchemaString, true);
+  }
+
+  /**
+   * Converts an Avro schema string into a nested row structure with deterministic field order and
+   * data types that are compatible with Flink's Table & SQL API.
+   *
+   * @param avroSchemaString Avro schema definition string
+   * @param legacyTimestampMapping legacy mapping of local timestamps
+   * @return data type matching the schema
+   */
+  public static DataType convertToDataType(
+      String avroSchemaString, boolean legacyTimestampMapping) {
+    Preconditions.checkNotNull(avroSchemaString, "Avro schema must not be null.");
+    final Schema schema;
+    try {
+      schema = new Schema.Parser().parse(avroSchemaString);
+    } catch (SchemaParseException e) {
+      throw new IllegalArgumentException("Could not parse Avro schema string.", e);
+    }
+    return convertToDataType(schema, legacyTimestampMapping);
+  }
+
+  /**
+   * Recursively maps an Avro schema to a Flink SQL {@link DataType}.
+   *
+   * <p>Every non-union type is returned as NOT NULL; a union with a null branch yields the
+   * nullable form of its other branch. Under the legacy mapping, long timestamp logical types map
+   * to TIMESTAMP; otherwise they map to TIMESTAMP_WITH_LOCAL_TIME_ZONE and the local-timestamp
+   * logical types map to TIMESTAMP.
+   *
+   * @throws IllegalArgumentException if the Avro type is not handled by the switch
+   */
+  @SuppressWarnings("deprecation")
+  private static DataType convertToDataType(Schema schema, boolean legacyMapping) {
+    switch (schema.getType()) {
+      case RECORD:
+        final List<Schema.Field> schemaFields = schema.getFields();
+
+        final DataTypes.Field[] fields = new DataTypes.Field[schemaFields.size()];
+        for (int i = 0; i < schemaFields.size(); i++) {
+          final Schema.Field field = schemaFields.get(i);
+          fields[i] =
+              DataTypes.FIELD(field.name(), convertToDataType(field.schema(), legacyMapping));
+        }
+        return DataTypes.ROW(fields).notNull();
+      case ENUM:
+        return DataTypes.STRING().notNull();
+      case ARRAY:
+        return DataTypes.ARRAY(convertToDataType(schema.getElementType(), legacyMapping)).notNull();
+      case MAP:
+        return DataTypes.MAP(
+                DataTypes.STRING().notNull(),
+                convertToDataType(schema.getValueType(), legacyMapping))
+            .notNull();
+      case UNION:
+        final Schema actualSchema;
+        final boolean nullable;
+        if (schema.getTypes().size() == 2
+            && schema.getTypes().get(0).getType() == Schema.Type.NULL) {
+          actualSchema = schema.getTypes().get(1);
+          nullable = true;
+        } else if (schema.getTypes().size() == 2
+            && schema.getTypes().get(1).getType() == Schema.Type.NULL) {
+          actualSchema = schema.getTypes().get(0);
+          nullable = true;
+        } else if (schema.getTypes().size() == 1) {
+          actualSchema = schema.getTypes().get(0);
+          nullable = false;
+        } else {
+          // use Kryo for serialization
+          return new AtomicDataType(
+              new TypeInformationRawType<>(false, Types.GENERIC(Object.class)));
+        }
+        DataType converted = convertToDataType(actualSchema, legacyMapping);
+        return nullable ? converted.nullable() : converted;
+      case FIXED:
+        // logical decimal type
+        if (schema.getLogicalType() instanceof LogicalTypes.Decimal) {
+          final LogicalTypes.Decimal decimalType = (LogicalTypes.Decimal) schema.getLogicalType();
+          return DataTypes.DECIMAL(decimalType.getPrecision(), decimalType.getScale()).notNull();
+        }
+        // convert fixed size binary data to primitive byte arrays
+        return DataTypes.VARBINARY(schema.getFixedSize()).notNull();
+      case STRING:
+        // convert Avro's Utf8/CharSequence to String
+        return DataTypes.STRING().notNull();
+      case BYTES:
+        // logical decimal type
+        if (schema.getLogicalType() instanceof LogicalTypes.Decimal) {
+          final LogicalTypes.Decimal decimalType = (LogicalTypes.Decimal) schema.getLogicalType();
+          return DataTypes.DECIMAL(decimalType.getPrecision(), decimalType.getScale()).notNull();
+        }
+        return DataTypes.BYTES().notNull();
+      case INT:
+        // logical date and time type
+        final org.apache.avro.LogicalType logicalType = schema.getLogicalType();
+        if (logicalType == LogicalTypes.date()) {
+          return DataTypes.DATE().notNull();
+        } else if (logicalType == LogicalTypes.timeMillis()) {
+          return DataTypes.TIME(3).notNull();
+        }
+        return DataTypes.INT().notNull();
+      case LONG:
+        if (legacyMapping) {
+          // Avro logical timestamp types to Flink SQL timestamp types
+          if (schema.getLogicalType() == LogicalTypes.timestampMillis()) {
+            return DataTypes.TIMESTAMP(3).notNull();
+          } else if (schema.getLogicalType() == LogicalTypes.timestampMicros()) {
+            return DataTypes.TIMESTAMP(6).notNull();
+          } else if (schema.getLogicalType() != null
+              && schema.getLogicalType().getName().equals("timestamp-nanos")) {
+            // Iceberg: Added support for custom nanosecond logical type (FLINK-39251)
+            return DataTypes.TIMESTAMP(9).notNull();
+          } else if (schema.getLogicalType() == LogicalTypes.timeMillis()) {
+            return DataTypes.TIME(3).notNull();
+          } else if (schema.getLogicalType() == LogicalTypes.timeMicros()) {
+            return DataTypes.TIME(6).notNull();
+          }
+        } else {
+          // Avro logical timestamp types to Flink SQL timestamp types
+          if (schema.getLogicalType() == LogicalTypes.timestampMillis()) {
+            return DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE(3).notNull();
+          } else if (schema.getLogicalType() == LogicalTypes.timestampMicros()) {
+            return DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE(6).notNull();
+          } else if (schema.getLogicalType() != null
+              && schema.getLogicalType().getName().equals("timestamp-nanos")) {
+            // Iceberg: Added support for custom nanosecond logical type (FLINK-39251)
+            return DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE(9).notNull();
+          } else if (schema.getLogicalType() == LogicalTypes.timeMillis()) {
+            return DataTypes.TIME(3).notNull();
+          } else if (schema.getLogicalType() == LogicalTypes.timeMicros()) {
+            return DataTypes.TIME(6).notNull();
+          } else if (schema.getLogicalType() == LogicalTypes.localTimestampMillis()) {
+            return DataTypes.TIMESTAMP(3).notNull();
+          } else if (schema.getLogicalType() == LogicalTypes.localTimestampMicros()) {
+            return DataTypes.TIMESTAMP(6).notNull();
+          } else if (schema.getLogicalType() != null
+              && schema.getLogicalType().getName().equals("local-timestamp-nanos")) {
+            // Iceberg: Added support for custom nanosecond logical type (FLINK-39251)
+            return DataTypes.TIMESTAMP(9).notNull();
+          }
+        }
+
+        // No recognized logical type: plain BIGINT.
+        return DataTypes.BIGINT().notNull();
+      case FLOAT:
+        return DataTypes.FLOAT().notNull();
+      case DOUBLE:
+        return DataTypes.DOUBLE().notNull();
+      case BOOLEAN:
+        return DataTypes.BOOLEAN().notNull();
+      case NULL:
+        return DataTypes.NULL();
+    }
+    throw new IllegalArgumentException("Unsupported Avro type '" + schema.getType() + "'.");
+  }
+
+  /**
+   * Converts Flink SQL {@link LogicalType} (can be nested) into an Avro schema.
+   *
+   *

      Use "org.apache.flink.avro.generated.record" as the type name. + * + * @param schema the schema type, usually it should be the top level record type, e.g. not a + * nested type + * @return Avro's {@link Schema} matching this logical type. + */ + public static Schema convertToSchema(LogicalType schema) { + return convertToSchema(schema, true); + } + + /** + * Converts Flink SQL {@link LogicalType} (can be nested) into an Avro schema. + * + *

      Use "org.apache.flink.avro.generated.record" as the type name. + * + * @param schema the schema type, usually it should be the top level record type, e.g. not a + * nested type + * @param legacyTimestampMapping whether to use the legacy timestamp mapping + * @return Avro's {@link Schema} matching this logical type. + */ + public static Schema convertToSchema(LogicalType schema, boolean legacyTimestampMapping) { + return convertToSchema( + schema, "org.apache.flink.avro.generated.record", legacyTimestampMapping); + } + + /** + * Converts Flink SQL {@link LogicalType} (can be nested) into an Avro schema. + * + *

      The "{rowName}_" is used as the nested row type name prefix in order to generate the right + * schema. Nested record type that only differs with type name is still compatible. + * + * @param logicalType logical type + * @param rowName the record name + * @return Avro's {@link Schema} matching this logical type. + */ + public static Schema convertToSchema(LogicalType logicalType, String rowName) { + return convertToSchema(logicalType, rowName, true); + } + + /** + * Converts Flink SQL {@link LogicalType} (can be nested) into an Avro schema. + * + *

      The "{rowName}_" is used as the nested row type name prefix in order to generate the right + * schema. Nested record type that only differs with type name is still compatible. + * + * @param logicalType logical type + * @param rowName the record name + * @param legacyTimestampMapping whether to use legal timestamp mapping + * @return Avro's {@link Schema} matching this logical type. + */ + public static Schema convertToSchema( + LogicalType logicalType, String rowName, boolean legacyTimestampMapping) { + int precision; + boolean nullable = logicalType.isNullable(); + switch (logicalType.getTypeRoot()) { + case NULL: + return SchemaBuilder.builder().nullType(); + case BOOLEAN: + Schema bool = SchemaBuilder.builder().booleanType(); + return nullable ? nullableSchema(bool) : bool; + case TINYINT: + case SMALLINT: + case INTEGER: + Schema integer = SchemaBuilder.builder().intType(); + return nullable ? nullableSchema(integer) : integer; + case BIGINT: + Schema bigint = SchemaBuilder.builder().longType(); + return nullable ? nullableSchema(bigint) : bigint; + case FLOAT: + Schema floatSchema = SchemaBuilder.builder().floatType(); + return nullable ? nullableSchema(floatSchema) : floatSchema; + case DOUBLE: + Schema doubleSchema = SchemaBuilder.builder().doubleType(); + return nullable ? nullableSchema(doubleSchema) : doubleSchema; + case CHAR: + case VARCHAR: + Schema str = SchemaBuilder.builder().stringType(); + return nullable ? nullableSchema(str) : str; + case BINARY: + case VARBINARY: + Schema binary = SchemaBuilder.builder().bytesType(); + return nullable ? 
nullableSchema(binary) : binary; + case TIMESTAMP_WITHOUT_TIME_ZONE: + // use long to represents Timestamp + final TimestampType timestampType = (TimestampType) logicalType; + precision = timestampType.getPrecision(); + org.apache.avro.LogicalType avroLogicalType; + if (legacyTimestampMapping) { + if (precision <= 3) { + avroLogicalType = LogicalTypes.timestampMillis(); + } else { + throw new IllegalArgumentException( + "Avro does not support TIMESTAMP type " + + "with precision: " + + precision + + ", it only supports precision less than 3."); + } + } else { + if (precision <= 3) { + avroLogicalType = LogicalTypes.localTimestampMillis(); + } else if (precision <= 6) { + avroLogicalType = LogicalTypes.localTimestampMicros(); + } else { + throw new IllegalArgumentException( + "Avro does not support LOCAL TIMESTAMP type " + + "with precision: " + + precision + + ", it only supports precision less than 6."); + } + } + Schema timestamp = avroLogicalType.addToSchema(SchemaBuilder.builder().longType()); + return nullable ? nullableSchema(timestamp) : timestamp; + case TIMESTAMP_WITH_LOCAL_TIME_ZONE: + if (legacyTimestampMapping) { + throw new UnsupportedOperationException( + "Unsupported to derive Schema for type: " + logicalType); + } else { + final LocalZonedTimestampType localZonedTimestampType = + (LocalZonedTimestampType) logicalType; + precision = localZonedTimestampType.getPrecision(); + if (precision <= 3) { + avroLogicalType = LogicalTypes.timestampMillis(); + } else if (precision <= 6) { + avroLogicalType = LogicalTypes.timestampMicros(); + } else { + throw new IllegalArgumentException( + "Avro does not support TIMESTAMP type " + + "with precision: " + + precision + + ", it only supports precision less than 6."); + } + timestamp = avroLogicalType.addToSchema(SchemaBuilder.builder().longType()); + return nullable ? 
nullableSchema(timestamp) : timestamp; + } + case DATE: + // use int to represents Date + Schema date = LogicalTypes.date().addToSchema(SchemaBuilder.builder().intType()); + return nullable ? nullableSchema(date) : date; + case TIME_WITHOUT_TIME_ZONE: + precision = ((TimeType) logicalType).getPrecision(); + if (precision > 3) { + throw new IllegalArgumentException( + "Avro does not support TIME type with precision: " + + precision + + ", it only supports precision less than 3."); + } + // use int to represents Time, we only support millisecond when deserialization + Schema time = LogicalTypes.timeMillis().addToSchema(SchemaBuilder.builder().intType()); + return nullable ? nullableSchema(time) : time; + case DECIMAL: + DecimalType decimalType = (DecimalType) logicalType; + // store BigDecimal as byte[] + Schema decimal = + LogicalTypes.decimal(decimalType.getPrecision(), decimalType.getScale()) + .addToSchema(SchemaBuilder.builder().bytesType()); + return nullable ? nullableSchema(decimal) : decimal; + case ROW: + RowType rowType = (RowType) logicalType; + List fieldNames = rowType.getFieldNames(); + // we have to make sure the record name is different in a Schema + SchemaBuilder.FieldAssembler builder = + SchemaBuilder.builder().record(rowName).fields(); + for (int i = 0; i < rowType.getFieldCount(); i++) { + String fieldName = fieldNames.get(i); + LogicalType fieldType = rowType.getTypeAt(i); + SchemaBuilder.GenericDefault fieldBuilder = + builder + .name(fieldName) + .type( + convertToSchema( + fieldType, rowName + "_" + fieldName, legacyTimestampMapping)); + + if (fieldType.isNullable()) { + builder = fieldBuilder.withDefault(null); + } else { + builder = fieldBuilder.noDefault(); + } + } + Schema record = builder.endRecord(); + return nullable ? nullableSchema(record) : record; + case MULTISET: + case MAP: + Schema map = + SchemaBuilder.builder() + .map() + .values(convertToSchema(extractValueTypeToAvroMap(logicalType), rowName)); + return nullable ? 
nullableSchema(map) : map; + case ARRAY: + ArrayType arrayType = (ArrayType) logicalType; + Schema array = + SchemaBuilder.builder() + .array() + .items(convertToSchema(arrayType.getElementType(), rowName)); + return nullable ? nullableSchema(array) : array; + case RAW: + default: + throw new UnsupportedOperationException( + "Unsupported to derive Schema for type: " + logicalType); + } + } + + public static LogicalType extractValueTypeToAvroMap(LogicalType type) { + LogicalType keyType; + LogicalType valueType; + if (type instanceof MapType) { + MapType mapType = (MapType) type; + keyType = mapType.getKeyType(); + valueType = mapType.getValueType(); + } else { + MultisetType multisetType = (MultisetType) type; + keyType = multisetType.getElementType(); + valueType = new IntType(); + } + if (!keyType.is(LogicalTypeFamily.CHARACTER_STRING)) { + throw new UnsupportedOperationException( + "Avro format doesn't support non-string as key type of map. " + + "The key type is: " + + keyType.asSummaryString()); + } + return valueType; + } + + /** Returns schema with nullable true. */ + private static Schema nullableSchema(Schema schema) { + return schema.isNullable() + ? 
schema + : Schema.createUnion(SchemaBuilder.builder().nullType(), schema); + } +} diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/AvroGenericRecordToRowDataMapper.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/AvroGenericRecordToRowDataMapper.java index f7e8e0c884cf..5f3494330cfc 100644 --- a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/AvroGenericRecordToRowDataMapper.java +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/sink/AvroGenericRecordToRowDataMapper.java @@ -21,14 +21,14 @@ import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; import org.apache.flink.api.common.functions.MapFunction; -import org.apache.flink.formats.avro.AvroToRowDataConverters; -import org.apache.flink.formats.avro.typeutils.AvroSchemaConverter; import org.apache.flink.table.data.RowData; import org.apache.flink.table.types.DataType; import org.apache.flink.table.types.logical.LogicalType; import org.apache.flink.table.types.logical.RowType; import org.apache.flink.table.types.utils.TypeConversions; import org.apache.iceberg.avro.AvroSchemaUtil; +import org.apache.iceberg.flink.formats.avro.AvroToRowDataConverters; +import org.apache.iceberg.flink.formats.avro.typeutils.AvroSchemaConverter; /** * This util class converts Avro GenericRecord to Flink RowData.
      diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/RowDataToAvroGenericRecordConverter.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/RowDataToAvroGenericRecordConverter.java index 8ef1f1fbb833..d74b8b9d620f 100644 --- a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/RowDataToAvroGenericRecordConverter.java +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/RowDataToAvroGenericRecordConverter.java @@ -23,8 +23,6 @@ import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; import org.apache.flink.annotation.Internal; -import org.apache.flink.formats.avro.RowDataToAvroConverters; -import org.apache.flink.formats.avro.typeutils.AvroSchemaConverter; import org.apache.flink.table.data.RowData; import org.apache.flink.table.types.DataType; import org.apache.flink.table.types.logical.LogicalType; @@ -32,6 +30,8 @@ import org.apache.flink.table.types.utils.TypeConversions; import org.apache.iceberg.avro.AvroSchemaUtil; import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.formats.avro.RowDataToAvroConverters; +import org.apache.iceberg.flink.formats.avro.typeutils.AvroSchemaConverter; /** * This is not serializable because Avro {@link Schema} is not actually serializable, even though it diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/AvroGenericRecordConverter.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/AvroGenericRecordConverter.java index b158b0871a53..cfef780a4daa 100644 --- a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/AvroGenericRecordConverter.java +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/source/reader/AvroGenericRecordConverter.java @@ -21,8 +21,6 @@ import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; import org.apache.flink.api.common.typeinfo.TypeInformation; -import 
org.apache.flink.formats.avro.RowDataToAvroConverters; -import org.apache.flink.formats.avro.typeutils.AvroSchemaConverter; import org.apache.flink.formats.avro.typeutils.GenericRecordAvroTypeInfo; import org.apache.flink.table.data.RowData; import org.apache.flink.table.types.DataType; @@ -31,6 +29,8 @@ import org.apache.flink.table.types.utils.TypeConversions; import org.apache.iceberg.avro.AvroSchemaUtil; import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.formats.avro.RowDataToAvroConverters; +import org.apache.iceberg.flink.formats.avro.typeutils.AvroSchemaConverter; public class AvroGenericRecordConverter implements RowDataConverter { private final Schema avroSchema; diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/DataGenerators.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/DataGenerators.java index e2cd411d7069..795c4fa5a766 100644 --- a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/DataGenerators.java +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/DataGenerators.java @@ -75,6 +75,11 @@ public static class Primitives implements DataGenerator { OffsetDateTime.of(2022, 1, 10, 0, 0, 0, 0, ZoneOffset.UTC); private static final LocalDateTime JAVA_LOCAL_DATE_TIME_20220110 = LocalDateTime.of(2022, 1, 10, 0, 0, 0); + private static final OffsetDateTime JAVA_OFFSET_DATE_TIME_MAX_NANO = + OffsetDateTime.of(2262, 4, 11, 23, 47, 16, 854_775_807, ZoneOffset.UTC); + private static final LocalDateTime JAVA_LOCAL_DATE_TIME_MAX_NANO = + LocalDateTime.of(2262, 4, 11, 23, 47, 16, 854_775_807); + private static final long ICEBERG_MAX_NANOS_EPOCH = 9223372036854775807L; private static final BigDecimal BIG_DECIMAL_NEGATIVE = new BigDecimal("-1.50"); private static final byte[] FIXED_BYTES = "012345689012345".getBytes(StandardCharsets.UTF_8); @@ -96,7 +101,11 @@ public static class Primitives implements DataGenerator { Types.NestedField.required(12, "uuid_field", Types.UUIDType.get()), 
Types.NestedField.required(13, "binary_field", Types.BinaryType.get()), Types.NestedField.required(14, "decimal_field", Types.DecimalType.of(9, 2)), - Types.NestedField.required(15, "fixed_field", Types.FixedType.ofLength(16))); + Types.NestedField.required(15, "fixed_field", Types.FixedType.ofLength(16)), + Types.NestedField.required( + 16, "ts_ns_with_zone_field", Types.TimestampNanoType.withZone()), + Types.NestedField.required( + 17, "ts_ns_without_zone_field", Types.TimestampNanoType.withoutZone())); private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); @@ -171,6 +180,8 @@ public GenericRecord generateIcebergGenericRecord() { genericRecord.setField("time_field", JAVA_LOCAL_TIME_HOUR8); genericRecord.setField("ts_with_zone_field", JAVA_OFFSET_DATE_TIME_20220110); genericRecord.setField("ts_without_zone_field", JAVA_LOCAL_DATE_TIME_20220110); + genericRecord.setField("ts_ns_with_zone_field", JAVA_OFFSET_DATE_TIME_MAX_NANO); + genericRecord.setField("ts_ns_without_zone_field", JAVA_LOCAL_DATE_TIME_MAX_NANO); byte[] uuidBytes = new byte[16]; for (int i = 0; i < 16; ++i) { @@ -220,7 +231,11 @@ public GenericRowData generateFlinkRowData() { uuidBytes, binaryBytes, DecimalData.fromBigDecimal(BIG_DECIMAL_NEGATIVE, 9, 2), - FIXED_BYTES); + FIXED_BYTES, + TimestampData.fromEpochMillis( + ICEBERG_MAX_NANOS_EPOCH / 1_000_000, (int) (ICEBERG_MAX_NANOS_EPOCH % 1_000_000)), + TimestampData.fromEpochMillis( + ICEBERG_MAX_NANOS_EPOCH / 1_000_000, (int) (ICEBERG_MAX_NANOS_EPOCH % 1_000_000))); } @Override @@ -236,10 +251,12 @@ public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { genericRecord.put("date_field", DAYS_BTW_EPOC_AND_20220110); genericRecord.put("time_field", HOUR_8_IN_MILLI); - // Although Avro logical type for timestamp fields are in micro seconds, - // AvroToRowDataConverters only looks for long value in milliseconds. 
- genericRecord.put("ts_with_zone_field", JODA_DATETIME_20220110.getMillis()); - genericRecord.put("ts_without_zone_field", JODA_DATETIME_20220110.getMillis()); + // Now that AvroToRowDataConverters correctly supports microseconds, + // we must inject correct microsecond scale values into the Avro data. + genericRecord.put("ts_with_zone_field", JODA_DATETIME_20220110.getMillis() * 1000L); + genericRecord.put("ts_without_zone_field", JODA_DATETIME_20220110.getMillis() * 1000L); + genericRecord.put("ts_ns_with_zone_field", ICEBERG_MAX_NANOS_EPOCH); + genericRecord.put("ts_ns_without_zone_field", ICEBERG_MAX_NANOS_EPOCH); byte[] uuidBytes = new byte[16]; for (int i = 0; i < 16; ++i) { @@ -554,7 +571,11 @@ public static class ArrayOfPrimitive implements DataGenerator { new Schema( Types.NestedField.required(1, "row_id", Types.StringType.get()), Types.NestedField.required( - 2, "array_of_int", Types.ListType.ofOptional(101, Types.IntegerType.get()))); + 2, "array_of_int", Types.ListType.ofOptional(101, Types.IntegerType.get())), + Types.NestedField.optional( + 3, + "array_of_ts_ns", + Types.ListType.ofRequired(102, Types.TimestampNanoType.withoutZone()))); private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); @@ -581,13 +602,33 @@ public GenericRecord generateIcebergGenericRecord() { GenericRecord genericRecord = GenericRecord.create(icebergSchema); genericRecord.setField("row_id", "row_id_value"); genericRecord.setField("array_of_int", Arrays.asList(1, 2, 3)); + + LocalDateTime posNanos = LocalDateTime.of(2023, 1, 1, 12, 0, 0, 123456789); + LocalDateTime negNanos = LocalDateTime.of(1969, 12, 31, 23, 59, 59, 987654321); + genericRecord.setField("array_of_ts_ns", Arrays.asList(posNanos, negNanos)); return genericRecord; } @Override public GenericRowData generateFlinkRowData() { Integer[] arr = {1, 2, 3}; - return GenericRowData.of(StringData.fromString("row_id_value"), new GenericArrayData(arr)); + + long posNanos = + 
org.apache.iceberg.util.DateTimeUtil.nanosFromTimestamp( + LocalDateTime.of(2023, 1, 1, 12, 0, 0, 123456789)); + long negNanos = + org.apache.iceberg.util.DateTimeUtil.nanosFromTimestamp( + LocalDateTime.of(1969, 12, 31, 23, 59, 59, 987654321)); + TimestampData[] tsArr = { + TimestampData.fromEpochMillis( + Math.floorDiv(posNanos, 1_000_000L), (int) Math.floorMod(posNanos, 1_000_000L)), + TimestampData.fromEpochMillis( + Math.floorDiv(negNanos, 1_000_000L), (int) Math.floorMod(negNanos, 1_000_000L)) + }; + return GenericRowData.of( + StringData.fromString("row_id_value"), + new GenericArrayData(arr), + new GenericArrayData(tsArr)); } @Override @@ -595,6 +636,14 @@ public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { org.apache.avro.generic.GenericRecord genericRecord = new GenericData.Record(avroSchema); genericRecord.put("row_id", "row_id_value"); genericRecord.put("array_of_int", Arrays.asList(1, 2, 3)); + + long posNanos = + org.apache.iceberg.util.DateTimeUtil.nanosFromTimestamp( + LocalDateTime.of(2023, 1, 1, 12, 0, 0, 123456789)); + long negNanos = + org.apache.iceberg.util.DateTimeUtil.nanosFromTimestamp( + LocalDateTime.of(1969, 12, 31, 23, 59, 59, 987654321)); + genericRecord.put("array_of_ts_ns", Arrays.asList(posNanos, negNanos)); return genericRecord; } } @@ -808,7 +857,12 @@ public static class MapOfPrimitives implements DataGenerator { 2, "map_of_primitives", Types.MapType.ofRequired( - 101, 102, Types.StringType.get(), Types.IntegerType.get()))); + 101, 102, Types.StringType.get(), Types.IntegerType.get())), + Types.NestedField.optional( + 3, + "map_of_ts_ns", + Types.MapType.ofRequired( + 103, 104, Types.StringType.get(), Types.TimestampNanoType.withoutZone()))); private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); @@ -835,15 +889,37 @@ public GenericRecord generateIcebergGenericRecord() { GenericRecord genericRecord = GenericRecord.create(icebergSchema); genericRecord.setField("row_id", "row_id_value"); 
genericRecord.setField("map_of_primitives", ImmutableMap.of("Jane", 1, "Joe", 2)); + + LocalDateTime posNanos = LocalDateTime.of(2023, 1, 1, 12, 0, 0, 123456789); + LocalDateTime negNanos = LocalDateTime.of(1969, 12, 31, 23, 59, 59, 987654321); + genericRecord.setField( + "map_of_ts_ns", ImmutableMap.of("positive", posNanos, "negative", negNanos)); return genericRecord; } @Override public GenericRowData generateFlinkRowData() { + long posNanos = + org.apache.iceberg.util.DateTimeUtil.nanosFromTimestamp( + LocalDateTime.of(2023, 1, 1, 12, 0, 0, 123456789)); + long negNanos = + org.apache.iceberg.util.DateTimeUtil.nanosFromTimestamp( + LocalDateTime.of(1969, 12, 31, 23, 59, 59, 987654321)); + return GenericRowData.of( StringData.fromString("row_id_value"), new GenericMapData( - ImmutableMap.of(StringData.fromString("Jane"), 1, StringData.fromString("Joe"), 2))); + ImmutableMap.of(StringData.fromString("Jane"), 1, StringData.fromString("Joe"), 2)), + new GenericMapData( + ImmutableMap.of( + StringData.fromString("positive"), + TimestampData.fromEpochMillis( + Math.floorDiv(posNanos, 1_000_000L), + (int) Math.floorMod(posNanos, 1_000_000L)), + StringData.fromString("negative"), + TimestampData.fromEpochMillis( + Math.floorDiv(negNanos, 1_000_000L), + (int) Math.floorMod(negNanos, 1_000_000L))))); } @Override @@ -851,6 +927,15 @@ public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { org.apache.avro.generic.GenericRecord genericRecord = new GenericData.Record(avroSchema); genericRecord.put("row_id", "row_id_value"); genericRecord.put("map_of_primitives", ImmutableMap.of("Jane", 1, "Joe", 2)); + + long posNanos = + org.apache.iceberg.util.DateTimeUtil.nanosFromTimestamp( + LocalDateTime.of(2023, 1, 1, 12, 0, 0, 123456789)); + long negNanos = + org.apache.iceberg.util.DateTimeUtil.nanosFromTimestamp( + LocalDateTime.of(1969, 12, 31, 23, 59, 59, 987654321)); + genericRecord.put( + "map_of_ts_ns", ImmutableMap.of("positive", posNanos, "negative", 
negNanos)); return genericRecord; } } diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestRowDataWrapper.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestRowDataWrapper.java index cd6964b5ed0f..0e7635a33e87 100644 --- a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestRowDataWrapper.java +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/TestRowDataWrapper.java @@ -30,7 +30,6 @@ import org.apache.iceberg.data.Record; import org.apache.iceberg.flink.data.RandomRowData; import org.apache.iceberg.util.StructLikeWrapper; -import org.junit.jupiter.api.Disabled; public class TestRowDataWrapper extends RecordWrapperTestBase { @@ -60,18 +59,6 @@ public void testTime() { }); } - @Disabled - @Override - public void testTimestampNanoWithoutZone() { - // Flink does not support nanosecond timestamp without zone. - } - - @Disabled - @Override - public void testTimestampNanoWithZone() { - // Flink does not support nanosecond timestamp with zone. 
- } - @Override protected void generateAndValidate( Schema schema, RecordWrapperTestBase.AssertMethod assertMethod) { diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java index 4a70802f2a2e..b7b0a54156cc 100644 --- a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java @@ -49,6 +49,11 @@ protected boolean allowsWritingNullValuesForRequiredFields() { return true; } + @Override + protected boolean supportsTimestampNanos() { + return true; + } + @Override protected void writeAndValidate(Schema schema) throws IOException { List expectedRecords = RandomGenericData.generate(schema, NUM_RECORDS, 1990L); diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/TestRowDataProjection.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/TestRowDataProjection.java index 4e5b38ffb026..a2411da1e344 100644 --- a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/TestRowDataProjection.java +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/data/TestRowDataProjection.java @@ -271,18 +271,19 @@ public void testMapOfPrimitivesProjection() { GenericRowData.of( StringData.fromString("row_id_value"), new GenericMapData( - ImmutableMap.of(StringData.fromString("foo"), 1, StringData.fromString("bar"), 2))); + ImmutableMap.of(StringData.fromString("foo"), 1, StringData.fromString("bar"), 2)), + null); testEqualsAndHashCode(schema, idOnly, rowData, copyRowData, otherRowData, true); testEqualsAndHashCode(schema, mapOnly, rowData, copyRowData, otherRowData); testEqualsAndHashCode(schema, schema, rowData, copyRowData, otherRowData); GenericRowData rowDataNullOptionalFields = - GenericRowData.of(StringData.fromString("row_id_value"), null); + 
GenericRowData.of(StringData.fromString("row_id_value"), null, null); GenericRowData copyRowDataNullOptionalFields = - GenericRowData.of(StringData.fromString("row_id_value"), null); + GenericRowData.of(StringData.fromString("row_id_value"), null, null); // modify the map field value GenericRowData otherRowDataNullOptionalFields = - GenericRowData.of(StringData.fromString("other_row_id_value"), null); + GenericRowData.of(StringData.fromString("other_row_id_value"), null, null); testEqualsAndHashCode( schema, idOnly, @@ -432,7 +433,8 @@ public void testArrayOfPrimitiveProjection() { GenericRowData otherRowData = GenericRowData.of( StringData.fromString("other_row_id_value"), - new GenericArrayData(new Integer[] {4, 5, 6})); + new GenericArrayData(new Integer[] {4, 5, 6}), + null); testEqualsAndHashCode(schema, idOnly, rowData, copyRowData, otherRowData); testEqualsAndHashCode(schema, arrayOnly, rowData, copyRowData, otherRowData); testEqualsAndHashCode(schema, schema, rowData, copyRowData, otherRowData); @@ -440,16 +442,19 @@ public void testArrayOfPrimitiveProjection() { GenericRowData rowDataNullOptionalFields = GenericRowData.of( StringData.fromString("row_id_value"), - new GenericArrayData(new Integer[] {1, null, 3})); + new GenericArrayData(new Integer[] {1, null, 3}), + null); GenericRowData copyRowDataNullOptionalFields = GenericRowData.of( StringData.fromString("row_id_value"), - new GenericArrayData(new Integer[] {1, null, 3})); + new GenericArrayData(new Integer[] {1, null, 3}), + null); // modify the map field value GenericRowData otherRowDataNullOptionalFields = GenericRowData.of( StringData.fromString("other_row_id_value"), - new GenericArrayData(new Integer[] {4, null, 6})); + new GenericArrayData(new Integer[] {4, null, 6}), + null); testEqualsAndHashCode( schema, idOnly, From 153237b568d6c1ddb6025a58c551e8e459164c71 Mon Sep 17 00:00:00 2001 From: pvary Date: Thu, 7 May 2026 13:30:23 +0200 Subject: [PATCH 175/197] Flink: Backport add Nanosecond 
Precision Support for Flink-Iceberg Integration to Flink 2.0 - missing changes (#16239) --- .../iceberg/flink/data/StructRowData.java | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/StructRowData.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/StructRowData.java index f2a20be331b6..b469f2310f42 100644 --- a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/StructRowData.java +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/StructRowData.java @@ -121,8 +121,8 @@ public int getInt(int pos) { if (integer instanceof Integer) { return (int) integer; - } else if (integer instanceof LocalDate) { - return (int) ((LocalDate) integer).toEpochDay(); + } else if (integer instanceof LocalDate localDate) { + return (int) localDate.toEpochDay(); } else if (integer instanceof LocalTime) { return (int) (((LocalTime) integer).toNanoOfDay() / 1000_000); } else { @@ -278,10 +278,10 @@ private Object convertValue(Type elementType, Object value) { return value; case TIMESTAMP: long timeMillis; - if (value instanceof LocalDateTime) { - timeMillis = DateTimeUtil.microsFromTimestamp((LocalDateTime) value) / 1000L; - } else if (value instanceof OffsetDateTime) { - timeMillis = DateTimeUtil.microsFromTimestamptz((OffsetDateTime) value) / 1000L; + if (value instanceof LocalDateTime localDateTime) { + timeMillis = DateTimeUtil.microsFromTimestamp(localDateTime) / 1000L; + } else if (value instanceof OffsetDateTime offsetDateTime) { + timeMillis = DateTimeUtil.microsFromTimestamptz(offsetDateTime) / 1000L; } else { timeMillis = Math.floorDiv((Long) value, 1000L); } @@ -291,10 +291,10 @@ private Object convertValue(Type elementType, Object value) { * 1000); case TIMESTAMP_NANO: long nanoLong; - if (value instanceof LocalDateTime) { - nanoLong = DateTimeUtil.nanosFromTimestamp((LocalDateTime) value); - } else if (value instanceof OffsetDateTime) { - 
nanoLong = DateTimeUtil.nanosFromTimestamptz((OffsetDateTime) value); + if (value instanceof LocalDateTime localDateTime) { + nanoLong = DateTimeUtil.nanosFromTimestamp(localDateTime); + } else if (value instanceof OffsetDateTime offsetDateTime) { + nanoLong = DateTimeUtil.nanosFromTimestamptz(offsetDateTime); } else { nanoLong = (Long) value; } From 9ec1b933ed1d9253c6019773a624ba9b3d4d3c0a Mon Sep 17 00:00:00 2001 From: pvary Date: Thu, 7 May 2026 16:48:58 +0200 Subject: [PATCH 176/197] Spark: Backport support writing shredded variant in Iceberg-Spark (#16241) backports #14297 --- .../iceberg/spark/SparkSQLProperties.java | 8 + .../apache/iceberg/spark/SparkWriteConf.java | 30 + .../iceberg/spark/SparkWriteOptions.java | 6 + .../spark/source/SparkFormatModels.java | 4 +- .../source/SparkVariantShreddingAnalyzer.java | 69 ++ .../iceberg/spark/TestSparkWriteConf.java | 85 ++ .../spark/variant/TestVariantShredding.java | 1101 +++++++++++++++++ 7 files changed, 1302 insertions(+), 1 deletion(-) create mode 100644 spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkVariantShreddingAnalyzer.java create mode 100644 spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/variant/TestVariantShredding.java diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkSQLProperties.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkSQLProperties.java index b5b860214564..336aadd73c48 100644 --- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkSQLProperties.java +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkSQLProperties.java @@ -111,4 +111,12 @@ private SparkSQLProperties() {} // Prefix for custom snapshot properties public static final String SNAPSHOT_PROPERTY_PREFIX = "spark.sql.iceberg.snapshot-property."; + + // Controls whether to shred variant columns during write operations + public static final String SHRED_VARIANTS = "spark.sql.iceberg.shred-variants"; + + // Controls the buffer size for 
variant schema inference during writes + // This determines how many rows are buffered before inferring shredded schema + public static final String VARIANT_INFERENCE_BUFFER_SIZE = + "spark.sql.iceberg.variant-inference-buffer-size"; } diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkWriteConf.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkWriteConf.java index aba7e4dda082..add12e6040b0 100644 --- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkWriteConf.java +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkWriteConf.java @@ -33,6 +33,8 @@ import static org.apache.iceberg.TableProperties.ORC_COMPRESSION_STRATEGY; import static org.apache.iceberg.TableProperties.PARQUET_COMPRESSION; import static org.apache.iceberg.TableProperties.PARQUET_COMPRESSION_LEVEL; +import static org.apache.iceberg.TableProperties.PARQUET_SHRED_VARIANTS; +import static org.apache.iceberg.TableProperties.PARQUET_VARIANT_BUFFER_SIZE; import static org.apache.spark.sql.connector.write.RowLevelOperation.Command.DELETE; import java.util.Locale; @@ -529,6 +531,14 @@ private Map dataWriteProperties() { if (parquetCompressionLevel != null) { writeProperties.put(PARQUET_COMPRESSION_LEVEL, parquetCompressionLevel); } + boolean shouldShredVariants = shredVariants(); + writeProperties.put(PARQUET_SHRED_VARIANTS, String.valueOf(shouldShredVariants)); + + // Add variant shredding configuration properties + if (shouldShredVariants) { + writeProperties.put( + PARQUET_VARIANT_BUFFER_SIZE, String.valueOf(variantInferenceBufferSize())); + } break; case AVRO: @@ -749,4 +759,24 @@ public DeleteGranularity deleteGranularity() { .defaultValue(DeleteGranularity.FILE) .parse(); } + + public boolean shredVariants() { + return confParser + .booleanConf() + .option(SparkWriteOptions.SHRED_VARIANTS) + .sessionConf(SparkSQLProperties.SHRED_VARIANTS) + .tableProperty(TableProperties.PARQUET_SHRED_VARIANTS) + 
.defaultValue(TableProperties.PARQUET_SHRED_VARIANTS_DEFAULT) + .parse(); + } + + public int variantInferenceBufferSize() { + return confParser + .intConf() + .option(SparkWriteOptions.VARIANT_INFERENCE_BUFFER_SIZE) + .sessionConf(SparkSQLProperties.VARIANT_INFERENCE_BUFFER_SIZE) + .tableProperty(TableProperties.PARQUET_VARIANT_BUFFER_SIZE) + .defaultValue(TableProperties.PARQUET_VARIANT_BUFFER_SIZE_DEFAULT) + .parse(); + } } diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkWriteOptions.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkWriteOptions.java index 1be02feaf0c0..6c76b5c873c5 100644 --- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkWriteOptions.java +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/SparkWriteOptions.java @@ -86,4 +86,10 @@ private SparkWriteOptions() {} // Overrides the delete granularity public static final String DELETE_GRANULARITY = "delete-granularity"; + + // Controls whether to shred variant columns during write operations + public static final String SHRED_VARIANTS = "shred-variants"; + + // Controls the buffer size for variant schema inference during writes + public static final String VARIANT_INFERENCE_BUFFER_SIZE = "variant-inference-buffer-size"; } diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkFormatModels.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkFormatModels.java index 23fbe54a4be3..5b7862116aea 100644 --- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkFormatModels.java +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkFormatModels.java @@ -51,7 +51,9 @@ public static void register() { StructType.class, SparkParquetWriters::buildWriter, (icebergSchema, fileSchema, engineSchema, idToConstant) -> - SparkParquetReaders.buildReader(icebergSchema, fileSchema, idToConstant))); + SparkParquetReaders.buildReader(icebergSchema, fileSchema, 
idToConstant), + new SparkVariantShreddingAnalyzer(), + InternalRow::copy)); FormatModelRegistry.register( ParquetFormatModel.create( diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkVariantShreddingAnalyzer.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkVariantShreddingAnalyzer.java new file mode 100644 index 000000000000..2c08c662c9da --- /dev/null +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkVariantShreddingAnalyzer.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.spark.source; + +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.List; +import org.apache.iceberg.parquet.VariantShreddingAnalyzer; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.variants.VariantMetadata; +import org.apache.iceberg.variants.VariantValue; +import org.apache.spark.sql.catalyst.InternalRow; +import org.apache.spark.sql.types.StructType; +import org.apache.spark.unsafe.types.VariantVal; + +/** + * Spark-specific implementation that extracts variant values from {@link InternalRow} instances. 
+ */ +class SparkVariantShreddingAnalyzer extends VariantShreddingAnalyzer { + + SparkVariantShreddingAnalyzer() {} + + @Override + protected int resolveColumnIndex(StructType sparkSchema, String columnName) { + try { + return sparkSchema.fieldIndex(columnName); + } catch (IllegalArgumentException e) { + return -1; + } + } + + @Override + protected List extractVariantValues( + List bufferedRows, int variantFieldIndex) { + List values = Lists.newArrayList(); + + for (InternalRow row : bufferedRows) { + if (!row.isNullAt(variantFieldIndex)) { + VariantVal variantVal = row.getVariant(variantFieldIndex); + if (variantVal != null) { + VariantValue variantValue = + VariantValue.from( + VariantMetadata.from( + ByteBuffer.wrap(variantVal.getMetadata()).order(ByteOrder.LITTLE_ENDIAN)), + ByteBuffer.wrap(variantVal.getValue()).order(ByteOrder.LITTLE_ENDIAN)); + values.add(variantValue); + } + } + } + + return values; + } +} diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/TestSparkWriteConf.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/TestSparkWriteConf.java index c83b1b6e26ac..c5cfbe62b1be 100644 --- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/TestSparkWriteConf.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/TestSparkWriteConf.java @@ -34,6 +34,7 @@ import static org.apache.iceberg.TableProperties.ORC_COMPRESSION_STRATEGY; import static org.apache.iceberg.TableProperties.PARQUET_COMPRESSION; import static org.apache.iceberg.TableProperties.PARQUET_COMPRESSION_LEVEL; +import static org.apache.iceberg.TableProperties.PARQUET_SHRED_VARIANTS; import static org.apache.iceberg.TableProperties.UPDATE_DISTRIBUTION_MODE; import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE; import static org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE_HASH; @@ -61,6 +62,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import 
org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.spark.sql.internal.SQLConf; +import org.apache.spark.sql.util.CaseInsensitiveStringMap; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.TestTemplate; @@ -340,6 +342,8 @@ public void testSparkConfOverride() { TableProperties.DELETE_PARQUET_COMPRESSION, "snappy"), ImmutableMap.of( + PARQUET_SHRED_VARIANTS, + "false", DELETE_PARQUET_COMPRESSION, "zstd", PARQUET_COMPRESSION, @@ -461,6 +465,8 @@ public void testDataPropsDefaultsAsDeleteProps() { PARQUET_COMPRESSION_LEVEL, "5"), ImmutableMap.of( + PARQUET_SHRED_VARIANTS, + "false", DELETE_PARQUET_COMPRESSION, "zstd", PARQUET_COMPRESSION, @@ -532,6 +538,8 @@ public void testDeleteFileWriteConf() { DELETE_PARQUET_COMPRESSION_LEVEL, "6"), ImmutableMap.of( + PARQUET_SHRED_VARIANTS, + "false", DELETE_PARQUET_COMPRESSION, "zstd", PARQUET_COMPRESSION, @@ -686,4 +694,81 @@ private void checkMode(DistributionMode expectedMode, SparkWriteConf writeConf) assertThat(writeConf.copyOnWriteDistributionMode(MERGE)).isEqualTo(expectedMode); assertThat(writeConf.positionDeltaDistributionMode(MERGE)).isEqualTo(expectedMode); } + + @TestTemplate + public void testShredVariantsDefault() { + Table table = validationCatalog.loadTable(tableIdent); + SparkWriteConf writeConf = new SparkWriteConf(spark, table, ImmutableMap.of()); + assertThat(writeConf.shredVariants()).isFalse(); + } + + @TestTemplate + public void testVariantInferenceBufferSizeDefault() { + Table table = validationCatalog.loadTable(tableIdent); + SparkWriteConf writeConf = new SparkWriteConf(spark, table, ImmutableMap.of()); + assertThat(writeConf.variantInferenceBufferSize()) + .isEqualTo(TableProperties.PARQUET_VARIANT_BUFFER_SIZE_DEFAULT); + } + + @TestTemplate + public void testVariantInferenceBufferSizeTableProperty() { + Table table = validationCatalog.loadTable(tableIdent); + + 
table.updateProperties().set(TableProperties.PARQUET_VARIANT_BUFFER_SIZE, "500").commit(); + + SparkWriteConf writeConf = new SparkWriteConf(spark, table, ImmutableMap.of()); + assertThat(writeConf.variantInferenceBufferSize()).isEqualTo(500); + } + + @TestTemplate + public void testShredVariantsSessionOverridesTableProperty() { + Table table = validationCatalog.loadTable(tableIdent); + table.updateProperties().set(TableProperties.PARQUET_SHRED_VARIANTS, "false").commit(); + + withSQLConf( + ImmutableMap.of(SparkSQLProperties.SHRED_VARIANTS, "true"), + () -> { + SparkWriteConf writeConf = new SparkWriteConf(spark, table, ImmutableMap.of()); + assertThat(writeConf.shredVariants()).isTrue(); + }); + } + + @TestTemplate + public void testShredVariantsWriteOptionOverridesSessionConf() { + withSQLConf( + ImmutableMap.of(SparkSQLProperties.SHRED_VARIANTS, "false"), + () -> { + Table table = validationCatalog.loadTable(tableIdent); + SparkWriteConf writeConf = + new SparkWriteConf( + spark, + table, + new CaseInsensitiveStringMap( + ImmutableMap.of(SparkWriteOptions.SHRED_VARIANTS, "true"))); + assertThat(writeConf.shredVariants()).isTrue(); + }); + } + + @TestTemplate + public void testVariantInferenceBufferSizeSessionConf() { + withSQLConf( + ImmutableMap.of(SparkSQLProperties.VARIANT_INFERENCE_BUFFER_SIZE, "250"), + () -> { + Table table = validationCatalog.loadTable(tableIdent); + SparkWriteConf writeConf = new SparkWriteConf(spark, table, ImmutableMap.of()); + assertThat(writeConf.variantInferenceBufferSize()).isEqualTo(250); + }); + } + + @TestTemplate + public void testWritePropertiesIncludeVariantShredding() { + Table table = validationCatalog.loadTable(tableIdent); + table.updateProperties().set(TableProperties.PARQUET_SHRED_VARIANTS, "true").commit(); + table.updateProperties().set(TableProperties.PARQUET_VARIANT_BUFFER_SIZE, "200").commit(); + + SparkWriteConf writeConf = new SparkWriteConf(spark, table, ImmutableMap.of()); + Map writeProperties = 
writeConf.writeProperties(); + assertThat(writeProperties).containsEntry(PARQUET_SHRED_VARIANTS, "true"); + assertThat(writeProperties).containsEntry(TableProperties.PARQUET_VARIANT_BUFFER_SIZE, "200"); + } } diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/variant/TestVariantShredding.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/variant/TestVariantShredding.java new file mode 100644 index 000000000000..8cdcf22e5817 --- /dev/null +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/variant/TestVariantShredding.java @@ -0,0 +1,1101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.spark.variant; + +import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.METASTOREURIS; +import static org.apache.iceberg.TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES; +import static org.apache.parquet.schema.Types.optional; +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.IOException; +import java.math.BigDecimal; +import java.net.InetAddress; +import java.util.List; +import java.util.Map; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.iceberg.FileScanTask; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.spark.CatalogTestBase; +import org.apache.iceberg.spark.SparkCatalogConfig; +import org.apache.iceberg.spark.SparkSQLProperties; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.variants.Variant; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.util.HadoopInputFile; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.LogicalTypeAnnotation; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.catalyst.analysis.NoSuchTableException; +import org.apache.spark.sql.internal.SQLConf; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.TestTemplate; + +public class TestVariantShredding extends CatalogTestBase { + + private static final Schema SCHEMA = + new Schema( + Types.NestedField.required(1, "id", 
Types.IntegerType.get()), + Types.NestedField.optional(2, "address", Types.VariantType.get())); + + private static final Schema SCHEMA2 = + new Schema( + Types.NestedField.required(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "address", Types.VariantType.get()), + Types.NestedField.optional(3, "metadata", Types.VariantType.get())); + + @Parameters(name = "catalogName = {0}, implementation = {1}, config = {2}") + protected static Object[][] parameters() { + return new Object[][] { + { + SparkCatalogConfig.HADOOP.catalogName(), + SparkCatalogConfig.HADOOP.implementation(), + SparkCatalogConfig.HADOOP.properties() + }, + }; + } + + @BeforeAll + public static void startMetastoreAndSpark() { + // First call parent to initialize metastore and spark with local[2] + CatalogTestBase.startMetastoreAndSpark(); + + // Now stop and recreate spark with local[1] to write all rows to a single file + if (spark != null) { + spark.stop(); + } + + spark = + SparkSession.builder() + .master("local[1]") // Use one thread to write the rows to a single parquet file + .config("spark.driver.host", InetAddress.getLoopbackAddress().getHostAddress()) + .config(SQLConf.PARTITION_OVERWRITE_MODE().key(), "dynamic") + .config("spark.hadoop." 
+ METASTOREURIS.varname, hiveConf.get(METASTOREURIS.varname)) + .config("spark.sql.legacy.respectNullabilityInTextDatasetConversion", "true") + .config(DISABLE_UI) + .enableHiveSupport() + .getOrCreate(); + + sparkContext = JavaSparkContext.fromSparkContext(spark.sparkContext()); + } + + @BeforeEach + public void before() { + super.before(); + validationCatalog.createTable( + tableIdent, SCHEMA, null, Map.of(TableProperties.FORMAT_VERSION, "3")); + } + + @AfterEach + public void after() { + spark.conf().unset(SparkSQLProperties.SHRED_VARIANTS); + spark.conf().unset(SparkSQLProperties.VARIANT_INFERENCE_BUFFER_SIZE); + validationCatalog.dropTable(tableIdent, true); + } + + @TestTemplate + public void testVariantShreddingDisabled() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "false"); + + String values = "(1, parse_json('{\"city\": \"NYC\", \"zip\": 10001}')), (2, null)"; + sql("INSERT INTO %s VALUES %s", tableName, values); + + GroupType address = variant("address", 2, Type.Repetition.OPTIONAL); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testExcludingNullValue() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + String values = + """ + (1, parse_json('{"name": "Alice", "age": 30, "dummy": null}')),\ + (2, parse_json('{"name": "Bob", "age": 25}')),\ + (3, parse_json('{"name": "Charlie", "age": 35}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); + + GroupType name = + field( + "name", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, LogicalTypeAnnotation.stringType())); + GroupType age = + field( + "age", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.INT32, LogicalTypeAnnotation.intType(8, true))); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, objectFields(age, name)); + MessageType 
expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testInconsistentType() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + String values = + """ + (1, parse_json('{"age": "25"}')),\ + (2, parse_json('{"age": 30}')),\ + (3, parse_json('{"age": "35"}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); + + GroupType age = + field( + "age", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, LogicalTypeAnnotation.stringType())); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, objectFields(age)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + + List rows = + sql("SELECT variant_get(address, '$.age', 'int') FROM %s WHERE id = 2", tableName); + assertThat(rows).hasSize(1); + assertThat(rows.get(0)[0]).isEqualTo(30); + } + + @TestTemplate + public void testPrimitiveType() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + String values = "(1, parse_json('123')), (2, parse_json('456')), (3, parse_json('789'))"; + sql("INSERT INTO %s VALUES %s", tableName, values); + + GroupType address = + variant( + "address", + 2, + Type.Repetition.REQUIRED, + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.INT32, LogicalTypeAnnotation.intType(16, true))); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testPrimitiveDecimalType() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + String values = + "(1, parse_json('123.56')), (2, parse_json('\"abc\"')), (3, parse_json('12.56'))"; + sql("INSERT INTO %s VALUES %s", tableName, values); + + 
GroupType address = + variant( + "address", + 2, + Type.Repetition.REQUIRED, + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.INT32, LogicalTypeAnnotation.decimalType(2, 5))); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testBooleanType() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + String values = + """ + (1, parse_json('{"active": true}')),\ + (2, parse_json('{"active": false}')),\ + (3, parse_json('{"active": true}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); + + GroupType active = field("active", shreddedPrimitive(PrimitiveType.PrimitiveTypeName.BOOLEAN)); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, objectFields(active)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testDecimalTypeWithInconsistentScales() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + String values = + """ + (1, parse_json('{"price": 123.456789}')),\ + (2, parse_json('{"price": 678.90}')),\ + (3, parse_json('{"price": 999.99}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); + + GroupType price = + field( + "price", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.INT32, LogicalTypeAnnotation.decimalType(6, 9))); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, objectFields(price)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testDecimalTypeWithConsistentScales() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + String values = + """ + 
(1, parse_json('{"price": 123.45}')),\ + (2, parse_json('{"price": 678.90}')),\ + (3, parse_json('{"price": 999.99}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); + + GroupType price = + field( + "price", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.INT32, LogicalTypeAnnotation.decimalType(2, 5))); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, objectFields(price)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testArrayType() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + String values = + """ + (1, parse_json('["java", "scala", "python"]')),\ + (2, parse_json('["rust", "go"]')),\ + (3, parse_json('["javascript"]'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); + + GroupType arr = + list( + element( + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, LogicalTypeAnnotation.stringType()))); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, arr); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testNestedArrayType() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + String values = + """ + (1, parse_json('{"tags": ["java", "scala", "python"]}')),\ + (2, parse_json('{"tags": ["rust", "go"]}')),\ + (3, parse_json('{"tags": ["javascript"]}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); + + GroupType tags = + field( + "tags", + list( + element( + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, + LogicalTypeAnnotation.stringType())))); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, objectFields(tags)); + MessageType expectedSchema = 
parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testNestedObjectType() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + String values = + """ + (1, parse_json('{"location": {"city": "Seattle", "zip": 98101}, "tags": ["java", "scala", "python"]}')),\ + (2, parse_json('{"location": {"city": "Portland", "zip": 97201}}')),\ + (3, parse_json('{"location": {"city": "NYC", "zip": 10001}}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); + + GroupType city = + field( + "city", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, LogicalTypeAnnotation.stringType())); + GroupType zip = + field( + "zip", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.INT32, LogicalTypeAnnotation.intType(32, true))); + GroupType location = field("location", objectFields(city, zip)); + GroupType tags = + field( + "tags", + list( + element( + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, + LogicalTypeAnnotation.stringType())))); + + GroupType address = + variant("address", 2, Type.Repetition.REQUIRED, objectFields(location, tags)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testLazyInitializationWithBufferedRows() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + spark.conf().set(SparkSQLProperties.VARIANT_INFERENCE_BUFFER_SIZE, "5"); + + String values = + """ + (1, parse_json('{"name": "Alice", "age": 30}')),\ + (2, parse_json('{"name": "Bob", "age": 25}')),\ + (3, parse_json('{"name": "Charlie", "age": 35}')),\ + (4, parse_json('{"name": "David", "age": 28}')),\ + (5, parse_json('{"name": "Eve", "age": 32}')),\ + (6, parse_json('{"name": "Frank", "age": 40}')),\ + (7, parse_json('{"name": "Grace", "age": 
27}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); + + GroupType name = + field( + "name", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, LogicalTypeAnnotation.stringType())); + GroupType age = + field( + "age", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.INT32, LogicalTypeAnnotation.intType(8, true))); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, objectFields(age, name)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + + long rowCount = spark.read().format("iceberg").load(tableName).count(); + assertThat(rowCount).isEqualTo(7); + } + + @TestTemplate + public void testMultipleRowGroups() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + spark.conf().set(SparkSQLProperties.VARIANT_INFERENCE_BUFFER_SIZE, "3"); + + int numRows = 1000; + StringBuilder valuesBuilder = new StringBuilder(); + for (int i = 1; i <= numRows; i++) { + if (i > 1) { + valuesBuilder.append(", "); + } + valuesBuilder.append( + String.format("(%d, parse_json('{\"name\": \"User%d\", \"age\": %d}'))", i, i, 20 + i)); + } + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%d')", + tableName, PARQUET_ROW_GROUP_SIZE_BYTES, 1024); + sql("INSERT INTO %s VALUES %s", tableName, valuesBuilder.toString()); + + GroupType name = + field( + "name", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, LogicalTypeAnnotation.stringType())); + GroupType age = + field( + "age", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.INT32, LogicalTypeAnnotation.intType(8, true))); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, objectFields(age, name)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + + long rowCount = 
spark.read().format("iceberg").load(tableName).count(); + assertThat(rowCount).isEqualTo(numRows); + } + + @TestTemplate + public void testColumnIndexTruncateLength() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + spark.conf().set(SparkSQLProperties.VARIANT_INFERENCE_BUFFER_SIZE, "3"); + + int customTruncateLength = 10; + sql( + "ALTER TABLE %s SET TBLPROPERTIES('%s' '%d')", + tableName, "parquet.columnindex.truncate.length", customTruncateLength); + + StringBuilder valuesBuilder = new StringBuilder(); + for (int i = 1; i <= 10; i++) { + if (i > 1) { + valuesBuilder.append(", "); + } + String longValue = "A".repeat(20); + valuesBuilder.append( + String.format( + "(%d, parse_json('{\"description\": \"%s\", \"id\": %d}'))", i, longValue, i)); + } + sql("INSERT INTO %s VALUES %s", tableName, valuesBuilder.toString()); + + GroupType description = + field( + "description", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, LogicalTypeAnnotation.stringType())); + GroupType id = + field( + "id", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.INT32, LogicalTypeAnnotation.intType(8, true))); + GroupType address = + variant("address", 2, Type.Repetition.REQUIRED, objectFields(description, id)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + + long rowCount = spark.read().format("iceberg").load(tableName).count(); + assertThat(rowCount).isEqualTo(10); + } + + @TestTemplate + public void testIntegerFamilyPromotion() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + // Mix of INT8, INT16, INT32, INT64 - should promote to INT64 + String values = + """ + (1, parse_json('{"value": 10}')),\ + (2, parse_json('{"value": 1000}')),\ + (3, parse_json('{"value": 100000}')),\ + (4, parse_json('{"value": 10000000000}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); 
+ + GroupType value = + field( + "value", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.INT64, LogicalTypeAnnotation.intType(64, true))); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, objectFields(value)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testDecimalFamilyPromotion() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + // Test that they get promoted to the most capable decimal type observed + String values = + """ + (1, parse_json('{"value": 1.5}')),\ + (2, parse_json('{"value": 123.456789}')),\ + (3, parse_json('{"value": 123456789123456.789}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); + + GroupType value = + field( + "value", + optional(PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) + .length(16) + .as(LogicalTypeAnnotation.decimalType(6, 21)) + .named("typed_value")); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, objectFields(value)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testDataRoundTripWithShredding() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + String values = + """ + (1, parse_json('{"name": "Alice", "age": 30}')),\ + (2, parse_json('{"name": "Bob", "age": 25}')),\ + (3, parse_json('{"name": "Charlie", "age": 35}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); + + GroupType name = + field( + "name", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, LogicalTypeAnnotation.stringType())); + GroupType age = + field( + "age", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.INT32, LogicalTypeAnnotation.intType(8, true))); + GroupType address = 
variant("address", 2, Type.Repetition.REQUIRED, objectFields(age, name)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + + // Verify that we can read the data back correctly + List rows = + sql( + "SELECT id, variant_get(address, '$.name', 'string')," + + " variant_get(address, '$.age', 'int')" + + " FROM %s ORDER BY id", + tableName); + assertThat(rows).hasSize(3); + assertThat(rows.get(0)[0]).isEqualTo(1); + assertThat(rows.get(0)[1]).isEqualTo("Alice"); + assertThat(rows.get(0)[2]).isEqualTo(30); + assertThat(rows.get(1)[0]).isEqualTo(2); + assertThat(rows.get(1)[1]).isEqualTo("Bob"); + assertThat(rows.get(1)[2]).isEqualTo(25); + assertThat(rows.get(2)[0]).isEqualTo(3); + assertThat(rows.get(2)[1]).isEqualTo("Charlie"); + assertThat(rows.get(2)[2]).isEqualTo(35); + } + + @TestTemplate + public void testMultipleVariantsWithShredding() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + // Recreate table with SCHEMA2 (address + metadata variant columns) + validationCatalog.dropTable(tableIdent, true); + validationCatalog.createTable( + tableIdent, SCHEMA2, null, Map.of(TableProperties.FORMAT_VERSION, "3")); + + String values = + """ + (1, parse_json('{"city": "NYC"}'), parse_json('{"source": "web"}')),\ + (2, parse_json('{"city": "LA"}'), parse_json('{"source": "app"}')),\ + (3, parse_json('{"city": "SF"}'), parse_json('{"source": "api"}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); + + GroupType city = + field( + "city", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, LogicalTypeAnnotation.stringType())); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, objectFields(city)); + + GroupType source = + field( + "source", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, LogicalTypeAnnotation.stringType())); + GroupType metadata = variant("metadata", 3, 
Type.Repetition.REQUIRED, objectFields(source)); + MessageType expectedSchema = parquetSchema(address, metadata); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testVariantWithNullValues() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + String values = + """ + (1, parse_json('null')),\ + (2, parse_json('null')),\ + (3, parse_json('null'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); + + GroupType address = variant("address", 2, Type.Repetition.REQUIRED); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testArrayOfNullElementsWithShredding() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + sql( + "INSERT INTO %s VALUES (1, parse_json('[null, null, null]')), " + + "(2, parse_json('[null]'))", + tableName); + + // Array elements are all null, element type is null, falls back to unshredded + GroupType address = variant("address", 2, Type.Repetition.REQUIRED); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testMixedNullAndNonNullVariantValues() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + String values = + """ + (1, parse_json('{"name": "Alice", "age": 30}')),\ + (2, null),\ + (3, parse_json('{"name": "Charlie", "age": 35}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); + + GroupType name = + field( + "name", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, LogicalTypeAnnotation.stringType())); + GroupType age = + field( + "age", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.INT32, 
LogicalTypeAnnotation.intType(8, true))); + GroupType address = variant("address", 2, Type.Repetition.OPTIONAL, objectFields(age, name)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + + long rowCount = spark.read().format("iceberg").load(tableName).count(); + assertThat(rowCount).isEqualTo(3); + } + + @TestTemplate + public void testWriteOptionOverridesSessionConfig() throws IOException, NoSuchTableException { + // Disable shredding at session level + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "false"); + + // Enable shredding via per-write option + String query = + "SELECT 1 as id, parse_json('{\"name\": \"Alice\", \"age\": 30}') as address" + + " UNION ALL SELECT 2, parse_json('{\"name\": \"Bob\", \"age\": 25}')" + + " UNION ALL SELECT 3, parse_json('{\"name\": \"Charlie\", \"age\": 35}')"; + spark.sql(query).writeTo(tableName).option("shred-variants", "true").append(); + + GroupType name = + field( + "name", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, LogicalTypeAnnotation.stringType())); + GroupType age = + field( + "age", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.INT32, LogicalTypeAnnotation.intType(8, true))); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, objectFields(age, name)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testInfrequentFieldPruning() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + spark.conf().set(SparkSQLProperties.VARIANT_INFERENCE_BUFFER_SIZE, "11"); + + StringBuilder valuesBuilder = new StringBuilder(); + for (int i = 1; i <= 11; i++) { + if (i > 1) { + valuesBuilder.append(", "); + } + if (i == 1) { + // Only the first row has rare_field + valuesBuilder.append( + 
String.format( + "(%d, parse_json('{\"name\": \"User%d\", \"rare_field\": \"rare\"}'))", i, i)); + } else { + valuesBuilder.append(String.format("(%d, parse_json('{\"name\": \"User%d\"}'))", i, i)); + } + } + sql("INSERT INTO %s VALUES %s", tableName, valuesBuilder.toString()); + + // rare_field appears in 1/11 rows, should be pruned + // name appears in 11/11 rows and should be kept + GroupType name = + field( + "name", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, LogicalTypeAnnotation.stringType())); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, objectFields(name)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + } + + @TestTemplate + public void testMixedTypeTieBreaking() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + spark.conf().set(SparkSQLProperties.VARIANT_INFERENCE_BUFFER_SIZE, "10"); + + StringBuilder valuesBuilder = new StringBuilder(); + for (int i = 1; i <= 10; i++) { + if (i > 1) { + valuesBuilder.append(", "); + } + if (i <= 5) { + valuesBuilder.append(String.format("(%d, parse_json('{\"val\": %d}'))", i, i)); + } else { + valuesBuilder.append(String.format("(%d, parse_json('{\"val\": \"text%d\"}'))", i, i)); + } + } + sql("INSERT INTO %s VALUES %s", tableName, valuesBuilder.toString()); + + // 5 ints + 5 strings is a tie so STRING wins (higher TIE_BREAK_PRIORITY) + GroupType val = + field( + "val", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, LogicalTypeAnnotation.stringType())); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, objectFields(val)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + + // Verify data round-trips correctly + List rows = + sql("SELECT id, variant_get(address, '$.val', 'string') FROM 
%s ORDER BY id", tableName); + assertThat(rows).hasSize(10); + assertThat(rows.get(0)[1]).isEqualTo("1"); + assertThat(rows.get(5)[1]).isEqualTo("text6"); + } + + @TestTemplate + public void testFieldOnlyAfterBuffer() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + spark.conf().set(SparkSQLProperties.VARIANT_INFERENCE_BUFFER_SIZE, "3"); + + String values = + """ + (1, parse_json('{"name": "Alice"}')),\ + (2, parse_json('{"name": "Bob"}')),\ + (3, parse_json('{"name": "Charlie"}')),\ + (4, parse_json('{"name": "David", "score": 95}')),\ + (5, parse_json('{"name": "Eve", "score": 88}')),\ + (6, parse_json('{"name": "Frank", "score": 72}')),\ + (7, parse_json('{"name": "Grace", "score": 91}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); + + // Schema is determined from buffer (rows 1-3) which only has "name". + // "score" is not shredded + GroupType name = + field( + "name", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, LogicalTypeAnnotation.stringType())); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, objectFields(name)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + + // Verify all data round-trips despite "score" not being shredded + List rows = + sql( + "SELECT id, variant_get(address, '$.name', 'string')," + + " variant_get(address, '$.score', 'int')" + + " FROM %s ORDER BY id", + tableName); + assertThat(rows).hasSize(7); + assertThat(rows.get(0)[1]).isEqualTo("Alice"); + assertThat(rows.get(0)[2]).isNull(); + assertThat(rows.get(3)[1]).isEqualTo("David"); + assertThat(rows.get(3)[2]).isEqualTo(95); + assertThat(rows.get(6)[1]).isEqualTo("Grace"); + assertThat(rows.get(6)[2]).isEqualTo(91); + } + + @TestTemplate + public void testCrossFileDifferentShreddedType() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + 
spark.conf().set(SparkSQLProperties.VARIANT_INFERENCE_BUFFER_SIZE, "3"); + + // File 1: "score" is always integer → shredded as INT8 + String batch1 = + """ + (1, parse_json('{"score": 95}')),\ + (2, parse_json('{"score": 88}')),\ + (3, parse_json('{"score": 72}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, batch1); + + // Verify file 1 schema: score shredded as INT8 + Table table = validationCatalog.loadTable(tableIdent); + GroupType scoreInt = + field( + "score", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.INT32, LogicalTypeAnnotation.intType(8, true))); + MessageType expectedSchema1 = + parquetSchema(variant("address", 2, Type.Repetition.REQUIRED, objectFields(scoreInt))); + verifyParquetSchema(table, expectedSchema1); + + // File 2: "score" is always string → shredded as STRING + String batch2 = + """ + (4, parse_json('{"score": "high"}')),\ + (5, parse_json('{"score": "medium"}')),\ + (6, parse_json('{"score": "low"}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, batch2); + + // Query across both files, reader must handle different shredded types + List rows = + sql("SELECT id, variant_get(address, '$.score', 'string') FROM %s ORDER BY id", tableName); + assertThat(rows).hasSize(6); + assertThat(rows.get(0)[1]).isEqualTo("95"); + assertThat(rows.get(1)[1]).isEqualTo("88"); + assertThat(rows.get(3)[1]).isEqualTo("high"); + assertThat(rows.get(5)[1]).isEqualTo("low"); + } + + @TestTemplate + public void testAllNullVariantColumn() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + + sql("INSERT INTO %s VALUES (1, null), (2, null), (3, null)", tableName); + + // All variant values are SQL NULL, so no shredding should occur + Table table = validationCatalog.loadTable(tableIdent); + MessageType expectedSchema = parquetSchema(variant("address", 2, Type.Repetition.OPTIONAL)); + verifyParquetSchema(table, expectedSchema); + + List rows = sql("SELECT id, address FROM %s ORDER BY id", tableName); + 
assertThat(rows).hasSize(3); + assertThat(rows.get(0)[1]).isNull(); + assertThat(rows.get(1)[1]).isNull(); + assertThat(rows.get(2)[1]).isNull(); + } + + @TestTemplate + public void testBufferSizeOne() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + spark.conf().set(SparkSQLProperties.VARIANT_INFERENCE_BUFFER_SIZE, "1"); + + sql( + """ + INSERT INTO %s VALUES + (1, parse_json('{"name": "Alice", "age": 30}')), + (2, parse_json('{"name": "Bob", "age": 25}')), + (3, parse_json('{"name": "Charlie", "age": 35}')) + """, + tableName); + + // Schema inferred from first row only, should still shred name and age + GroupType age = + field( + "age", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.INT32, LogicalTypeAnnotation.intType(8, true))); + GroupType name = + field( + "name", + shreddedPrimitive( + PrimitiveType.PrimitiveTypeName.BINARY, LogicalTypeAnnotation.stringType())); + GroupType address = variant("address", 2, Type.Repetition.REQUIRED, objectFields(age, name)); + MessageType expectedSchema = parquetSchema(address); + + Table table = validationCatalog.loadTable(tableIdent); + verifyParquetSchema(table, expectedSchema); + + List rows = + sql("SELECT id, variant_get(address, '$.name', 'string') FROM %s ORDER BY id", tableName); + assertThat(rows).hasSize(3); + assertThat(rows.get(0)[1]).isEqualTo("Alice"); + assertThat(rows.get(2)[1]).isEqualTo("Charlie"); + } + + @TestTemplate + public void testDecimalFallbackAfterBuffer() throws IOException { + spark.conf().set(SparkSQLProperties.SHRED_VARIANTS, "true"); + spark.conf().set(SparkSQLProperties.VARIANT_INFERENCE_BUFFER_SIZE, "3"); + + // Buffer: scale=2, 3 integer digits -> DECIMAL(5,2) + // Row 4: precision overflow -> fallback to value field + // Row 5: scale overflow -> fallback to value field + // Row 6: fits typed column, scale widened from 1 to 2 via setScale + String values = + """ + (1, parse_json('{"val": 123.45}')),\ + (2, parse_json('{"val": 678.90}')),\ + (3, 
parse_json('{"val": 999.99}')),\ + (4, parse_json('{"val": 123456.78}')),\ + (5, parse_json('{"val": 1.2345}')),\ + (6, parse_json('{"val": 12.3}'))\ + """; + sql("INSERT INTO %s VALUES %s", tableName, values); + + List rows = + sql( + "SELECT id, variant_get(address, '$.val', 'decimal(10,4)') FROM %s ORDER BY id", + tableName); + assertThat(rows).hasSize(6); + assertThat(rows.get(0)[1]).isEqualTo(new BigDecimal("123.4500")); + assertThat(rows.get(3)[1]).isEqualTo(new BigDecimal("123456.7800")); + assertThat(rows.get(4)[1]).isEqualTo(new BigDecimal("1.2345")); + assertThat(rows.get(5)[1]).isEqualTo(new BigDecimal("12.3000")); + } + + private void verifyParquetSchema(Table table, MessageType expectedSchema) throws IOException { + try (CloseableIterable tasks = table.newScan().planFiles()) { + assertThat(tasks).isNotEmpty(); + + for (FileScanTask task : tasks) { + String path = task.file().location(); + + HadoopInputFile inputFile = HadoopInputFile.fromPath(new Path(path), new Configuration()); + + try (ParquetFileReader reader = ParquetFileReader.open(inputFile)) { + MessageType actualSchema = reader.getFileMetaData().getSchema(); + assertThat(actualSchema).isEqualTo(expectedSchema); + } + } + } + } + + private static MessageType parquetSchema(Type... 
variantTypes) { + return org.apache.parquet.schema.Types.buildMessage() + .required(PrimitiveType.PrimitiveTypeName.INT32) + .id(1) + .named("id") + .addFields(variantTypes) + .named("table"); + } + + private static GroupType variant(String name, int fieldId, Type.Repetition repetition) { + return org.apache.parquet.schema.Types.buildGroup(repetition) + .id(fieldId) + .as(LogicalTypeAnnotation.variantType(Variant.VARIANT_SPEC_VERSION)) + .required(PrimitiveType.PrimitiveTypeName.BINARY) + .named("metadata") + .required(PrimitiveType.PrimitiveTypeName.BINARY) + .named("value") + .named(name); + } + + private static GroupType variant( + String name, int fieldId, Type.Repetition repetition, Type shreddedType) { + checkShreddedType(shreddedType); + return org.apache.parquet.schema.Types.buildGroup(repetition) + .id(fieldId) + .as(LogicalTypeAnnotation.variantType(Variant.VARIANT_SPEC_VERSION)) + .required(PrimitiveType.PrimitiveTypeName.BINARY) + .named("metadata") + .optional(PrimitiveType.PrimitiveTypeName.BINARY) + .named("value") + .addField(shreddedType) + .named(name); + } + + private static Type shreddedPrimitive(PrimitiveType.PrimitiveTypeName primitive) { + return optional(primitive).named("typed_value"); + } + + private static Type shreddedPrimitive( + PrimitiveType.PrimitiveTypeName primitive, LogicalTypeAnnotation annotation) { + return optional(primitive).as(annotation).named("typed_value"); + } + + private static GroupType objectFields(GroupType... 
fields) { + for (GroupType fieldType : fields) { + checkField(fieldType); + } + + return org.apache.parquet.schema.Types.buildGroup(Type.Repetition.OPTIONAL) + .addFields(fields) + .named("typed_value"); + } + + private static GroupType field(String name, Type shreddedType) { + checkShreddedType(shreddedType); + return org.apache.parquet.schema.Types.buildGroup(Type.Repetition.REQUIRED) + .optional(PrimitiveType.PrimitiveTypeName.BINARY) + .named("value") + .addField(shreddedType) + .named(name); + } + + private static GroupType element(Type shreddedType) { + return field("element", shreddedType); + } + + private static GroupType list(GroupType elementType) { + return org.apache.parquet.schema.Types.optionalList().element(elementType).named("typed_value"); + } + + private static void checkShreddedType(Type shreddedType) { + Preconditions.checkArgument( + shreddedType.getName().equals("typed_value"), + "Invalid shredded type name: %s should be typed_value", + shreddedType.getName()); + Preconditions.checkArgument( + shreddedType.isRepetition(Type.Repetition.OPTIONAL), + "Invalid shredded type repetition: %s should be OPTIONAL", + shreddedType.getRepetition()); + } + + private static void checkField(GroupType fieldType) { + Preconditions.checkArgument( + fieldType.isRepetition(Type.Repetition.REQUIRED), + "Invalid field type repetition: %s should be REQUIRED", + fieldType.getRepetition()); + } +} From f767dad20e9a47495ce37f1acf6acdffbf63664d Mon Sep 17 00:00:00 2001 From: pvary Date: Thu, 7 May 2026 16:49:25 +0200 Subject: [PATCH 177/197] Flink: Backport add Nanosecond Precision Support for Flink-Iceberg Integration to Flink 1.20 (#16240) Backports #15475 --- flink/v1.20/build.gradle | 1 + .../apache/iceberg/flink/FlinkTypeToType.java | 6 + .../apache/iceberg/flink/RowDataWrapper.java | 36 +- .../iceberg/flink/data/FlinkOrcReader.java | 7 + .../iceberg/flink/data/FlinkOrcWriter.java | 7 + .../iceberg/flink/data/FlinkOrcWriters.java | 37 ++ 
.../iceberg/flink/data/RowDataUtil.java | 2 + .../iceberg/flink/data/StructRowData.java | 52 +- .../formats/avro/AvroToRowDataConverters.java | 303 +++++++++ .../flink/formats/avro/JodaConverter.java | 69 ++ .../formats/avro/RowDataToAvroConverters.java | 394 +++++++++++ .../avro/typeutils/AvroSchemaConverter.java | 625 ++++++++++++++++++ .../AvroGenericRecordToRowDataMapper.java | 4 +- .../RowDataToAvroGenericRecordConverter.java | 4 +- .../reader/AvroGenericRecordConverter.java | 4 +- .../apache/iceberg/flink/DataGenerators.java | 105 ++- .../iceberg/flink/TestRowDataWrapper.java | 13 - .../flink/data/TestFlinkOrcReaderWriter.java | 5 + .../flink/data/TestRowDataProjection.java | 21 +- 19 files changed, 1642 insertions(+), 53 deletions(-) create mode 100644 flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/formats/avro/AvroToRowDataConverters.java create mode 100644 flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/formats/avro/JodaConverter.java create mode 100644 flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/formats/avro/RowDataToAvroConverters.java create mode 100644 flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/formats/avro/typeutils/AvroSchemaConverter.java diff --git a/flink/v1.20/build.gradle b/flink/v1.20/build.gradle index c7ca24817bc9..467b0fa8c9be 100644 --- a/flink/v1.20/build.gradle +++ b/flink/v1.20/build.gradle @@ -33,6 +33,7 @@ project(":iceberg-flink:iceberg-flink-${flinkMajorVersion}") { implementation project(':iceberg-hive-metastore') compileOnly libs.flink120.avro + compileOnly libs.joda.time // dropwizard histogram metrics (optional in Flink) compileOnly libs.flink120.metrics.dropwizard compileOnly libs.flink120.streaming.java diff --git a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java index 408065f06057..8f106da8d56b 100644 --- 
a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java +++ b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/FlinkTypeToType.java @@ -137,11 +137,17 @@ public Type visit(TimeType timeType) { @Override public Type visit(TimestampType timestampType) { + if (timestampType.getPrecision() > 6) { + return Types.TimestampNanoType.withoutZone(); + } return Types.TimestampType.withoutZone(); } @Override public Type visit(LocalZonedTimestampType localZonedTimestampType) { + if (localZonedTimestampType.getPrecision() > 6) { + return Types.TimestampNanoType.withZone(); + } return Types.TimestampType.withZone(); } diff --git a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/RowDataWrapper.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/RowDataWrapper.java index 3ef611f2ded5..920e44b24b31 100644 --- a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/RowDataWrapper.java +++ b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/RowDataWrapper.java @@ -114,19 +114,35 @@ private static PositionalGetter buildGetter(LogicalType logicalType, Type typ case TIMESTAMP_WITHOUT_TIME_ZONE: TimestampType timestampType = (TimestampType) logicalType; - return (row, pos) -> { - LocalDateTime localDateTime = - row.getTimestamp(pos, timestampType.getPrecision()).toLocalDateTime(); - return DateTimeUtil.microsFromTimestamp(localDateTime); - }; + if (type.typeId() == Type.TypeID.TIMESTAMP_NANO) { + return (row, pos) -> { + LocalDateTime localDateTime = + row.getTimestamp(pos, timestampType.getPrecision()).toLocalDateTime(); + return DateTimeUtil.nanosFromTimestamp(localDateTime); + }; + } else { + return (row, pos) -> { + LocalDateTime localDateTime = + row.getTimestamp(pos, timestampType.getPrecision()).toLocalDateTime(); + return DateTimeUtil.microsFromTimestamp(localDateTime); + }; + } case TIMESTAMP_WITH_LOCAL_TIME_ZONE: LocalZonedTimestampType lzTs = (LocalZonedTimestampType) logicalType; - return (row, pos) -> { - 
TimestampData timestampData = row.getTimestamp(pos, lzTs.getPrecision()); - return timestampData.getMillisecond() * 1000 - + timestampData.getNanoOfMillisecond() / 1000; - }; + if (type.typeId() == Type.TypeID.TIMESTAMP_NANO) { + return (row, pos) -> { + TimestampData timestampData = row.getTimestamp(pos, lzTs.getPrecision()); + return timestampData.getMillisecond() * 1_000_000L + + timestampData.getNanoOfMillisecond(); + }; + } else { + return (row, pos) -> { + TimestampData timestampData = row.getTimestamp(pos, lzTs.getPrecision()); + return timestampData.getMillisecond() * 1000L + + timestampData.getNanoOfMillisecond() / 1000; + }; + } case ROW: RowType rowType = (RowType) logicalType; diff --git a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java index 65b9d44ad4b8..3e3a29112cf4 100644 --- a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java +++ b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java @@ -112,6 +112,13 @@ public OrcValueReader primitive(Type.PrimitiveType iPrimitive, TypeDescriptio } else { return FlinkOrcReaders.timestamps(); } + case TIMESTAMP_NANO: + Types.TimestampNanoType timestampNanoType = (Types.TimestampNanoType) iPrimitive; + if (timestampNanoType.shouldAdjustToUTC()) { + return FlinkOrcReaders.timestampTzs(); + } else { + return FlinkOrcReaders.timestamps(); + } case STRING: return FlinkOrcReaders.strings(); case UUID: diff --git a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java index a467d848337d..c1b46252e18a 100644 --- a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java +++ b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriter.java @@ -145,6 +145,13 @@ public OrcValueWriter 
primitive(Type.PrimitiveType iPrimitive, LogicalType fl } else { return FlinkOrcWriters.timestamps(); } + case TIMESTAMP_NANO: + Types.TimestampNanoType timestampNanoType = (Types.TimestampNanoType) iPrimitive; + if (timestampNanoType.shouldAdjustToUTC()) { + return FlinkOrcWriters.timestampNanoTzs(); + } else { + return FlinkOrcWriters.timestampNanos(); + } case STRING: return FlinkOrcWriters.strings(); case UUID: diff --git a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java index 684842aa099c..bf19a46c05fb 100644 --- a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java +++ b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcWriters.java @@ -70,6 +70,14 @@ static OrcValueWriter timestampTzs() { return TimestampTzWriter.INSTANCE; } + static OrcValueWriter timestampNanos() { + return TimestampNanoWriter.INSTANCE; + } + + static OrcValueWriter timestampNanoTzs() { + return TimestampNanoTzWriter.INSTANCE; + } + static OrcValueWriter decimals(int precision, int scale) { if (precision <= 18) { return new Decimal18Writer(precision, scale); @@ -170,6 +178,35 @@ public void nonNullWrite(int rowId, TimestampData data, ColumnVector output) { } } + private static class TimestampNanoWriter implements OrcValueWriter { + private static final TimestampNanoWriter INSTANCE = new TimestampNanoWriter(); + + @Override + public void nonNullWrite(int rowId, TimestampData data, ColumnVector output) { + TimestampColumnVector cv = (TimestampColumnVector) output; + cv.setIsUTC(true); + // millis + OffsetDateTime offsetDateTime = data.toInstant().atOffset(ZoneOffset.UTC); + cv.time[rowId] = + offsetDateTime.toEpochSecond() * 1_000 + offsetDateTime.getNano() / 1_000_000; + cv.nanos[rowId] = offsetDateTime.getNano(); + } + } + + private static class TimestampNanoTzWriter implements OrcValueWriter { + private static final 
TimestampNanoTzWriter INSTANCE = new TimestampNanoTzWriter(); + + @SuppressWarnings("JavaInstantGetSecondsGetNano") + @Override + public void nonNullWrite(int rowId, TimestampData data, ColumnVector output) { + TimestampColumnVector cv = (TimestampColumnVector) output; + // millis + Instant instant = data.toInstant(); + cv.time[rowId] = instant.toEpochMilli(); + cv.nanos[rowId] = instant.getNano(); + } + } + private static class Decimal18Writer implements OrcValueWriter { private final int precision; private final int scale; diff --git a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java index f23a7ee3d0d3..81bb55967992 100644 --- a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java +++ b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/RowDataUtil.java @@ -69,6 +69,8 @@ public static Object convertConstant(Type type, Object value) { return (int) ((Long) value / 1000); case TIMESTAMP: // TimestampData return TimestampData.fromLocalDateTime(DateTimeUtil.timestampFromMicros((Long) value)); + case TIMESTAMP_NANO: + return TimestampData.fromLocalDateTime(DateTimeUtil.timestampFromNanos((Long) value)); case UUID: return UUIDUtil.convert((UUID) value); default: diff --git a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/StructRowData.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/StructRowData.java index 34576a1e5c0b..b469f2310f42 100644 --- a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/StructRowData.java +++ b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/StructRowData.java @@ -48,6 +48,7 @@ import org.apache.iceberg.types.Type; import org.apache.iceberg.types.Types; import org.apache.iceberg.util.ByteBuffers; +import org.apache.iceberg.util.DateTimeUtil; @Internal public class StructRowData implements RowData { @@ -120,8 +121,8 @@ public int getInt(int 
pos) { if (integer instanceof Integer) { return (int) integer; - } else if (integer instanceof LocalDate) { - return (int) ((LocalDate) integer).toEpochDay(); + } else if (integer instanceof LocalDate localDate) { + return (int) localDate.toEpochDay(); } else if (integer instanceof LocalTime) { return (int) (((LocalTime) integer).toNanoOfDay() / 1000_000); } else { @@ -185,8 +186,27 @@ private BigDecimal getDecimalInternal(int pos) { @Override public TimestampData getTimestamp(int pos, int precision) { + if (precision > 6) { + Object timeVal = struct.get(pos, Object.class); + if (timeVal instanceof OffsetDateTime) { + OffsetDateTime odt = (OffsetDateTime) timeVal; + return TimestampData.fromEpochMillis( + odt.toInstant().toEpochMilli(), odt.getNano() % 1_000_000); + } else if (timeVal instanceof LocalDateTime) { + LocalDateTime ldt = (LocalDateTime) timeVal; + return TimestampData.fromEpochMillis( + ldt.toInstant(ZoneOffset.UTC).toEpochMilli(), ldt.getNano() % 1_000_000); + } else if (timeVal instanceof Long) { + long timeLong = (Long) timeVal; + return TimestampData.fromEpochMillis( + Math.floorDiv(timeLong, 1_000_000L), (int) Math.floorMod(timeLong, 1_000_000L)); + } else { + throw new IllegalStateException("Unknown type for timestamp_ns: " + timeVal.getClass()); + } + } long timeLong = getLong(pos); - return TimestampData.fromEpochMillis(timeLong / 1000, (int) (timeLong % 1000) * 1000); + return TimestampData.fromEpochMillis( + Math.floorDiv(timeLong, 1000L), (int) Math.floorMod(timeLong, 1000L) * 1000); } @Override @@ -257,9 +277,29 @@ private Object convertValue(Type elementType, Object value) { case DECIMAL: return value; case TIMESTAMP: - long millisecond = (long) value / 1000; - int nanoOfMillisecond = (int) ((Long) value % 1000) * 1000; - return TimestampData.fromEpochMillis(millisecond, nanoOfMillisecond); + long timeMillis; + if (value instanceof LocalDateTime localDateTime) { + timeMillis = DateTimeUtil.microsFromTimestamp(localDateTime) / 1000L; + } 
else if (value instanceof OffsetDateTime offsetDateTime) { + timeMillis = DateTimeUtil.microsFromTimestamptz(offsetDateTime) / 1000L; + } else { + timeMillis = Math.floorDiv((Long) value, 1000L); + } + return TimestampData.fromEpochMillis( + timeMillis, + (int) Math.floorMod(value instanceof Long ? (Long) value : timeMillis * 1000L, 1000L) + * 1000); + case TIMESTAMP_NANO: + long nanoLong; + if (value instanceof LocalDateTime localDateTime) { + nanoLong = DateTimeUtil.nanosFromTimestamp(localDateTime); + } else if (value instanceof OffsetDateTime offsetDateTime) { + nanoLong = DateTimeUtil.nanosFromTimestamptz(offsetDateTime); + } else { + nanoLong = (Long) value; + } + return TimestampData.fromEpochMillis( + Math.floorDiv(nanoLong, 1_000_000L), (int) Math.floorMod(nanoLong, 1_000_000L)); case STRING: return StringData.fromString(value.toString()); case FIXED: diff --git a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/formats/avro/AvroToRowDataConverters.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/formats/avro/AvroToRowDataConverters.java new file mode 100644 index 000000000000..0f70e60a1b9f --- /dev/null +++ b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/formats/avro/AvroToRowDataConverters.java @@ -0,0 +1,303 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.formats.avro; + +import java.io.Serializable; +import java.lang.reflect.Array; +import java.nio.ByteBuffer; +import java.time.Instant; +import java.time.LocalDate; +import java.time.LocalDateTime; +import java.time.LocalTime; +import java.time.temporal.ChronoField; +import java.util.List; +import java.util.Map; +import org.apache.avro.generic.GenericFixed; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.generic.IndexedRecord; +import org.apache.flink.annotation.Internal; +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.data.DecimalData; +import org.apache.flink.table.data.GenericArrayData; +import org.apache.flink.table.data.GenericMapData; +import org.apache.flink.table.data.GenericRowData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.StringData; +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.table.types.logical.ArrayType; +import org.apache.flink.table.types.logical.DecimalType; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.table.types.logical.utils.LogicalTypeUtils; +import org.apache.iceberg.flink.formats.avro.typeutils.AvroSchemaConverter; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; + +/** + * Tool class used to convert from Avro {@link GenericRecord} to {@link RowData}. + * + *

      This class is adapted in Iceberg to add support for nanosecond precision timestamps + * (FLINK-39251). Once that ticket is resolved in Flink, this custom converter may be removed. + */ +@Internal +public class AvroToRowDataConverters { + + private AvroToRowDataConverters() {} + + /** + * Runtime converter that converts Avro data structures into objects of Flink Table & SQL + * internal data structures. + */ + @FunctionalInterface + public interface AvroToRowDataConverter extends Serializable { + Object convert(Object object); + } + + // ------------------------------------------------------------------------------------- + // Runtime Converters + // ------------------------------------------------------------------------------------- + + public static AvroToRowDataConverter createRowConverter(RowType rowType) { + return createRowConverter(rowType, true); + } + + public static AvroToRowDataConverter createRowConverter( + RowType rowType, boolean legacyTimestampMapping) { + final AvroToRowDataConverter[] fieldConverters = + rowType.getFields().stream() + .map(RowType.RowField::getType) + .map(type -> createNullableConverter(type, legacyTimestampMapping)) + .toArray(AvroToRowDataConverter[]::new); + final int arity = rowType.getFieldCount(); + + return avroObject -> { + IndexedRecord record = (IndexedRecord) avroObject; + GenericRowData row = new GenericRowData(arity); + for (int i = 0; i < arity; ++i) { + // avro always deserialize successfully even though the type isn't matched + // so no need to throw exception about which field can't be deserialized + row.setField(i, fieldConverters[i].convert(record.get(i))); + } + return row; + }; + } + + /** Creates a runtime converter which is null safe. 
*/ + private static AvroToRowDataConverter createNullableConverter( + LogicalType type, boolean legacyTimestampMapping) { + final AvroToRowDataConverter converter = createConverter(type, legacyTimestampMapping); + return avroObject -> { + if (avroObject == null) { + return null; + } + return converter.convert(avroObject); + }; + } + + /** Creates a runtime converter which assuming input object is not null. */ + private static AvroToRowDataConverter createConverter( + LogicalType type, boolean legacyTimestampMapping) { + switch (type.getTypeRoot()) { + case NULL: + return avroObject -> null; + case TINYINT: + return avroObject -> ((Integer) avroObject).byteValue(); + case SMALLINT: + return avroObject -> ((Integer) avroObject).shortValue(); + case BOOLEAN: // boolean + case INTEGER: // int + case INTERVAL_YEAR_MONTH: // long + case BIGINT: // long + case INTERVAL_DAY_TIME: // long + case FLOAT: // float + case DOUBLE: // double + return avroObject -> avroObject; + case DATE: + return AvroToRowDataConverters::convertToDate; + case TIME_WITHOUT_TIME_ZONE: + return AvroToRowDataConverters::convertToTime; + case TIMESTAMP_WITHOUT_TIME_ZONE: + // Iceberg: Added support for nanoseconds precision (FLINK-39251) + return avroObject -> convertToTimestamp(avroObject, type); + case TIMESTAMP_WITH_LOCAL_TIME_ZONE: + if (legacyTimestampMapping) { + throw new UnsupportedOperationException("Unsupported type: " + type); + } else { + // Iceberg: Added support for nanoseconds precision (FLINK-39251) + return avroObject -> convertToTimestamp(avroObject, type); + } + case CHAR: + case VARCHAR: + return avroObject -> StringData.fromString(avroObject.toString()); + case BINARY: + case VARBINARY: + return AvroToRowDataConverters::convertToBytes; + case DECIMAL: + return createDecimalConverter((DecimalType) type); + case ARRAY: + return createArrayConverter((ArrayType) type, legacyTimestampMapping); + case ROW: + return createRowConverter((RowType) type, legacyTimestampMapping); + case MAP: + case MULTISET: + 
return createMapConverter(type, legacyTimestampMapping); + case RAW: + default: + throw new UnsupportedOperationException("Unsupported type: " + type); + } + } + + private static AvroToRowDataConverter createDecimalConverter(DecimalType decimalType) { + final int precision = decimalType.getPrecision(); + final int scale = decimalType.getScale(); + return avroObject -> { + final byte[] bytes; + if (avroObject instanceof GenericFixed) { + bytes = ((GenericFixed) avroObject).bytes(); + } else if (avroObject instanceof ByteBuffer) { + ByteBuffer byteBuffer = (ByteBuffer) avroObject; + bytes = new byte[byteBuffer.remaining()]; + byteBuffer.get(bytes); + } else { + bytes = (byte[]) avroObject; + } + return DecimalData.fromUnscaledBytes(bytes, precision, scale); + }; + } + + private static AvroToRowDataConverter createArrayConverter( + ArrayType arrayType, boolean legacyTimestampMapping) { + final AvroToRowDataConverter elementConverter = + createNullableConverter(arrayType.getElementType(), legacyTimestampMapping); + final Class elementClass = + LogicalTypeUtils.toInternalConversionClass(arrayType.getElementType()); + + return avroObject -> { + final List list = (List) avroObject; + final int length = list.size(); + final Object[] array = (Object[]) Array.newInstance(elementClass, length); + for (int i = 0; i < length; ++i) { + array[i] = elementConverter.convert(list.get(i)); + } + return new GenericArrayData(array); + }; + } + + private static AvroToRowDataConverter createMapConverter( + LogicalType type, boolean legacyTimestampMapping) { + final AvroToRowDataConverter keyConverter = + createConverter(DataTypes.STRING().getLogicalType(), legacyTimestampMapping); + final AvroToRowDataConverter valueConverter = + createNullableConverter( + AvroSchemaConverter.extractValueTypeToAvroMap(type), legacyTimestampMapping); + + return avroObject -> { + final Map map = (Map) avroObject; + Map result = Maps.newHashMap(); + for (Map.Entry entry : map.entrySet()) { + Object key = 
keyConverter.convert(entry.getKey()); + Object value = valueConverter.convert(entry.getValue()); + result.put(key, value); + } + return new GenericMapData(result); + }; + } + + private static TimestampData convertToTimestamp(Object object, LogicalType type) { + int precision = 3; + if (type instanceof org.apache.flink.table.types.logical.TimestampType) { + precision = ((org.apache.flink.table.types.logical.TimestampType) type).getPrecision(); + } else if (type instanceof org.apache.flink.table.types.logical.LocalZonedTimestampType) { + precision = + ((org.apache.flink.table.types.logical.LocalZonedTimestampType) type).getPrecision(); + } + + if (object instanceof Long) { + long timeLong = (Long) object; + if (precision <= 3) { + return TimestampData.fromEpochMillis(timeLong); + } else if (precision <= 6) { + return TimestampData.fromEpochMillis( + Math.floorDiv(timeLong, 1000L), (int) Math.floorMod(timeLong, 1000L) * 1000); + } else { + // Iceberg: Added support for nanoseconds precision (FLINK-39251) + return TimestampData.fromEpochMillis( + Math.floorDiv(timeLong, 1_000_000L), (int) Math.floorMod(timeLong, 1_000_000L)); + } + } else if (object instanceof Instant) { + return TimestampData.fromInstant((Instant) object); + } else if (object instanceof LocalDateTime) { + return TimestampData.fromLocalDateTime((LocalDateTime) object); + } else { + JodaConverter jodaConverter = JodaConverter.getConverter(); + if (jodaConverter != null) { + return TimestampData.fromEpochMillis(jodaConverter.convertTimestamp(object)); + } else { + throw new IllegalArgumentException( + "Unexpected object type for TIMESTAMP logical type. 
Received: " + object); + } + } + } + + private static int convertToDate(Object object) { + if (object instanceof Integer) { + return (Integer) object; + } else if (object instanceof LocalDate) { + return (int) ((LocalDate) object).toEpochDay(); + } else { + JodaConverter jodaConverter = JodaConverter.getConverter(); + if (jodaConverter != null) { + return (int) jodaConverter.convertDate(object); + } else { + throw new IllegalArgumentException( + "Unexpected object type for DATE logical type. Received: " + object); + } + } + } + + private static int convertToTime(Object object) { + final int millis; + if (object instanceof Integer) { + millis = (Integer) object; + } else if (object instanceof LocalTime) { + millis = ((LocalTime) object).get(ChronoField.MILLI_OF_DAY); + } else { + JodaConverter jodaConverter = JodaConverter.getConverter(); + if (jodaConverter != null) { + millis = jodaConverter.convertTime(object); + } else { + throw new IllegalArgumentException( + "Unexpected object type for TIME logical type. Received: " + object); + } + } + return millis; + } + + private static byte[] convertToBytes(Object object) { + if (object instanceof GenericFixed) { + return ((GenericFixed) object).bytes(); + } else if (object instanceof ByteBuffer) { + ByteBuffer byteBuffer = (ByteBuffer) object; + byte[] bytes = new byte[byteBuffer.remaining()]; + byteBuffer.get(bytes); + return bytes; + } else { + return (byte[]) object; + } + } +} diff --git a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/formats/avro/JodaConverter.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/formats/avro/JodaConverter.java new file mode 100644 index 000000000000..c30b78023345 --- /dev/null +++ b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/formats/avro/JodaConverter.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.formats.avro; + +import org.joda.time.DateTime; +import org.joda.time.DateTimeFieldType; +import org.joda.time.LocalDate; +import org.joda.time.LocalTime; + +/** + * Encapsulates joda optional dependency. Instantiates this class only if joda is available on the + * classpath. 
+ */ +@SuppressWarnings("JavaUtilDate") +class JodaConverter { + + private static JodaConverter instance; + private static boolean instantiated = false; + + public static JodaConverter getConverter() { + if (instantiated) { + return instance; + } + + try { + Class.forName( + "org.joda.time.DateTime", false, Thread.currentThread().getContextClassLoader()); + instance = new JodaConverter(); + } catch (ClassNotFoundException e) { + instance = null; + } finally { + instantiated = true; + } + return instance; + } + + public long convertDate(Object object) { + final LocalDate value = (LocalDate) object; + return value.toDate().getTime(); + } + + public int convertTime(Object object) { + final LocalTime value = (LocalTime) object; + return value.get(DateTimeFieldType.millisOfDay()); + } + + public long convertTimestamp(Object object) { + final DateTime value = (DateTime) object; + return value.toDate().getTime(); + } + + private JodaConverter() {} +} diff --git a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/formats/avro/RowDataToAvroConverters.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/formats/avro/RowDataToAvroConverters.java new file mode 100644 index 000000000000..d4c7e4282d6e --- /dev/null +++ b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/formats/avro/RowDataToAvroConverters.java @@ -0,0 +1,394 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.formats.avro; + +import java.io.Serializable; +import java.nio.ByteBuffer; +import java.time.ZoneOffset; +import java.util.List; +import java.util.Map; +import org.apache.avro.Schema; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.util.Utf8; +import org.apache.flink.annotation.Internal; +import org.apache.flink.table.data.ArrayData; +import org.apache.flink.table.data.DecimalData; +import org.apache.flink.table.data.MapData; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.TimestampData; +import org.apache.flink.table.types.logical.ArrayType; +import org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.util.CollectionUtil; +import org.apache.iceberg.flink.formats.avro.typeutils.AvroSchemaConverter; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; + +/** + * Tool class used to convert from {@link RowData} to Avro {@link GenericRecord}. + * + *

      This class is adapted in Iceberg to add support for nanosecond precision timestamps + * (FLINK-39251). Once that ticket is resolved in Flink, this custom converter may be removed. + */ +@Internal +public class RowDataToAvroConverters { + + private RowDataToAvroConverters() {} + + // -------------------------------------------------------------------------------- + // Runtime Converters + // -------------------------------------------------------------------------------- + + /** + * Runtime converter that converts objects of Flink Table & SQL internal data structures to + * corresponding Avro data structures. + */ + @FunctionalInterface + public interface RowDataToAvroConverter extends Serializable { + Object convert(Schema schema, Object object); + } + + // -------------------------------------------------------------------------------- + // IMPORTANT! We use anonymous classes instead of lambdas for a reason here. It is + // necessary because the maven shade plugin cannot relocate classes in + // SerializedLambdas (MSHADE-260). On the other hand we want to relocate Avro for + // sql-client uber jars. + // -------------------------------------------------------------------------------- + + /** + * Creates a runtime converter according to the given logical type that converts objects of Flink + * Table & SQL internal data structures to corresponding Avro data structures. 
+ */ + public static RowDataToAvroConverter createConverter(LogicalType type) { + return createConverter(type, true); + } + + @SuppressWarnings("checkstyle:MethodLength") + public static RowDataToAvroConverter createConverter( + LogicalType type, boolean legacyTimestampMapping) { + final RowDataToAvroConverter converter; + switch (type.getTypeRoot()) { + case NULL: + converter = + new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + return null; + } + }; + break; + case TINYINT: + converter = + new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + return ((Byte) object).intValue(); + } + }; + break; + case SMALLINT: + converter = + new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + return ((Short) object).intValue(); + } + }; + break; + case BOOLEAN: // boolean + case INTEGER: // int + case INTERVAL_YEAR_MONTH: // long + case BIGINT: // long + case INTERVAL_DAY_TIME: // long + case FLOAT: // float + case DOUBLE: // double + case TIME_WITHOUT_TIME_ZONE: // int + case DATE: // int + converter = + new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + return object; + } + }; + break; + case CHAR: + case VARCHAR: + converter = + new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + return new Utf8(object.toString()); + } + }; + break; + case BINARY: + case VARBINARY: + converter = + new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + return ByteBuffer.wrap((byte[]) object); + 
} + }; + break; + // Iceberg: Added support for nanoseconds precision (FLINK-39251) + case TIMESTAMP_WITHOUT_TIME_ZONE: + final int tzPrecision; + if (type instanceof org.apache.flink.table.types.logical.TimestampType) { + tzPrecision = ((org.apache.flink.table.types.logical.TimestampType) type).getPrecision(); + } else { + tzPrecision = 3; + } + if (legacyTimestampMapping) { + converter = + new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + TimestampData timestampData = (TimestampData) object; + if (tzPrecision <= 3) { + return timestampData.getMillisecond(); + } else if (tzPrecision <= 6) { + return timestampData.getMillisecond() * 1000L + + timestampData.getNanoOfMillisecond() / 1000; + } else { + // Iceberg: Added support for nanoseconds precision (FLINK-39251) + return timestampData.getMillisecond() * 1_000_000L + + timestampData.getNanoOfMillisecond(); + } + } + }; + } else { + converter = + new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + TimestampData timestampData = (TimestampData) object; + java.time.Instant instant = + timestampData.toLocalDateTime().toInstant(ZoneOffset.UTC); + if (tzPrecision <= 3) { + return instant.toEpochMilli(); + } else if (tzPrecision <= 6) { + return instant.getEpochSecond() * 1_000_000L + instant.getNano() / 1000; + } else { + // Iceberg: Added support for nanoseconds precision (FLINK-39251) + return instant.getEpochSecond() * 1_000_000_000L + instant.getNano(); + } + } + }; + } + break; + // Iceberg: Added support for nanoseconds precision (FLINK-39251) + case TIMESTAMP_WITH_LOCAL_TIME_ZONE: + final int ltzPrecision; + if (type instanceof org.apache.flink.table.types.logical.LocalZonedTimestampType) { + ltzPrecision = + ((org.apache.flink.table.types.logical.LocalZonedTimestampType) type).getPrecision(); + } else { + 
ltzPrecision = 3; + } + if (legacyTimestampMapping) { + throw new UnsupportedOperationException("Unsupported type: " + type); + } else { + converter = + new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + TimestampData timestampData = (TimestampData) object; + if (ltzPrecision <= 3) { + return timestampData.getMillisecond(); + } else if (ltzPrecision <= 6) { + return timestampData.getMillisecond() * 1000L + + timestampData.getNanoOfMillisecond() / 1000; + } else { + // Iceberg: Added support for nanoseconds precision (FLINK-39251) + return timestampData.getMillisecond() * 1_000_000L + + timestampData.getNanoOfMillisecond(); + } + } + }; + } + break; + case DECIMAL: + converter = + new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + return ByteBuffer.wrap(((DecimalData) object).toUnscaledBytes()); + } + }; + break; + case ARRAY: + converter = createArrayConverter((ArrayType) type, legacyTimestampMapping); + break; + case ROW: + converter = createRowConverter((RowType) type, legacyTimestampMapping); + break; + case MAP: + case MULTISET: + converter = createMapConverter(type, legacyTimestampMapping); + break; + case RAW: + default: + throw new UnsupportedOperationException("Unsupported type: " + type); + } + + // wrap into nullable converter + return new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + if (object == null) { + return null; + } + + // get actual schema if it is a nullable schema + Schema actualSchema; + if (schema.getType() == Schema.Type.UNION) { + List types = schema.getTypes(); + int size = types.size(); + if (size == 2 && types.get(1).getType() == Schema.Type.NULL) { + actualSchema = types.get(0); + } else if (size == 2 && types.get(0).getType() == 
Schema.Type.NULL) { + actualSchema = types.get(1); + } else { + throw new IllegalArgumentException( + "The Avro schema is not a nullable type: " + schema.toString()); + } + } else { + actualSchema = schema; + } + return converter.convert(actualSchema, object); + } + }; + } + + private static RowDataToAvroConverter createRowConverter( + RowType rowType, boolean legacyTimestampMapping) { + final RowDataToAvroConverter[] fieldConverters = + rowType.getChildren().stream() + .map(legacyType -> createConverter(legacyType, legacyTimestampMapping)) + .toArray(RowDataToAvroConverter[]::new); + final LogicalType[] fieldTypes = + rowType.getFields().stream().map(RowType.RowField::getType).toArray(LogicalType[]::new); + final RowData.FieldGetter[] fieldGetters = new RowData.FieldGetter[fieldTypes.length]; + for (int i = 0; i < fieldTypes.length; i++) { + fieldGetters[i] = RowData.createFieldGetter(fieldTypes[i], i); + } + final int length = rowType.getFieldCount(); + + return new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + final RowData row = (RowData) object; + final List fields = schema.getFields(); + final GenericRecord record = new GenericData.Record(schema); + for (int i = 0; i < length; ++i) { + final Schema.Field schemaField = fields.get(i); + try { + Object avroObject = + fieldConverters[i].convert( + schemaField.schema(), fieldGetters[i].getFieldOrNull(row)); + record.put(i, avroObject); + } catch (Throwable t) { + throw new RuntimeException( + String.format("Fail to serialize at field: %s.", schemaField.name()), t); + } + } + return record; + } + }; + } + + private static RowDataToAvroConverter createArrayConverter( + ArrayType arrayType, boolean legacyTimestampMapping) { + LogicalType elementType = arrayType.getElementType(); + final ArrayData.ElementGetter elementGetter = ArrayData.createElementGetter(elementType); + final RowDataToAvroConverter 
elementConverter = + createConverter(arrayType.getElementType(), legacyTimestampMapping); + + return new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + final Schema elementSchema = schema.getElementType(); + ArrayData arrayData = (ArrayData) object; + List list = Lists.newArrayList(); + for (int i = 0; i < arrayData.size(); ++i) { + list.add( + elementConverter.convert( + elementSchema, elementGetter.getElementOrNull(arrayData, i))); + } + return list; + } + }; + } + + private static RowDataToAvroConverter createMapConverter( + LogicalType type, boolean legacyTimestampMapping) { + LogicalType valueType = AvroSchemaConverter.extractValueTypeToAvroMap(type); + final ArrayData.ElementGetter valueGetter = ArrayData.createElementGetter(valueType); + final RowDataToAvroConverter valueConverter = + createConverter(valueType, legacyTimestampMapping); + + return new RowDataToAvroConverter() { + private static final long serialVersionUID = 1L; + + @Override + public Object convert(Schema schema, Object object) { + final Schema valueSchema = schema.getValueType(); + final MapData mapData = (MapData) object; + final ArrayData keyArray = mapData.keyArray(); + final ArrayData valueArray = mapData.valueArray(); + final Map map = CollectionUtil.newHashMapWithExpectedSize(mapData.size()); + for (int i = 0; i < mapData.size(); ++i) { + final String key = keyArray.getString(i).toString(); + final Object value = + valueConverter.convert(valueSchema, valueGetter.getElementOrNull(valueArray, i)); + map.put(key, value); + } + return map; + } + }; + } +} diff --git a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/formats/avro/typeutils/AvroSchemaConverter.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/formats/avro/typeutils/AvroSchemaConverter.java new file mode 100644 index 000000000000..347631c7f451 --- /dev/null +++ 
b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/formats/avro/typeutils/AvroSchemaConverter.java @@ -0,0 +1,625 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.formats.avro.typeutils; + +import java.util.List; +import org.apache.avro.LogicalTypes; +import org.apache.avro.Schema; +import org.apache.avro.SchemaBuilder; +import org.apache.avro.SchemaParseException; +import org.apache.avro.specific.SpecificData; +import org.apache.avro.specific.SpecificRecord; +import org.apache.flink.api.common.typeinfo.TypeInformation; +import org.apache.flink.api.common.typeinfo.Types; +import org.apache.flink.api.java.typeutils.RowTypeInfo; +import org.apache.flink.formats.avro.AvroRowDataDeserializationSchema; +import org.apache.flink.formats.avro.AvroRowDataSerializationSchema; +import org.apache.flink.table.api.DataTypes; +import org.apache.flink.table.types.AtomicDataType; +import org.apache.flink.table.types.DataType; +import org.apache.flink.table.types.logical.ArrayType; +import org.apache.flink.table.types.logical.DecimalType; +import org.apache.flink.table.types.logical.IntType; +import org.apache.flink.table.types.logical.LocalZonedTimestampType; +import 
org.apache.flink.table.types.logical.LogicalType; +import org.apache.flink.table.types.logical.LogicalTypeFamily; +import org.apache.flink.table.types.logical.MapType; +import org.apache.flink.table.types.logical.MultisetType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.table.types.logical.TimeType; +import org.apache.flink.table.types.logical.TimestampType; +import org.apache.flink.table.types.logical.TypeInformationRawType; +import org.apache.flink.types.Row; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; + +/** + * Converts an Avro schema into Flink's type information. It uses {@link RowTypeInfo} for + * representing objects and converts Avro types into types that are compatible with Flink's Table + * & SQL API. + * + *

      Note: Changes in this class need to be kept in sync with the corresponding runtime classes + * {@link AvroRowDataDeserializationSchema} and {@link AvroRowDataSerializationSchema}. + * + *

      This class is adapted in Iceberg to support custom 'timestamp-nanos' and + * 'local-timestamp-nanos' logical types (FLINK-39251). Once that ticket is resolved in Flink, these + * custom types may be removed. + */ +public class AvroSchemaConverter { + + private AvroSchemaConverter() { + // private + } + + /** + * Converts an Avro class into a nested row structure with deterministic field order and data + * types that are compatible with Flink's Table & SQL API. + * + * @param avroClass Avro specific record that contains schema information + * @return type information matching the schema + */ + @SuppressWarnings("unchecked") + public static TypeInformation convertToTypeInfo( + Class avroClass) { + return convertToTypeInfo(avroClass, true); + } + + /** + * Converts an Avro class into a nested row structure with deterministic field order and data + * types that are compatible with Flink's Table & SQL API. + * + * @param avroClass Avro specific record that contains schema information + * @param legacyTimestampMapping legacy mapping of timestamp types + * @return type information matching the schema + */ + @SuppressWarnings("unchecked") + public static TypeInformation convertToTypeInfo( + Class avroClass, boolean legacyTimestampMapping) { + Preconditions.checkNotNull(avroClass, "Avro specific record class must not be null."); + // determine schema to retrieve deterministic field order + final Schema schema = SpecificData.get().getSchema(avroClass); + return (TypeInformation) convertToTypeInfo(schema, legacyTimestampMapping); + } + + /** + * Converts an Avro schema string into a nested row structure with deterministic field order and + * data types that are compatible with Flink's Table & SQL API. 
+ * + * @param avroSchemaString Avro schema definition string + * @return type information matching the schema + */ + @SuppressWarnings("unchecked") + public static TypeInformation convertToTypeInfo(String avroSchemaString) { + return convertToTypeInfo(avroSchemaString, true); + } + + /** + * Converts an Avro schema string into a nested row structure with deterministic field order and + * data types that are compatible with Flink's Table & SQL API. + * + * @param avroSchemaString Avro schema definition string + * @param legacyTimestampMapping legacy mapping of timestamp types + * @return type information matching the schema + */ + @SuppressWarnings("unchecked") + public static TypeInformation convertToTypeInfo( + String avroSchemaString, boolean legacyTimestampMapping) { + Preconditions.checkNotNull(avroSchemaString, "Avro schema must not be null."); + final Schema schema; + try { + schema = new Schema.Parser().parse(avroSchemaString); + } catch (SchemaParseException e) { + throw new IllegalArgumentException("Could not parse Avro schema string.", e); + } + return (TypeInformation) convertToTypeInfo(schema, legacyTimestampMapping); + } + + private static TypeInformation convertToTypeInfo( + Schema schema, boolean legacyTimestampMapping) { + switch (schema.getType()) { + case RECORD: + final List fields = schema.getFields(); + + final TypeInformation[] types = new TypeInformation[fields.size()]; + final String[] names = new String[fields.size()]; + for (int i = 0; i < fields.size(); i++) { + final Schema.Field field = fields.get(i); + types[i] = convertToTypeInfo(field.schema(), legacyTimestampMapping); + names[i] = field.name(); + } + return Types.ROW_NAMED(names, types); + case ENUM: + return Types.STRING; + case ARRAY: + // result type might either be ObjectArrayTypeInfo or BasicArrayTypeInfo for Strings + return Types.OBJECT_ARRAY( + convertToTypeInfo(schema.getElementType(), legacyTimestampMapping)); + case MAP: + return Types.MAP( + Types.STRING, 
convertToTypeInfo(schema.getValueType(), legacyTimestampMapping)); + case UNION: + final Schema actualSchema; + if (schema.getTypes().size() == 2 + && schema.getTypes().get(0).getType() == Schema.Type.NULL) { + actualSchema = schema.getTypes().get(1); + } else if (schema.getTypes().size() == 2 + && schema.getTypes().get(1).getType() == Schema.Type.NULL) { + actualSchema = schema.getTypes().get(0); + } else if (schema.getTypes().size() == 1) { + actualSchema = schema.getTypes().get(0); + } else { + // use Kryo for serialization + return Types.GENERIC(Object.class); + } + return convertToTypeInfo(actualSchema, legacyTimestampMapping); + case FIXED: + // logical decimal type + if (schema.getLogicalType() instanceof LogicalTypes.Decimal) { + return Types.BIG_DEC; + } + // convert fixed size binary data to primitive byte arrays + return Types.PRIMITIVE_ARRAY(Types.BYTE); + case STRING: + // convert Avro's Utf8/CharSequence to String + return Types.STRING; + case BYTES: + // logical decimal type + if (schema.getLogicalType() instanceof LogicalTypes.Decimal) { + return Types.BIG_DEC; + } + return Types.PRIMITIVE_ARRAY(Types.BYTE); + case INT: + // logical date and time type + final org.apache.avro.LogicalType logicalType = schema.getLogicalType(); + if (logicalType == LogicalTypes.date()) { + return Types.SQL_DATE; + } else if (logicalType == LogicalTypes.timeMillis()) { + return Types.SQL_TIME; + } + return Types.INT; + case LONG: + if (legacyTimestampMapping) { + if (schema.getLogicalType() == LogicalTypes.timestampMillis() + || schema.getLogicalType() == LogicalTypes.timestampMicros() + // Iceberg: Added support for custom nanosecond logical type (FLINK-39251) + || (schema.getLogicalType() != null + && schema.getLogicalType().getName().equals("timestamp-nanos"))) { + return Types.SQL_TIMESTAMP; + } else if (schema.getLogicalType() == LogicalTypes.timeMicros() + || schema.getLogicalType() == LogicalTypes.timeMillis()) { + return Types.SQL_TIME; + } + } else { + // Avro 
logical timestamp types to Flink DataStream timestamp types + if (schema.getLogicalType() == LogicalTypes.timestampMillis() + || schema.getLogicalType() == LogicalTypes.timestampMicros() + // Iceberg: Added support for custom nanosecond logical type (FLINK-39251) + || (schema.getLogicalType() != null + && schema.getLogicalType().getName().equals("timestamp-nanos"))) { + return Types.INSTANT; + } else if (schema.getLogicalType() == LogicalTypes.localTimestampMillis() + || schema.getLogicalType() == LogicalTypes.localTimestampMicros() + // Iceberg: Added support for custom nanosecond logical type (FLINK-39251) + || (schema.getLogicalType() != null + && schema.getLogicalType().getName().equals("local-timestamp-nanos"))) { + return Types.LOCAL_DATE_TIME; + } else if (schema.getLogicalType() == LogicalTypes.timeMicros() + || schema.getLogicalType() == LogicalTypes.timeMillis()) { + return Types.SQL_TIME; + } + } + return Types.LONG; + case FLOAT: + return Types.FLOAT; + case DOUBLE: + return Types.DOUBLE; + case BOOLEAN: + return Types.BOOLEAN; + case NULL: + return Types.VOID; + } + throw new IllegalArgumentException("Unsupported Avro type '" + schema.getType() + "'."); + } + + /** + * Converts an Avro schema string into a nested row structure with deterministic field order and + * data types that are compatible with Flink's Table & SQL API. + * + * @param avroSchemaString Avro schema definition string + * @return data type matching the schema + */ + public static DataType convertToDataType(String avroSchemaString) { + return convertToDataType(avroSchemaString, true); + } + + /** + * Converts an Avro schema string into a nested row structure with deterministic field order and + * data types that are compatible with Flink's Table & SQL API. 
+ * + * @param avroSchemaString Avro schema definition string + * @param legacyTimestampMapping legacy mapping of local timestamps + * @return data type matching the schema + */ + public static DataType convertToDataType( + String avroSchemaString, boolean legacyTimestampMapping) { + Preconditions.checkNotNull(avroSchemaString, "Avro schema must not be null."); + final Schema schema; + try { + schema = new Schema.Parser().parse(avroSchemaString); + } catch (SchemaParseException e) { + throw new IllegalArgumentException("Could not parse Avro schema string.", e); + } + return convertToDataType(schema, legacyTimestampMapping); + } + + @SuppressWarnings("deprecation") + private static DataType convertToDataType(Schema schema, boolean legacyMapping) { + switch (schema.getType()) { + case RECORD: + final List schemaFields = schema.getFields(); + + final DataTypes.Field[] fields = new DataTypes.Field[schemaFields.size()]; + for (int i = 0; i < schemaFields.size(); i++) { + final Schema.Field field = schemaFields.get(i); + fields[i] = + DataTypes.FIELD(field.name(), convertToDataType(field.schema(), legacyMapping)); + } + return DataTypes.ROW(fields).notNull(); + case ENUM: + return DataTypes.STRING().notNull(); + case ARRAY: + return DataTypes.ARRAY(convertToDataType(schema.getElementType(), legacyMapping)).notNull(); + case MAP: + return DataTypes.MAP( + DataTypes.STRING().notNull(), + convertToDataType(schema.getValueType(), legacyMapping)) + .notNull(); + case UNION: + final Schema actualSchema; + final boolean nullable; + if (schema.getTypes().size() == 2 + && schema.getTypes().get(0).getType() == Schema.Type.NULL) { + actualSchema = schema.getTypes().get(1); + nullable = true; + } else if (schema.getTypes().size() == 2 + && schema.getTypes().get(1).getType() == Schema.Type.NULL) { + actualSchema = schema.getTypes().get(0); + nullable = true; + } else if (schema.getTypes().size() == 1) { + actualSchema = schema.getTypes().get(0); + nullable = false; + } else { + // 
use Kryo for serialization + return new AtomicDataType( + new TypeInformationRawType<>(false, Types.GENERIC(Object.class))); + } + DataType converted = convertToDataType(actualSchema, legacyMapping); + return nullable ? converted.nullable() : converted; + case FIXED: + // logical decimal type + if (schema.getLogicalType() instanceof LogicalTypes.Decimal) { + final LogicalTypes.Decimal decimalType = (LogicalTypes.Decimal) schema.getLogicalType(); + return DataTypes.DECIMAL(decimalType.getPrecision(), decimalType.getScale()).notNull(); + } + // convert fixed size binary data to primitive byte arrays + return DataTypes.VARBINARY(schema.getFixedSize()).notNull(); + case STRING: + // convert Avro's Utf8/CharSequence to String + return DataTypes.STRING().notNull(); + case BYTES: + // logical decimal type + if (schema.getLogicalType() instanceof LogicalTypes.Decimal) { + final LogicalTypes.Decimal decimalType = (LogicalTypes.Decimal) schema.getLogicalType(); + return DataTypes.DECIMAL(decimalType.getPrecision(), decimalType.getScale()).notNull(); + } + return DataTypes.BYTES().notNull(); + case INT: + // logical date and time type + final org.apache.avro.LogicalType logicalType = schema.getLogicalType(); + if (logicalType == LogicalTypes.date()) { + return DataTypes.DATE().notNull(); + } else if (logicalType == LogicalTypes.timeMillis()) { + return DataTypes.TIME(3).notNull(); + } + return DataTypes.INT().notNull(); + case LONG: + if (legacyMapping) { + // Avro logical timestamp types to Flink SQL timestamp types + if (schema.getLogicalType() == LogicalTypes.timestampMillis()) { + return DataTypes.TIMESTAMP(3).notNull(); + } else if (schema.getLogicalType() == LogicalTypes.timestampMicros()) { + return DataTypes.TIMESTAMP(6).notNull(); + } else if (schema.getLogicalType() != null + && schema.getLogicalType().getName().equals("timestamp-nanos")) { + // Iceberg: Added support for custom nanosecond logical type (FLINK-39251) + return DataTypes.TIMESTAMP(9).notNull(); + } 
else if (schema.getLogicalType() == LogicalTypes.timeMillis()) { + return DataTypes.TIME(3).notNull(); + } else if (schema.getLogicalType() == LogicalTypes.timeMicros()) { + return DataTypes.TIME(6).notNull(); + } + } else { + // Avro logical timestamp types to Flink SQL timestamp types + if (schema.getLogicalType() == LogicalTypes.timestampMillis()) { + return DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE(3).notNull(); + } else if (schema.getLogicalType() == LogicalTypes.timestampMicros()) { + return DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE(6).notNull(); + } else if (schema.getLogicalType() != null + && schema.getLogicalType().getName().equals("timestamp-nanos")) { + // Iceberg: Added support for custom nanosecond logical type (FLINK-39251) + return DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE(9).notNull(); + } else if (schema.getLogicalType() == LogicalTypes.timeMillis()) { + return DataTypes.TIME(3).notNull(); + } else if (schema.getLogicalType() == LogicalTypes.timeMicros()) { + return DataTypes.TIME(6).notNull(); + } else if (schema.getLogicalType() == LogicalTypes.localTimestampMillis()) { + return DataTypes.TIMESTAMP(3).notNull(); + } else if (schema.getLogicalType() == LogicalTypes.localTimestampMicros()) { + return DataTypes.TIMESTAMP(6).notNull(); + } else if (schema.getLogicalType() != null + && schema.getLogicalType().getName().equals("local-timestamp-nanos")) { + // Iceberg: Added support for custom nanosecond logical type (FLINK-39251) + return DataTypes.TIMESTAMP(9).notNull(); + } + } + + return DataTypes.BIGINT().notNull(); + case FLOAT: + return DataTypes.FLOAT().notNull(); + case DOUBLE: + return DataTypes.DOUBLE().notNull(); + case BOOLEAN: + return DataTypes.BOOLEAN().notNull(); + case NULL: + return DataTypes.NULL(); + } + throw new IllegalArgumentException("Unsupported Avro type '" + schema.getType() + "'."); + } + + /** + * Converts Flink SQL {@link LogicalType} (can be nested) into an Avro schema. + * + *

      Use "org.apache.flink.avro.generated.record" as the type name. + * + * @param schema the schema type, usually it should be the top level record type, e.g. not a + * nested type + * @return Avro's {@link Schema} matching this logical type. + */ + public static Schema convertToSchema(LogicalType schema) { + return convertToSchema(schema, true); + } + + /** + * Converts Flink SQL {@link LogicalType} (can be nested) into an Avro schema. + * + *

      Use "org.apache.flink.avro.generated.record" as the type name. + * + * @param schema the schema type, usually it should be the top level record type, e.g. not a + * nested type + * @param legacyTimestampMapping whether to use the legacy timestamp mapping + * @return Avro's {@link Schema} matching this logical type. + */ + public static Schema convertToSchema(LogicalType schema, boolean legacyTimestampMapping) { + return convertToSchema( + schema, "org.apache.flink.avro.generated.record", legacyTimestampMapping); + } + + /** + * Converts Flink SQL {@link LogicalType} (can be nested) into an Avro schema. + * + *

      The "{rowName}_" is used as the nested row type name prefix in order to generate the right + * schema. Nested record type that only differs with type name is still compatible. + * + * @param logicalType logical type + * @param rowName the record name + * @return Avro's {@link Schema} matching this logical type. + */ + public static Schema convertToSchema(LogicalType logicalType, String rowName) { + return convertToSchema(logicalType, rowName, true); + } + + /** + * Converts Flink SQL {@link LogicalType} (can be nested) into an Avro schema. + * + *

      The "{rowName}_" is used as the nested row type name prefix in order to generate the right + * schema. Nested record type that only differs with type name is still compatible. + * + * @param logicalType logical type + * @param rowName the record name + * @param legacyTimestampMapping whether to use legal timestamp mapping + * @return Avro's {@link Schema} matching this logical type. + */ + public static Schema convertToSchema( + LogicalType logicalType, String rowName, boolean legacyTimestampMapping) { + int precision; + boolean nullable = logicalType.isNullable(); + switch (logicalType.getTypeRoot()) { + case NULL: + return SchemaBuilder.builder().nullType(); + case BOOLEAN: + Schema bool = SchemaBuilder.builder().booleanType(); + return nullable ? nullableSchema(bool) : bool; + case TINYINT: + case SMALLINT: + case INTEGER: + Schema integer = SchemaBuilder.builder().intType(); + return nullable ? nullableSchema(integer) : integer; + case BIGINT: + Schema bigint = SchemaBuilder.builder().longType(); + return nullable ? nullableSchema(bigint) : bigint; + case FLOAT: + Schema floatSchema = SchemaBuilder.builder().floatType(); + return nullable ? nullableSchema(floatSchema) : floatSchema; + case DOUBLE: + Schema doubleSchema = SchemaBuilder.builder().doubleType(); + return nullable ? nullableSchema(doubleSchema) : doubleSchema; + case CHAR: + case VARCHAR: + Schema str = SchemaBuilder.builder().stringType(); + return nullable ? nullableSchema(str) : str; + case BINARY: + case VARBINARY: + Schema binary = SchemaBuilder.builder().bytesType(); + return nullable ? 
nullableSchema(binary) : binary; + case TIMESTAMP_WITHOUT_TIME_ZONE: + // use long to represents Timestamp + final TimestampType timestampType = (TimestampType) logicalType; + precision = timestampType.getPrecision(); + org.apache.avro.LogicalType avroLogicalType; + if (legacyTimestampMapping) { + if (precision <= 3) { + avroLogicalType = LogicalTypes.timestampMillis(); + } else { + throw new IllegalArgumentException( + "Avro does not support TIMESTAMP type " + + "with precision: " + + precision + + ", it only supports precision less than 3."); + } + } else { + if (precision <= 3) { + avroLogicalType = LogicalTypes.localTimestampMillis(); + } else if (precision <= 6) { + avroLogicalType = LogicalTypes.localTimestampMicros(); + } else { + throw new IllegalArgumentException( + "Avro does not support LOCAL TIMESTAMP type " + + "with precision: " + + precision + + ", it only supports precision less than 6."); + } + } + Schema timestamp = avroLogicalType.addToSchema(SchemaBuilder.builder().longType()); + return nullable ? nullableSchema(timestamp) : timestamp; + case TIMESTAMP_WITH_LOCAL_TIME_ZONE: + if (legacyTimestampMapping) { + throw new UnsupportedOperationException( + "Unsupported to derive Schema for type: " + logicalType); + } else { + final LocalZonedTimestampType localZonedTimestampType = + (LocalZonedTimestampType) logicalType; + precision = localZonedTimestampType.getPrecision(); + if (precision <= 3) { + avroLogicalType = LogicalTypes.timestampMillis(); + } else if (precision <= 6) { + avroLogicalType = LogicalTypes.timestampMicros(); + } else { + throw new IllegalArgumentException( + "Avro does not support TIMESTAMP type " + + "with precision: " + + precision + + ", it only supports precision less than 6."); + } + timestamp = avroLogicalType.addToSchema(SchemaBuilder.builder().longType()); + return nullable ? 
nullableSchema(timestamp) : timestamp; + } + case DATE: + // use int to represents Date + Schema date = LogicalTypes.date().addToSchema(SchemaBuilder.builder().intType()); + return nullable ? nullableSchema(date) : date; + case TIME_WITHOUT_TIME_ZONE: + precision = ((TimeType) logicalType).getPrecision(); + if (precision > 3) { + throw new IllegalArgumentException( + "Avro does not support TIME type with precision: " + + precision + + ", it only supports precision less than 3."); + } + // use int to represents Time, we only support millisecond when deserialization + Schema time = LogicalTypes.timeMillis().addToSchema(SchemaBuilder.builder().intType()); + return nullable ? nullableSchema(time) : time; + case DECIMAL: + DecimalType decimalType = (DecimalType) logicalType; + // store BigDecimal as byte[] + Schema decimal = + LogicalTypes.decimal(decimalType.getPrecision(), decimalType.getScale()) + .addToSchema(SchemaBuilder.builder().bytesType()); + return nullable ? nullableSchema(decimal) : decimal; + case ROW: + RowType rowType = (RowType) logicalType; + List fieldNames = rowType.getFieldNames(); + // we have to make sure the record name is different in a Schema + SchemaBuilder.FieldAssembler builder = + SchemaBuilder.builder().record(rowName).fields(); + for (int i = 0; i < rowType.getFieldCount(); i++) { + String fieldName = fieldNames.get(i); + LogicalType fieldType = rowType.getTypeAt(i); + SchemaBuilder.GenericDefault fieldBuilder = + builder + .name(fieldName) + .type( + convertToSchema( + fieldType, rowName + "_" + fieldName, legacyTimestampMapping)); + + if (fieldType.isNullable()) { + builder = fieldBuilder.withDefault(null); + } else { + builder = fieldBuilder.noDefault(); + } + } + Schema record = builder.endRecord(); + return nullable ? nullableSchema(record) : record; + case MULTISET: + case MAP: + Schema map = + SchemaBuilder.builder() + .map() + .values(convertToSchema(extractValueTypeToAvroMap(logicalType), rowName)); + return nullable ? 
nullableSchema(map) : map; + case ARRAY: + ArrayType arrayType = (ArrayType) logicalType; + Schema array = + SchemaBuilder.builder() + .array() + .items(convertToSchema(arrayType.getElementType(), rowName)); + return nullable ? nullableSchema(array) : array; + case RAW: + default: + throw new UnsupportedOperationException( + "Unsupported to derive Schema for type: " + logicalType); + } + } + + public static LogicalType extractValueTypeToAvroMap(LogicalType type) { + LogicalType keyType; + LogicalType valueType; + if (type instanceof MapType) { + MapType mapType = (MapType) type; + keyType = mapType.getKeyType(); + valueType = mapType.getValueType(); + } else { + MultisetType multisetType = (MultisetType) type; + keyType = multisetType.getElementType(); + valueType = new IntType(); + } + if (!keyType.is(LogicalTypeFamily.CHARACTER_STRING)) { + throw new UnsupportedOperationException( + "Avro format doesn't support non-string as key type of map. " + + "The key type is: " + + keyType.asSummaryString()); + } + return valueType; + } + + /** Returns schema with nullable true. */ + private static Schema nullableSchema(Schema schema) { + return schema.isNullable() + ? 
schema + : Schema.createUnion(SchemaBuilder.builder().nullType(), schema); + } +} diff --git a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/AvroGenericRecordToRowDataMapper.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/AvroGenericRecordToRowDataMapper.java index f7e8e0c884cf..5f3494330cfc 100644 --- a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/AvroGenericRecordToRowDataMapper.java +++ b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/sink/AvroGenericRecordToRowDataMapper.java @@ -21,14 +21,14 @@ import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; import org.apache.flink.api.common.functions.MapFunction; -import org.apache.flink.formats.avro.AvroToRowDataConverters; -import org.apache.flink.formats.avro.typeutils.AvroSchemaConverter; import org.apache.flink.table.data.RowData; import org.apache.flink.table.types.DataType; import org.apache.flink.table.types.logical.LogicalType; import org.apache.flink.table.types.logical.RowType; import org.apache.flink.table.types.utils.TypeConversions; import org.apache.iceberg.avro.AvroSchemaUtil; +import org.apache.iceberg.flink.formats.avro.AvroToRowDataConverters; +import org.apache.iceberg.flink.formats.avro.typeutils.AvroSchemaConverter; /** * This util class converts Avro GenericRecord to Flink RowData.
      diff --git a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/RowDataToAvroGenericRecordConverter.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/RowDataToAvroGenericRecordConverter.java index 8ef1f1fbb833..d74b8b9d620f 100644 --- a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/RowDataToAvroGenericRecordConverter.java +++ b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/RowDataToAvroGenericRecordConverter.java @@ -23,8 +23,6 @@ import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; import org.apache.flink.annotation.Internal; -import org.apache.flink.formats.avro.RowDataToAvroConverters; -import org.apache.flink.formats.avro.typeutils.AvroSchemaConverter; import org.apache.flink.table.data.RowData; import org.apache.flink.table.types.DataType; import org.apache.flink.table.types.logical.LogicalType; @@ -32,6 +30,8 @@ import org.apache.flink.table.types.utils.TypeConversions; import org.apache.iceberg.avro.AvroSchemaUtil; import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.formats.avro.RowDataToAvroConverters; +import org.apache.iceberg.flink.formats.avro.typeutils.AvroSchemaConverter; /** * This is not serializable because Avro {@link Schema} is not actually serializable, even though it diff --git a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/reader/AvroGenericRecordConverter.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/reader/AvroGenericRecordConverter.java index b158b0871a53..cfef780a4daa 100644 --- a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/reader/AvroGenericRecordConverter.java +++ b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/source/reader/AvroGenericRecordConverter.java @@ -21,8 +21,6 @@ import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; import org.apache.flink.api.common.typeinfo.TypeInformation; -import 
org.apache.flink.formats.avro.RowDataToAvroConverters; -import org.apache.flink.formats.avro.typeutils.AvroSchemaConverter; import org.apache.flink.formats.avro.typeutils.GenericRecordAvroTypeInfo; import org.apache.flink.table.data.RowData; import org.apache.flink.table.types.DataType; @@ -31,6 +29,8 @@ import org.apache.flink.table.types.utils.TypeConversions; import org.apache.iceberg.avro.AvroSchemaUtil; import org.apache.iceberg.flink.FlinkSchemaUtil; +import org.apache.iceberg.flink.formats.avro.RowDataToAvroConverters; +import org.apache.iceberg.flink.formats.avro.typeutils.AvroSchemaConverter; public class AvroGenericRecordConverter implements RowDataConverter { private final Schema avroSchema; diff --git a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/DataGenerators.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/DataGenerators.java index e2cd411d7069..795c4fa5a766 100644 --- a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/DataGenerators.java +++ b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/DataGenerators.java @@ -75,6 +75,11 @@ public static class Primitives implements DataGenerator { OffsetDateTime.of(2022, 1, 10, 0, 0, 0, 0, ZoneOffset.UTC); private static final LocalDateTime JAVA_LOCAL_DATE_TIME_20220110 = LocalDateTime.of(2022, 1, 10, 0, 0, 0); + private static final OffsetDateTime JAVA_OFFSET_DATE_TIME_MAX_NANO = + OffsetDateTime.of(2262, 4, 11, 23, 47, 16, 854_775_807, ZoneOffset.UTC); + private static final LocalDateTime JAVA_LOCAL_DATE_TIME_MAX_NANO = + LocalDateTime.of(2262, 4, 11, 23, 47, 16, 854_775_807); + private static final long ICEBERG_MAX_NANOS_EPOCH = 9223372036854775807L; private static final BigDecimal BIG_DECIMAL_NEGATIVE = new BigDecimal("-1.50"); private static final byte[] FIXED_BYTES = "012345689012345".getBytes(StandardCharsets.UTF_8); @@ -96,7 +101,11 @@ public static class Primitives implements DataGenerator { Types.NestedField.required(12, "uuid_field", Types.UUIDType.get()), 
Types.NestedField.required(13, "binary_field", Types.BinaryType.get()), Types.NestedField.required(14, "decimal_field", Types.DecimalType.of(9, 2)), - Types.NestedField.required(15, "fixed_field", Types.FixedType.ofLength(16))); + Types.NestedField.required(15, "fixed_field", Types.FixedType.ofLength(16)), + Types.NestedField.required( + 16, "ts_ns_with_zone_field", Types.TimestampNanoType.withZone()), + Types.NestedField.required( + 17, "ts_ns_without_zone_field", Types.TimestampNanoType.withoutZone())); private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); @@ -171,6 +180,8 @@ public GenericRecord generateIcebergGenericRecord() { genericRecord.setField("time_field", JAVA_LOCAL_TIME_HOUR8); genericRecord.setField("ts_with_zone_field", JAVA_OFFSET_DATE_TIME_20220110); genericRecord.setField("ts_without_zone_field", JAVA_LOCAL_DATE_TIME_20220110); + genericRecord.setField("ts_ns_with_zone_field", JAVA_OFFSET_DATE_TIME_MAX_NANO); + genericRecord.setField("ts_ns_without_zone_field", JAVA_LOCAL_DATE_TIME_MAX_NANO); byte[] uuidBytes = new byte[16]; for (int i = 0; i < 16; ++i) { @@ -220,7 +231,11 @@ public GenericRowData generateFlinkRowData() { uuidBytes, binaryBytes, DecimalData.fromBigDecimal(BIG_DECIMAL_NEGATIVE, 9, 2), - FIXED_BYTES); + FIXED_BYTES, + TimestampData.fromEpochMillis( + ICEBERG_MAX_NANOS_EPOCH / 1_000_000, (int) (ICEBERG_MAX_NANOS_EPOCH % 1_000_000)), + TimestampData.fromEpochMillis( + ICEBERG_MAX_NANOS_EPOCH / 1_000_000, (int) (ICEBERG_MAX_NANOS_EPOCH % 1_000_000))); } @Override @@ -236,10 +251,12 @@ public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { genericRecord.put("date_field", DAYS_BTW_EPOC_AND_20220110); genericRecord.put("time_field", HOUR_8_IN_MILLI); - // Although Avro logical type for timestamp fields are in micro seconds, - // AvroToRowDataConverters only looks for long value in milliseconds. 
- genericRecord.put("ts_with_zone_field", JODA_DATETIME_20220110.getMillis()); - genericRecord.put("ts_without_zone_field", JODA_DATETIME_20220110.getMillis()); + // Now that AvroToRowDataConverters correctly supports microseconds, + // we must inject correct microsecond scale values into the Avro data. + genericRecord.put("ts_with_zone_field", JODA_DATETIME_20220110.getMillis() * 1000L); + genericRecord.put("ts_without_zone_field", JODA_DATETIME_20220110.getMillis() * 1000L); + genericRecord.put("ts_ns_with_zone_field", ICEBERG_MAX_NANOS_EPOCH); + genericRecord.put("ts_ns_without_zone_field", ICEBERG_MAX_NANOS_EPOCH); byte[] uuidBytes = new byte[16]; for (int i = 0; i < 16; ++i) { @@ -554,7 +571,11 @@ public static class ArrayOfPrimitive implements DataGenerator { new Schema( Types.NestedField.required(1, "row_id", Types.StringType.get()), Types.NestedField.required( - 2, "array_of_int", Types.ListType.ofOptional(101, Types.IntegerType.get()))); + 2, "array_of_int", Types.ListType.ofOptional(101, Types.IntegerType.get())), + Types.NestedField.optional( + 3, + "array_of_ts_ns", + Types.ListType.ofRequired(102, Types.TimestampNanoType.withoutZone()))); private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); @@ -581,13 +602,33 @@ public GenericRecord generateIcebergGenericRecord() { GenericRecord genericRecord = GenericRecord.create(icebergSchema); genericRecord.setField("row_id", "row_id_value"); genericRecord.setField("array_of_int", Arrays.asList(1, 2, 3)); + + LocalDateTime posNanos = LocalDateTime.of(2023, 1, 1, 12, 0, 0, 123456789); + LocalDateTime negNanos = LocalDateTime.of(1969, 12, 31, 23, 59, 59, 987654321); + genericRecord.setField("array_of_ts_ns", Arrays.asList(posNanos, negNanos)); return genericRecord; } @Override public GenericRowData generateFlinkRowData() { Integer[] arr = {1, 2, 3}; - return GenericRowData.of(StringData.fromString("row_id_value"), new GenericArrayData(arr)); + + long posNanos = + 
org.apache.iceberg.util.DateTimeUtil.nanosFromTimestamp( + LocalDateTime.of(2023, 1, 1, 12, 0, 0, 123456789)); + long negNanos = + org.apache.iceberg.util.DateTimeUtil.nanosFromTimestamp( + LocalDateTime.of(1969, 12, 31, 23, 59, 59, 987654321)); + TimestampData[] tsArr = { + TimestampData.fromEpochMillis( + Math.floorDiv(posNanos, 1_000_000L), (int) Math.floorMod(posNanos, 1_000_000L)), + TimestampData.fromEpochMillis( + Math.floorDiv(negNanos, 1_000_000L), (int) Math.floorMod(negNanos, 1_000_000L)) + }; + return GenericRowData.of( + StringData.fromString("row_id_value"), + new GenericArrayData(arr), + new GenericArrayData(tsArr)); } @Override @@ -595,6 +636,14 @@ public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { org.apache.avro.generic.GenericRecord genericRecord = new GenericData.Record(avroSchema); genericRecord.put("row_id", "row_id_value"); genericRecord.put("array_of_int", Arrays.asList(1, 2, 3)); + + long posNanos = + org.apache.iceberg.util.DateTimeUtil.nanosFromTimestamp( + LocalDateTime.of(2023, 1, 1, 12, 0, 0, 123456789)); + long negNanos = + org.apache.iceberg.util.DateTimeUtil.nanosFromTimestamp( + LocalDateTime.of(1969, 12, 31, 23, 59, 59, 987654321)); + genericRecord.put("array_of_ts_ns", Arrays.asList(posNanos, negNanos)); return genericRecord; } } @@ -808,7 +857,12 @@ public static class MapOfPrimitives implements DataGenerator { 2, "map_of_primitives", Types.MapType.ofRequired( - 101, 102, Types.StringType.get(), Types.IntegerType.get()))); + 101, 102, Types.StringType.get(), Types.IntegerType.get())), + Types.NestedField.optional( + 3, + "map_of_ts_ns", + Types.MapType.ofRequired( + 103, 104, Types.StringType.get(), Types.TimestampNanoType.withoutZone()))); private final RowType flinkRowType = FlinkSchemaUtil.convert(icebergSchema); @@ -835,15 +889,37 @@ public GenericRecord generateIcebergGenericRecord() { GenericRecord genericRecord = GenericRecord.create(icebergSchema); genericRecord.setField("row_id", "row_id_value"); 
genericRecord.setField("map_of_primitives", ImmutableMap.of("Jane", 1, "Joe", 2)); + + LocalDateTime posNanos = LocalDateTime.of(2023, 1, 1, 12, 0, 0, 123456789); + LocalDateTime negNanos = LocalDateTime.of(1969, 12, 31, 23, 59, 59, 987654321); + genericRecord.setField( + "map_of_ts_ns", ImmutableMap.of("positive", posNanos, "negative", negNanos)); return genericRecord; } @Override public GenericRowData generateFlinkRowData() { + long posNanos = + org.apache.iceberg.util.DateTimeUtil.nanosFromTimestamp( + LocalDateTime.of(2023, 1, 1, 12, 0, 0, 123456789)); + long negNanos = + org.apache.iceberg.util.DateTimeUtil.nanosFromTimestamp( + LocalDateTime.of(1969, 12, 31, 23, 59, 59, 987654321)); + return GenericRowData.of( StringData.fromString("row_id_value"), new GenericMapData( - ImmutableMap.of(StringData.fromString("Jane"), 1, StringData.fromString("Joe"), 2))); + ImmutableMap.of(StringData.fromString("Jane"), 1, StringData.fromString("Joe"), 2)), + new GenericMapData( + ImmutableMap.of( + StringData.fromString("positive"), + TimestampData.fromEpochMillis( + Math.floorDiv(posNanos, 1_000_000L), + (int) Math.floorMod(posNanos, 1_000_000L)), + StringData.fromString("negative"), + TimestampData.fromEpochMillis( + Math.floorDiv(negNanos, 1_000_000L), + (int) Math.floorMod(negNanos, 1_000_000L))))); } @Override @@ -851,6 +927,15 @@ public org.apache.avro.generic.GenericRecord generateAvroGenericRecord() { org.apache.avro.generic.GenericRecord genericRecord = new GenericData.Record(avroSchema); genericRecord.put("row_id", "row_id_value"); genericRecord.put("map_of_primitives", ImmutableMap.of("Jane", 1, "Joe", 2)); + + long posNanos = + org.apache.iceberg.util.DateTimeUtil.nanosFromTimestamp( + LocalDateTime.of(2023, 1, 1, 12, 0, 0, 123456789)); + long negNanos = + org.apache.iceberg.util.DateTimeUtil.nanosFromTimestamp( + LocalDateTime.of(1969, 12, 31, 23, 59, 59, 987654321)); + genericRecord.put( + "map_of_ts_ns", ImmutableMap.of("positive", posNanos, "negative", 
negNanos)); return genericRecord; } } diff --git a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/TestRowDataWrapper.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/TestRowDataWrapper.java index cd6964b5ed0f..0e7635a33e87 100644 --- a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/TestRowDataWrapper.java +++ b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/TestRowDataWrapper.java @@ -30,7 +30,6 @@ import org.apache.iceberg.data.Record; import org.apache.iceberg.flink.data.RandomRowData; import org.apache.iceberg.util.StructLikeWrapper; -import org.junit.jupiter.api.Disabled; public class TestRowDataWrapper extends RecordWrapperTestBase { @@ -60,18 +59,6 @@ public void testTime() { }); } - @Disabled - @Override - public void testTimestampNanoWithoutZone() { - // Flink does not support nanosecond timestamp without zone. - } - - @Disabled - @Override - public void testTimestampNanoWithZone() { - // Flink does not support nanosecond timestamp with zone. 
- } - @Override protected void generateAndValidate( Schema schema, RecordWrapperTestBase.AssertMethod assertMethod) { diff --git a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java index 4a70802f2a2e..b7b0a54156cc 100644 --- a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java +++ b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/data/TestFlinkOrcReaderWriter.java @@ -49,6 +49,11 @@ protected boolean allowsWritingNullValuesForRequiredFields() { return true; } + @Override + protected boolean supportsTimestampNanos() { + return true; + } + @Override protected void writeAndValidate(Schema schema) throws IOException { List expectedRecords = RandomGenericData.generate(schema, NUM_RECORDS, 1990L); diff --git a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/data/TestRowDataProjection.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/data/TestRowDataProjection.java index 4e5b38ffb026..a2411da1e344 100644 --- a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/data/TestRowDataProjection.java +++ b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/data/TestRowDataProjection.java @@ -271,18 +271,19 @@ public void testMapOfPrimitivesProjection() { GenericRowData.of( StringData.fromString("row_id_value"), new GenericMapData( - ImmutableMap.of(StringData.fromString("foo"), 1, StringData.fromString("bar"), 2))); + ImmutableMap.of(StringData.fromString("foo"), 1, StringData.fromString("bar"), 2)), + null); testEqualsAndHashCode(schema, idOnly, rowData, copyRowData, otherRowData, true); testEqualsAndHashCode(schema, mapOnly, rowData, copyRowData, otherRowData); testEqualsAndHashCode(schema, schema, rowData, copyRowData, otherRowData); GenericRowData rowDataNullOptionalFields = - GenericRowData.of(StringData.fromString("row_id_value"), null); + 
GenericRowData.of(StringData.fromString("row_id_value"), null, null); GenericRowData copyRowDataNullOptionalFields = - GenericRowData.of(StringData.fromString("row_id_value"), null); + GenericRowData.of(StringData.fromString("row_id_value"), null, null); // modify the map field value GenericRowData otherRowDataNullOptionalFields = - GenericRowData.of(StringData.fromString("other_row_id_value"), null); + GenericRowData.of(StringData.fromString("other_row_id_value"), null, null); testEqualsAndHashCode( schema, idOnly, @@ -432,7 +433,8 @@ public void testArrayOfPrimitiveProjection() { GenericRowData otherRowData = GenericRowData.of( StringData.fromString("other_row_id_value"), - new GenericArrayData(new Integer[] {4, 5, 6})); + new GenericArrayData(new Integer[] {4, 5, 6}), + null); testEqualsAndHashCode(schema, idOnly, rowData, copyRowData, otherRowData); testEqualsAndHashCode(schema, arrayOnly, rowData, copyRowData, otherRowData); testEqualsAndHashCode(schema, schema, rowData, copyRowData, otherRowData); @@ -440,16 +442,19 @@ public void testArrayOfPrimitiveProjection() { GenericRowData rowDataNullOptionalFields = GenericRowData.of( StringData.fromString("row_id_value"), - new GenericArrayData(new Integer[] {1, null, 3})); + new GenericArrayData(new Integer[] {1, null, 3}), + null); GenericRowData copyRowDataNullOptionalFields = GenericRowData.of( StringData.fromString("row_id_value"), - new GenericArrayData(new Integer[] {1, null, 3})); + new GenericArrayData(new Integer[] {1, null, 3}), + null); // modify the map field value GenericRowData otherRowDataNullOptionalFields = GenericRowData.of( StringData.fromString("other_row_id_value"), - new GenericArrayData(new Integer[] {4, null, 6})); + new GenericArrayData(new Integer[] {4, null, 6}), + null); testEqualsAndHashCode( schema, idOnly, From 17fc6da837442443421cfbac01ff2941a820ba20 Mon Sep 17 00:00:00 2001 From: Oguzhan Unlu Date: Thu, 7 May 2026 23:32:30 +0300 Subject: [PATCH 178/197] API, Core: Handle 404 from 
/v1/config for missing warehouses (#16059) * API, Core: Handle 404 from /v1/config for missing warehouses Add NoSuchWarehouseException and configErrorHandler that throws it on 404 responses with a valid error type, distinguishing missing warehouses from misconfigured URIs. Update RESTSessionCatalog to use the new handler for config calls. * move tests --- .../exceptions/NoSuchWarehouseException.java | 34 ++++++++++++++++++ .../apache/iceberg/rest/ErrorHandlers.java | 19 ++++++++++ .../iceberg/rest/RESTSessionCatalog.java | 2 +- .../iceberg/rest/TestErrorHandlers.java | 36 +++++++++++++++++++ 4 files changed, 90 insertions(+), 1 deletion(-) create mode 100644 api/src/main/java/org/apache/iceberg/exceptions/NoSuchWarehouseException.java diff --git a/api/src/main/java/org/apache/iceberg/exceptions/NoSuchWarehouseException.java b/api/src/main/java/org/apache/iceberg/exceptions/NoSuchWarehouseException.java new file mode 100644 index 000000000000..94ae50cd1c25 --- /dev/null +++ b/api/src/main/java/org/apache/iceberg/exceptions/NoSuchWarehouseException.java @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.exceptions; + +import com.google.errorprone.annotations.FormatMethod; + +/** Exception raised when attempting to load a warehouse that does not exist. */ +public class NoSuchWarehouseException extends RuntimeException { + @FormatMethod + public NoSuchWarehouseException(String message, Object... args) { + super(String.format(message, args)); + } + + @FormatMethod + public NoSuchWarehouseException(Throwable cause, String message, Object... args) { + super(String.format(message, args), cause); + } +} diff --git a/core/src/main/java/org/apache/iceberg/rest/ErrorHandlers.java b/core/src/main/java/org/apache/iceberg/rest/ErrorHandlers.java index 791eb732bb7c..334bfde8abfc 100644 --- a/core/src/main/java/org/apache/iceberg/rest/ErrorHandlers.java +++ b/core/src/main/java/org/apache/iceberg/rest/ErrorHandlers.java @@ -30,6 +30,7 @@ import org.apache.iceberg.exceptions.NoSuchPlanTaskException; import org.apache.iceberg.exceptions.NoSuchTableException; import org.apache.iceberg.exceptions.NoSuchViewException; +import org.apache.iceberg.exceptions.NoSuchWarehouseException; import org.apache.iceberg.exceptions.NotAuthorizedException; import org.apache.iceberg.exceptions.NotFoundException; import org.apache.iceberg.exceptions.RESTException; @@ -92,6 +93,10 @@ public static Consumer defaultErrorHandler() { return DefaultErrorHandler.INSTANCE; } + public static Consumer configErrorHandler() { + return ConfigErrorHandler.INSTANCE; + } + public static Consumer oauthErrorHandler() { return OAuthErrorHandler.INSTANCE; } @@ -295,6 +300,20 @@ public void accept(ErrorResponse error) { } } + /** Request error handler for config endpoint. 
*/ + private static class ConfigErrorHandler extends DefaultErrorHandler { + private static final ErrorHandler INSTANCE = new ConfigErrorHandler(); + + @Override + public void accept(ErrorResponse error) { + if (error.code() == 404 && error.type() != null) { + throw new NoSuchWarehouseException("%s", error.message()); + } + + super.accept(error); + } + } + /** * Request error handler that handles the common cases that are included with all responses, such * as 400, 500, etc. diff --git a/core/src/main/java/org/apache/iceberg/rest/RESTSessionCatalog.java b/core/src/main/java/org/apache/iceberg/rest/RESTSessionCatalog.java index c7b5b5d41c74..ec30d9de897e 100644 --- a/core/src/main/java/org/apache/iceberg/rest/RESTSessionCatalog.java +++ b/core/src/main/java/org/apache/iceberg/rest/RESTSessionCatalog.java @@ -1338,7 +1338,7 @@ private static ConfigResponse fetchConfig( queryParams.build(), ConfigResponse.class, RESTUtil.configHeaders(properties), - ErrorHandlers.defaultErrorHandler()); + ErrorHandlers.configErrorHandler()); configResponse.validate(); return configResponse; } diff --git a/core/src/test/java/org/apache/iceberg/rest/TestErrorHandlers.java b/core/src/test/java/org/apache/iceberg/rest/TestErrorHandlers.java index 8bf62c3c6cf5..b7bbe337cd27 100644 --- a/core/src/test/java/org/apache/iceberg/rest/TestErrorHandlers.java +++ b/core/src/test/java/org/apache/iceberg/rest/TestErrorHandlers.java @@ -20,7 +20,9 @@ import static org.assertj.core.api.Assertions.assertThatThrownBy; +import org.apache.iceberg.exceptions.NoSuchWarehouseException; import org.apache.iceberg.exceptions.RESTException; +import org.apache.iceberg.exceptions.ServiceFailureException; import org.apache.iceberg.rest.responses.ErrorResponse; import org.junit.jupiter.api.Test; @@ -68,4 +70,38 @@ public void errorHandlerWithCodeAndTypeOnly() { .isInstanceOf(RESTException.class) .hasMessage("Unable to process (code: 422, type: ValidationException): null"); } + + @Test + public void 
testConfigErrorHandler404ThrowsNoSuchWarehouseException() { + ErrorResponse error = + ErrorResponse.builder() + .responseCode(404) + .withType("NotFoundException") + .withMessage("Warehouse not found") + .build(); + + assertThatThrownBy(() -> ErrorHandlers.configErrorHandler().accept(error)) + .isInstanceOf(NoSuchWarehouseException.class) + .hasMessage("Warehouse not found"); + } + + @Test + public void testConfigErrorHandler404ForMisconfiguredUri() { + ErrorResponse error = + ErrorResponse.builder().responseCode(404).withMessage("Not Found").build(); + + assertThatThrownBy(() -> ErrorHandlers.configErrorHandler().accept(error)) + .isInstanceOf(RESTException.class) + .hasMessageContaining("Not Found"); + } + + @Test + public void testConfigErrorHandlerDelegatesToDefaultForNon404() { + ErrorResponse error = + ErrorResponse.builder().responseCode(500).withMessage("Internal server error").build(); + + assertThatThrownBy(() -> ErrorHandlers.configErrorHandler().accept(error)) + .isInstanceOf(ServiceFailureException.class) + .hasMessageContaining("Internal server error"); + } } From 57b1211b7477f9f4a5e79a4cf6f6505d63ded4e8 Mon Sep 17 00:00:00 2001 From: Steven Zhen Wu Date: Thu, 7 May 2026 16:42:57 -0700 Subject: [PATCH 179/197] Spark: backport PR #15512 to v3.4, v3.5, v4.0 for WAP branch delete fix (#16245) * Spark: backport PR #15512 to v3.4, v3.5, v4.0 for WAP branch delete fix When WAP is enabled via spark.wap.branch, canDeleteWhere() previously scanned the main branch while deleteWhere() committed to the WAP branch. This could cause canDeleteWhere() to incorrectly approve a metadata-only delete based on data that was never on the WAP branch, surfacing as "Cannot delete file where some, but not all, rows match filter" at commit time. Resolve the scan branch the same way deleteWhere resolves the write branch (with a fall-back to main when the WAP branch has not been created yet), and pass it through canDeleteUsingMetadata. 
Co-Authored-By: Claude Opus 4.7 (1M context) * Spark: add blank lines after if blocks in scanBranchForDelete (style) Iceberg style requires an empty line between a control flow block and the following statement. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../iceberg/spark/extensions/TestDelete.java | 56 +++++++++++++++++++ .../iceberg/spark/source/SparkTable.java | 31 ++++++++-- .../iceberg/spark/extensions/TestDelete.java | 56 +++++++++++++++++++ .../iceberg/spark/source/SparkTable.java | 31 ++++++++-- .../iceberg/spark/extensions/TestDelete.java | 56 +++++++++++++++++++ .../iceberg/spark/source/SparkTable.java | 31 ++++++++-- 6 files changed, 246 insertions(+), 15 deletions(-) diff --git a/spark/v3.4/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestDelete.java b/spark/v3.4/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestDelete.java index 1dd6db48f7d8..b106e8fc38f3 100644 --- a/spark/v3.4/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestDelete.java +++ b/spark/v3.4/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestDelete.java @@ -1395,6 +1395,62 @@ public void testDeleteToCustomWapBranchWithoutWhereClause() throws NoSuchTableEx }); } + @TestTemplate + public void testDeleteToWapBranchCanDeleteWhereScansWapBranch() throws NoSuchTableException { + assumeThat(branch).as("WAP branch only works for table identifier without branch").isNull(); + + createAndInitPartitionedTable(); + sql( + "ALTER TABLE %s SET TBLPROPERTIES ('%s' = 'true')", + tableName, TableProperties.WRITE_AUDIT_PUBLISH_ENABLED); + + append(tableName, new Employee(1, "hr")); + + spark.conf().set(SparkSQLProperties.WAP_BRANCH, "wap"); + try { + append(tableName, new Employee(0, "hr"), new Employee(1, "hr"), new Employee(2, "hr")); + + sql("DELETE FROM %s WHERE id = 1", tableName); + + assertThat(sql("SELECT id, dep FROM %s.branch_wap ORDER BY id", tableName)) + .as("DELETE should remove the matching rows from 
the WAP branch") + .containsExactly(row(0, "hr"), row(2, "hr")); + assertThat(sql("SELECT id, dep FROM %s.branch_main", tableName)) + .as("Main branch must not be modified by a WAP-targeted DELETE") + .containsExactly(row(1, "hr")); + } finally { + spark.conf().unset(SparkSQLProperties.WAP_BRANCH); + } + } + + @TestTemplate + public void testMetadataDeleteToWapBranchCommitsToWapBranch() throws NoSuchTableException { + assumeThat(branch).as("WAP branch only works for table identifier without branch").isNull(); + + createAndInitPartitionedTable(); + sql( + "ALTER TABLE %s SET TBLPROPERTIES ('%s' = 'true')", + tableName, TableProperties.WRITE_AUDIT_PUBLISH_ENABLED); + + append(tableName, new Employee(1, "hr"), new Employee(5, "eng")); + + spark.conf().set(SparkSQLProperties.WAP_BRANCH, "wap"); + try { + append(tableName, new Employee(0, "hr"), new Employee(2, "eng")); + + sql("DELETE FROM %s WHERE dep = 'hr'", tableName); + + assertThat(sql("SELECT id, dep FROM %s.branch_wap ORDER BY id", tableName)) + .as("Metadata delete should remove the hr partition on the WAP branch") + .containsExactly(row(2, "eng"), row(5, "eng")); + assertThat(sql("SELECT id, dep FROM %s.branch_main ORDER BY id", tableName)) + .as("Metadata delete must not commit to main when WAP is set") + .containsExactly(row(1, "hr"), row(5, "eng")); + } finally { + spark.conf().unset(SparkSQLProperties.WAP_BRANCH); + } + } + @TestTemplate public void testDeleteWithFilterOnNestedColumn() { createAndInitNestedColumnsTable(); diff --git a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/source/SparkTable.java b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/source/SparkTable.java index 871ef9355200..1348afff6475 100644 --- a/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/source/SparkTable.java +++ b/spark/v3.4/spark/src/main/java/org/apache/iceberg/spark/source/SparkTable.java @@ -57,6 +57,7 @@ import org.apache.iceberg.spark.CommitMetadata; import org.apache.iceberg.spark.Spark3Util; 
import org.apache.iceberg.spark.SparkReadOptions; +import org.apache.iceberg.spark.SparkSQLProperties; import org.apache.iceberg.spark.SparkSchemaUtil; import org.apache.iceberg.spark.SparkTableUtil; import org.apache.iceberg.spark.SparkUtil; @@ -334,11 +335,31 @@ public boolean canDeleteWhere(Predicate[] predicates) { } } - return canDeleteUsingMetadata(deleteExpr); + return canDeleteUsingMetadata(deleteExpr, scanBranchForDelete()); + } + + // Resolves the branch to scan during canDeleteWhere so it matches the branch deleteWhere + // will commit to. Falls back to main when WAP is configured but the WAP branch does not + // exist yet, since this is a read scan. + private String scanBranchForDelete() { + if (branch != null) { + return branch; + } + + if (!SparkTableUtil.wapEnabled(table())) { + return null; + } + + String wapBranch = sparkSession().conf().get(SparkSQLProperties.WAP_BRANCH, null); + if (wapBranch != null && table().refs().containsKey(wapBranch)) { + return wapBranch; + } + + return null; } // a metadata delete is possible iff matching files can be deleted entirely - private boolean canDeleteUsingMetadata(Expression deleteExpr) { + private boolean canDeleteUsingMetadata(Expression deleteExpr, String scanBranch) { boolean caseSensitive = SparkUtil.caseSensitive(sparkSession()); if (ExpressionUtil.selectsPartitions(deleteExpr, table(), caseSensitive)) { @@ -353,14 +374,14 @@ private boolean canDeleteUsingMetadata(Expression deleteExpr) { .includeColumnStats() .ignoreResiduals(); - if (branch != null) { - scan = scan.useRef(branch); + if (scanBranch != null) { + scan = scan.useRef(scanBranch); } try (CloseableIterable tasks = scan.planFiles()) { Map evaluators = Maps.newHashMap(); StrictMetricsEvaluator metricsEvaluator = - new StrictMetricsEvaluator(SnapshotUtil.schemaFor(table(), branch), deleteExpr); + new StrictMetricsEvaluator(SnapshotUtil.schemaFor(table(), scanBranch), deleteExpr); return Iterables.all( tasks, diff --git 
a/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestDelete.java b/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestDelete.java index fbf6ce3559a7..79d6bea12f67 100644 --- a/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestDelete.java +++ b/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestDelete.java @@ -1422,6 +1422,62 @@ public void testDeleteToCustomWapBranchWithoutWhereClause() throws NoSuchTableEx }); } + @TestTemplate + public void testDeleteToWapBranchCanDeleteWhereScansWapBranch() throws NoSuchTableException { + assumeThat(branch).as("WAP branch only works for table identifier without branch").isNull(); + + createAndInitPartitionedTable(); + sql( + "ALTER TABLE %s SET TBLPROPERTIES ('%s' = 'true')", + tableName, TableProperties.WRITE_AUDIT_PUBLISH_ENABLED); + + append(tableName, new Employee(1, "hr")); + + spark.conf().set(SparkSQLProperties.WAP_BRANCH, "wap"); + try { + append(tableName, new Employee(0, "hr"), new Employee(1, "hr"), new Employee(2, "hr")); + + sql("DELETE FROM %s WHERE id = 1", tableName); + + assertThat(sql("SELECT id, dep FROM %s.branch_wap ORDER BY id", tableName)) + .as("DELETE should remove the matching rows from the WAP branch") + .containsExactly(row(0, "hr"), row(2, "hr")); + assertThat(sql("SELECT id, dep FROM %s.branch_main", tableName)) + .as("Main branch must not be modified by a WAP-targeted DELETE") + .containsExactly(row(1, "hr")); + } finally { + spark.conf().unset(SparkSQLProperties.WAP_BRANCH); + } + } + + @TestTemplate + public void testMetadataDeleteToWapBranchCommitsToWapBranch() throws NoSuchTableException { + assumeThat(branch).as("WAP branch only works for table identifier without branch").isNull(); + + createAndInitPartitionedTable(); + sql( + "ALTER TABLE %s SET TBLPROPERTIES ('%s' = 'true')", + tableName, TableProperties.WRITE_AUDIT_PUBLISH_ENABLED); + + append(tableName, new 
Employee(1, "hr"), new Employee(5, "eng")); + + spark.conf().set(SparkSQLProperties.WAP_BRANCH, "wap"); + try { + append(tableName, new Employee(0, "hr"), new Employee(2, "eng")); + + sql("DELETE FROM %s WHERE dep = 'hr'", tableName); + + assertThat(sql("SELECT id, dep FROM %s.branch_wap ORDER BY id", tableName)) + .as("Metadata delete should remove the hr partition on the WAP branch") + .containsExactly(row(2, "eng"), row(5, "eng")); + assertThat(sql("SELECT id, dep FROM %s.branch_main ORDER BY id", tableName)) + .as("Metadata delete must not commit to main when WAP is set") + .containsExactly(row(1, "hr"), row(5, "eng")); + } finally { + spark.conf().unset(SparkSQLProperties.WAP_BRANCH); + } + } + @TestTemplate public void testDeleteWithFilterOnNestedColumn() { createAndInitNestedColumnsTable(); diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkTable.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkTable.java index 871ef9355200..1348afff6475 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkTable.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/SparkTable.java @@ -57,6 +57,7 @@ import org.apache.iceberg.spark.CommitMetadata; import org.apache.iceberg.spark.Spark3Util; import org.apache.iceberg.spark.SparkReadOptions; +import org.apache.iceberg.spark.SparkSQLProperties; import org.apache.iceberg.spark.SparkSchemaUtil; import org.apache.iceberg.spark.SparkTableUtil; import org.apache.iceberg.spark.SparkUtil; @@ -334,11 +335,31 @@ public boolean canDeleteWhere(Predicate[] predicates) { } } - return canDeleteUsingMetadata(deleteExpr); + return canDeleteUsingMetadata(deleteExpr, scanBranchForDelete()); + } + + // Resolves the branch to scan during canDeleteWhere so it matches the branch deleteWhere + // will commit to. Falls back to main when WAP is configured but the WAP branch does not + // exist yet, since this is a read scan. 
+ private String scanBranchForDelete() { + if (branch != null) { + return branch; + } + + if (!SparkTableUtil.wapEnabled(table())) { + return null; + } + + String wapBranch = sparkSession().conf().get(SparkSQLProperties.WAP_BRANCH, null); + if (wapBranch != null && table().refs().containsKey(wapBranch)) { + return wapBranch; + } + + return null; } // a metadata delete is possible iff matching files can be deleted entirely - private boolean canDeleteUsingMetadata(Expression deleteExpr) { + private boolean canDeleteUsingMetadata(Expression deleteExpr, String scanBranch) { boolean caseSensitive = SparkUtil.caseSensitive(sparkSession()); if (ExpressionUtil.selectsPartitions(deleteExpr, table(), caseSensitive)) { @@ -353,14 +374,14 @@ private boolean canDeleteUsingMetadata(Expression deleteExpr) { .includeColumnStats() .ignoreResiduals(); - if (branch != null) { - scan = scan.useRef(branch); + if (scanBranch != null) { + scan = scan.useRef(scanBranch); } try (CloseableIterable tasks = scan.planFiles()) { Map evaluators = Maps.newHashMap(); StrictMetricsEvaluator metricsEvaluator = - new StrictMetricsEvaluator(SnapshotUtil.schemaFor(table(), branch), deleteExpr); + new StrictMetricsEvaluator(SnapshotUtil.schemaFor(table(), scanBranch), deleteExpr); return Iterables.all( tasks, diff --git a/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestDelete.java b/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestDelete.java index fbf6ce3559a7..79d6bea12f67 100644 --- a/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestDelete.java +++ b/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestDelete.java @@ -1422,6 +1422,62 @@ public void testDeleteToCustomWapBranchWithoutWhereClause() throws NoSuchTableEx }); } + @TestTemplate + public void testDeleteToWapBranchCanDeleteWhereScansWapBranch() throws NoSuchTableException { + assumeThat(branch).as("WAP branch only 
works for table identifier without branch").isNull(); + + createAndInitPartitionedTable(); + sql( + "ALTER TABLE %s SET TBLPROPERTIES ('%s' = 'true')", + tableName, TableProperties.WRITE_AUDIT_PUBLISH_ENABLED); + + append(tableName, new Employee(1, "hr")); + + spark.conf().set(SparkSQLProperties.WAP_BRANCH, "wap"); + try { + append(tableName, new Employee(0, "hr"), new Employee(1, "hr"), new Employee(2, "hr")); + + sql("DELETE FROM %s WHERE id = 1", tableName); + + assertThat(sql("SELECT id, dep FROM %s.branch_wap ORDER BY id", tableName)) + .as("DELETE should remove the matching rows from the WAP branch") + .containsExactly(row(0, "hr"), row(2, "hr")); + assertThat(sql("SELECT id, dep FROM %s.branch_main", tableName)) + .as("Main branch must not be modified by a WAP-targeted DELETE") + .containsExactly(row(1, "hr")); + } finally { + spark.conf().unset(SparkSQLProperties.WAP_BRANCH); + } + } + + @TestTemplate + public void testMetadataDeleteToWapBranchCommitsToWapBranch() throws NoSuchTableException { + assumeThat(branch).as("WAP branch only works for table identifier without branch").isNull(); + + createAndInitPartitionedTable(); + sql( + "ALTER TABLE %s SET TBLPROPERTIES ('%s' = 'true')", + tableName, TableProperties.WRITE_AUDIT_PUBLISH_ENABLED); + + append(tableName, new Employee(1, "hr"), new Employee(5, "eng")); + + spark.conf().set(SparkSQLProperties.WAP_BRANCH, "wap"); + try { + append(tableName, new Employee(0, "hr"), new Employee(2, "eng")); + + sql("DELETE FROM %s WHERE dep = 'hr'", tableName); + + assertThat(sql("SELECT id, dep FROM %s.branch_wap ORDER BY id", tableName)) + .as("Metadata delete should remove the hr partition on the WAP branch") + .containsExactly(row(2, "eng"), row(5, "eng")); + assertThat(sql("SELECT id, dep FROM %s.branch_main ORDER BY id", tableName)) + .as("Metadata delete must not commit to main when WAP is set") + .containsExactly(row(1, "hr"), row(5, "eng")); + } finally { + spark.conf().unset(SparkSQLProperties.WAP_BRANCH); + } + 
} + @TestTemplate public void testDeleteWithFilterOnNestedColumn() { createAndInitNestedColumnsTable(); diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkTable.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkTable.java index 6f0f992f1c20..9e3c9a7e69e6 100644 --- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkTable.java +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkTable.java @@ -58,6 +58,7 @@ import org.apache.iceberg.spark.CommitMetadata; import org.apache.iceberg.spark.Spark3Util; import org.apache.iceberg.spark.SparkReadOptions; +import org.apache.iceberg.spark.SparkSQLProperties; import org.apache.iceberg.spark.SparkSchemaUtil; import org.apache.iceberg.spark.SparkTableUtil; import org.apache.iceberg.spark.SparkUtil; @@ -376,11 +377,31 @@ public boolean canDeleteWhere(Predicate[] predicates) { } } - return canDeleteUsingMetadata(deleteExpr); + return canDeleteUsingMetadata(deleteExpr, scanBranchForDelete()); + } + + // Resolves the branch to scan during canDeleteWhere so it matches the branch deleteWhere + // will commit to. Falls back to main when WAP is configured but the WAP branch does not + // exist yet, since this is a read scan. 
+ private String scanBranchForDelete() { + if (branch != null) { + return branch; + } + + if (!SparkTableUtil.wapEnabled(table())) { + return null; + } + + String wapBranch = sparkSession().conf().get(SparkSQLProperties.WAP_BRANCH, null); + if (wapBranch != null && table().refs().containsKey(wapBranch)) { + return wapBranch; + } + + return null; } // a metadata delete is possible iff matching files can be deleted entirely - private boolean canDeleteUsingMetadata(Expression deleteExpr) { + private boolean canDeleteUsingMetadata(Expression deleteExpr, String scanBranch) { boolean caseSensitive = SparkUtil.caseSensitive(sparkSession()); if (ExpressionUtil.selectsPartitions(deleteExpr, table(), caseSensitive)) { @@ -395,14 +416,14 @@ private boolean canDeleteUsingMetadata(Expression deleteExpr) { .includeColumnStats() .ignoreResiduals(); - if (branch != null) { - scan = scan.useRef(branch); + if (scanBranch != null) { + scan = scan.useRef(scanBranch); } try (CloseableIterable tasks = scan.planFiles()) { Map evaluators = Maps.newHashMap(); StrictMetricsEvaluator metricsEvaluator = - new StrictMetricsEvaluator(SnapshotUtil.schemaFor(table(), branch), deleteExpr); + new StrictMetricsEvaluator(SnapshotUtil.schemaFor(table(), scanBranch), deleteExpr); return Iterables.all( tasks, From 77e7dbb245908b42277130eb3417d823b5932876 Mon Sep 17 00:00:00 2001 From: GuoYu <511955993@qq.com> Date: Fri, 8 May 2026 21:46:29 +0800 Subject: [PATCH 180/197] ORC: Add _row_id and _last_updated_sequence_number raeder in Orc to support lineage (#15776) --- .../iceberg/data/orc/TestGenericData.java | 11 +- .../iceberg/flink/data/FlinkOrcReader.java | 2 +- .../iceberg/flink/data/FlinkOrcReaders.java | 15 +- .../maintenance/api/TestRewriteDataFiles.java | 51 ++-- .../operator/OperatorTestBase.java | 35 ++- .../iceberg/data/orc/GenericOrcReader.java | 2 +- .../iceberg/data/orc/GenericOrcReaders.java | 30 +++ .../main/java/org/apache/iceberg/orc/ORC.java | 10 +- 
.../apache/iceberg/orc/OrcValueReaders.java | 218 +++++++++++++++++- .../TestRowLevelOperationsWithLineage.java | 12 + .../iceberg/spark/data/SparkOrcReader.java | 2 +- .../spark/data/SparkOrcValueReaders.java | 15 +- 12 files changed, 357 insertions(+), 46 deletions(-) diff --git a/data/src/test/java/org/apache/iceberg/data/orc/TestGenericData.java b/data/src/test/java/org/apache/iceberg/data/orc/TestGenericData.java index 3a7fec6962bb..cc318b2f53b1 100644 --- a/data/src/test/java/org/apache/iceberg/data/orc/TestGenericData.java +++ b/data/src/test/java/org/apache/iceberg/data/orc/TestGenericData.java @@ -68,6 +68,11 @@ protected boolean supportsUnknown() { return true; } + @Override + protected boolean supportsRowLineage() { + return true; + } + /** Orc writers don't have notion of non-null / required fields. */ @Override protected boolean allowsWritingNullValuesForRequiredFields() { @@ -250,13 +255,15 @@ private void writeAndValidateRecords(Schema schema, List expected) throw try (CloseableIterable reader = ORC.read(Files.localInput(testFile)) .project(schema) - .createReaderFunc(fileSchema -> GenericOrcReader.buildReader(schema, fileSchema)) + .createReaderFunc( + fileSchema -> GenericOrcReader.buildReader(schema, fileSchema, ID_TO_CONSTANT)) .build()) { rows = Lists.newArrayList(reader); } for (int i = 0; i < expected.size(); i += 1) { - DataTestHelpers.assertEquals(schema.asStruct(), expected.get(i), rows.get(i)); + DataTestHelpers.assertEquals( + schema.asStruct(), expected.get(i), rows.get(i), ID_TO_CONSTANT, i); } } } diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java index 3e3a29112cf4..77f16bfdb2ab 100644 --- a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java @@ -70,7 +70,7 @@ public OrcValueReader record( 
TypeDescription record, List names, List> fields) { - return FlinkOrcReaders.struct(fields, iStruct, idToConstant); + return FlinkOrcReaders.struct(record, fields, iStruct, idToConstant); } @Override diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java index 7a4a15c7e600..c5c958fbdb04 100644 --- a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java @@ -39,6 +39,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.types.Types; +import org.apache.orc.TypeDescription; import org.apache.orc.storage.ql.exec.vector.BytesColumnVector; import org.apache.orc.storage.ql.exec.vector.ColumnVector; import org.apache.orc.storage.ql.exec.vector.DecimalColumnVector; @@ -91,8 +92,11 @@ public static OrcValueReader map( } public static OrcValueReader struct( - List> readers, Types.StructType struct, Map idToConstant) { - return new StructReader(readers, struct, idToConstant); + TypeDescription record, + List> readers, + Types.StructType struct, + Map idToConstant) { + return new StructReader(record, readers, struct, idToConstant); } private static class StringReader implements OrcValueReader { @@ -265,8 +269,11 @@ private static class StructReader extends OrcValueReaders.StructReader private final int numFields; StructReader( - List> readers, Types.StructType struct, Map idToConstant) { - super(readers, struct, idToConstant); + TypeDescription record, + List> readers, + Types.StructType struct, + Map idToConstant) { + super(record, readers, struct, idToConstant); this.numFields = struct.fields().size(); } diff --git a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestRewriteDataFiles.java 
b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestRewriteDataFiles.java index 97b8b6786545..88b949a9a7f8 100644 --- a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestRewriteDataFiles.java +++ b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestRewriteDataFiles.java @@ -33,6 +33,7 @@ import java.util.List; import java.util.stream.StreamSupport; import org.apache.flink.streaming.api.graph.StreamGraphGenerator; +import org.apache.iceberg.FileFormat; import org.apache.iceberg.ManifestFiles; import org.apache.iceberg.MetadataColumns; import org.apache.iceberg.Schema; @@ -44,8 +45,14 @@ import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.FieldSource; class TestRewriteDataFiles extends MaintenanceTaskTestBase { + + private static final FileFormat[] FILE_FORMATS = + new FileFormat[] {FileFormat.AVRO, FileFormat.PARQUET, FileFormat.ORC}; + @Test void testRewriteUnpartitioned() throws Exception { Table table = createTable(); @@ -83,13 +90,14 @@ void testRewriteUnpartitioned() throws Exception { createRecord(4, "d"))); } - @Test - void testRewriteUnpartitionedPreserveLineage() throws Exception { - Table table = createTable(3); - insert(table, 1, "a"); - insert(table, 2, "b"); - insert(table, 3, "c"); - insert(table, 4, "d"); + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testRewriteUnpartitionedPreserveLineage(FileFormat fileFormat) throws Exception { + Table table = createTable(3, fileFormat); + insert(table, 1, "a", fileFormat); + insert(table, 2, "b", fileFormat); + insert(table, 3, "c", fileFormat); + insert(table, 4, "d", fileFormat); assertFileNum(table, 4, 0); @@ -123,15 +131,17 @@ void testRewriteUnpartitionedPreserveLineage() throws Exception { 
schema); } - @Test - void testRewriteTheSameFilePreserveLineage() throws Exception { - Table table = createTable(3); - insert(table, 1, "a"); - insert(table, 2, "b"); + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testRewriteTheSameFilePreserveLineage(FileFormat fileFormat) throws Exception { + Table table = createTable(3, fileFormat); + insert(table, 1, "a", fileFormat); + insert(table, 2, "b", fileFormat); // Create a file with two lines of data to verify that the rowid is read correctly. insert( table, - ImmutableList.of(SimpleDataUtil.createRecord(3, "c"), SimpleDataUtil.createRecord(4, "d"))); + ImmutableList.of(SimpleDataUtil.createRecord(3, "c"), SimpleDataUtil.createRecord(4, "d")), + fileFormat); assertFileNum(table, 3, 0); @@ -167,13 +177,14 @@ void testRewriteTheSameFilePreserveLineage() throws Exception { schema); } - @Test - void testRewritePartitionedPreserveLineage() throws Exception { - Table table = createPartitionedTable(3); - insertPartitioned(table, 1, "p1"); - insertPartitioned(table, 2, "p1"); - insertPartitioned(table, 3, "p2"); - insertPartitioned(table, 4, "p2"); + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testRewritePartitionedPreserveLineage(FileFormat fileFormat) throws Exception { + Table table = createPartitionedTable(3, fileFormat); + insertPartitioned(table, 1, "p1", fileFormat); + insertPartitioned(table, 2, "p1", fileFormat); + insertPartitioned(table, 3, "p2", fileFormat); + insertPartitioned(table, 4, "p2", fileFormat); assertFileNum(table, 4, 0); diff --git a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/OperatorTestBase.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/OperatorTestBase.java index 6dd6cda84f27..d6563e782e43 100644 --- a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/OperatorTestBase.java +++ b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/OperatorTestBase.java 
@@ -133,10 +133,14 @@ void after() throws IOException { } protected static Table createTable() { - return createTable(2); + return createTable(2, FileFormat.PARQUET); } protected static Table createTable(int formatVersion) { + return createPartitionedTable(formatVersion, FileFormat.PARQUET); + } + + protected static Table createTable(int formatVersion, FileFormat fileFormat) { return CATALOG_EXTENSION .catalog() .createTable( @@ -145,6 +149,8 @@ protected static Table createTable(int formatVersion) { PartitionSpec.unpartitioned(), null, ImmutableMap.of( + "write.format.default", + fileFormat.name(), TableProperties.FORMAT_VERSION, String.valueOf(formatVersion), "flink.max-continuous-empty-commits", @@ -182,7 +188,7 @@ protected static Table createTableWithDelete(int formatVersion) { "format-version", String.valueOf(formatVersion), "write.upsert.enabled", "true")); } - protected static Table createPartitionedTable(int formatVersion) { + protected static Table createPartitionedTable(int formatVersion, FileFormat fileFormat) { return CATALOG_EXTENSION .catalog() .createTable( @@ -191,6 +197,8 @@ protected static Table createPartitionedTable(int formatVersion) { PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).identity("data").build(), null, ImmutableMap.of( + "write.format.default", + fileFormat.name(), "format-version", String.valueOf(formatVersion), "flink.max-continuous-empty-commits", @@ -198,17 +206,27 @@ protected static Table createPartitionedTable(int formatVersion) { } protected static Table createPartitionedTable() { - return createPartitionedTable(2); + return createPartitionedTable(2, FileFormat.PARQUET); } protected void insert(Table table, Integer id, String data) throws IOException { - new GenericAppenderHelper(table, FileFormat.PARQUET, warehouseDir) + insert(table, id, data, FileFormat.PARQUET); + } + + protected void insert(Table table, Integer id, String data, FileFormat fileFormat) + throws IOException { + new GenericAppenderHelper(table, 
fileFormat, warehouseDir) .appendToTable(Lists.newArrayList(SimpleDataUtil.createRecord(id, data))); table.refresh(); } protected void insert(Table table, List records) throws IOException { - new GenericAppenderHelper(table, FileFormat.PARQUET, warehouseDir).appendToTable(records); + insert(table, records, FileFormat.PARQUET); + } + + protected void insert(Table table, List records, FileFormat fileFormat) + throws IOException { + new GenericAppenderHelper(table, fileFormat, warehouseDir).appendToTable(records); table.refresh(); } @@ -309,7 +327,12 @@ protected void update(Table table, Integer id, String oldData, String tempData, } protected void insertPartitioned(Table table, Integer id, String data) throws IOException { - new GenericAppenderHelper(table, FileFormat.PARQUET, warehouseDir) + insertPartitioned(table, id, data, FileFormat.PARQUET); + } + + protected void insertPartitioned(Table table, Integer id, String data, FileFormat fileFormat) + throws IOException { + new GenericAppenderHelper(table, fileFormat, warehouseDir) .appendToTable( TestHelpers.Row.of(data), Lists.newArrayList(SimpleDataUtil.createRecord(id, data))); table.refresh(); diff --git a/orc/src/main/java/org/apache/iceberg/data/orc/GenericOrcReader.java b/orc/src/main/java/org/apache/iceberg/data/orc/GenericOrcReader.java index 74c1303dfeda..3bd8bfbfd6b7 100644 --- a/orc/src/main/java/org/apache/iceberg/data/orc/GenericOrcReader.java +++ b/orc/src/main/java/org/apache/iceberg/data/orc/GenericOrcReader.java @@ -76,7 +76,7 @@ public OrcValueReader record( TypeDescription record, List names, List> fields) { - return GenericOrcReaders.struct(fields, expected, idToConstant); + return GenericOrcReaders.struct(record, fields, expected, idToConstant); } @Override diff --git a/orc/src/main/java/org/apache/iceberg/data/orc/GenericOrcReaders.java b/orc/src/main/java/org/apache/iceberg/data/orc/GenericOrcReaders.java index ba8cbbb749a7..faa62f770e4d 100644 --- 
a/orc/src/main/java/org/apache/iceberg/data/orc/GenericOrcReaders.java +++ b/orc/src/main/java/org/apache/iceberg/data/orc/GenericOrcReaders.java @@ -43,6 +43,7 @@ import org.apache.iceberg.variants.Variant; import org.apache.iceberg.variants.VariantMetadata; import org.apache.iceberg.variants.VariantValue; +import org.apache.orc.TypeDescription; import org.apache.orc.storage.ql.exec.vector.BytesColumnVector; import org.apache.orc.storage.ql.exec.vector.ColumnVector; import org.apache.orc.storage.ql.exec.vector.DecimalColumnVector; @@ -56,11 +57,25 @@ public class GenericOrcReaders { private GenericOrcReaders() {} + /** + * @deprecated Use {@link #struct(TypeDescription, List, Types.StructType, Map)} instead. This + * method uses position-based binding which may cause field misalignment in MOR and lineage + * scenarios. + */ + @Deprecated public static OrcValueReader struct( List> readers, Types.StructType struct, Map idToConstant) { return new StructReader(readers, struct, idToConstant); } + public static OrcValueReader struct( + TypeDescription orcType, + List> readers, + Types.StructType struct, + Map idToConstant) { + return new StructReader(orcType, readers, struct, idToConstant); + } + public static OrcValueReader> array(OrcValueReader elementReader) { return new ListReader(elementReader); } @@ -231,6 +246,12 @@ public Variant nonNullRead(ColumnVector vector, int row) { private static class StructReader extends OrcValueReaders.StructReader { private final GenericRecord template; + /** + * @deprecated Use {@link #StructReader(TypeDescription, List, Types.StructType, Map)} instead. + * This constructor uses position-based binding which may cause field misalignment in MOR + * and lineage scenarios. 
+ */ + @Deprecated protected StructReader( List> readers, Types.StructType structType, @@ -239,6 +260,15 @@ protected StructReader( this.template = GenericRecord.create(structType); } + protected StructReader( + TypeDescription orcType, + List> readers, + Types.StructType structType, + Map idToConstant) { + super(orcType, readers, structType, idToConstant); + this.template = GenericRecord.create(structType); + } + @Override protected Record create() { // GenericRecord.copy() is more performant then GenericRecord.create(StructType) since diff --git a/orc/src/main/java/org/apache/iceberg/orc/ORC.java b/orc/src/main/java/org/apache/iceberg/orc/ORC.java index 2c8fd6e436b2..9fb805246962 100644 --- a/orc/src/main/java/org/apache/iceberg/orc/ORC.java +++ b/orc/src/main/java/org/apache/iceberg/orc/ORC.java @@ -787,11 +787,17 @@ ReadBuilder constantFieldIds(Set newConstantFieldIds) { public CloseableIterable build() { Preconditions.checkNotNull(schema, "Schema is required"); + Set idsToExclude = + Sets.difference( + Sets.union(constantFieldIds, MetadataColumns.metadataFieldIds()), + ImmutableSet.of( + MetadataColumns.ROW_ID.fieldId(), + MetadataColumns.LAST_UPDATED_SEQUENCE_NUMBER.fieldId())); + return new OrcIterable<>( file, conf, - TypeUtil.selectNot( - schema, Sets.union(constantFieldIds, MetadataColumns.metadataFieldIds())), + TypeUtil.selectNot(schema, idsToExclude), nameMapping, start, length, diff --git a/orc/src/main/java/org/apache/iceberg/orc/OrcValueReaders.java b/orc/src/main/java/org/apache/iceberg/orc/OrcValueReaders.java index b6d40a3d7d00..c1fba3f15add 100644 --- a/orc/src/main/java/org/apache/iceberg/orc/OrcValueReaders.java +++ b/orc/src/main/java/org/apache/iceberg/orc/OrcValueReaders.java @@ -22,8 +22,11 @@ import java.util.List; import java.util.Map; import org.apache.iceberg.MetadataColumns; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; import 
org.apache.iceberg.types.Type; import org.apache.iceberg.types.Types; +import org.apache.orc.TypeDescription; import org.apache.orc.storage.ql.exec.vector.BytesColumnVector; import org.apache.orc.storage.ql.exec.vector.ColumnVector; import org.apache.orc.storage.ql.exec.vector.DoubleColumnVector; @@ -135,12 +138,27 @@ public byte[] nonNullRead(ColumnVector vector, int row) { public abstract static class StructReader implements OrcValueReader { private final OrcValueReader[] readers; private final boolean[] isConstantOrMetadataField; - + // Maps each projected struct field position to the matching child index in the ORC schema. + // This allows fields to be read by Iceberg field ID when the projected struct order differs + // from the file schema. + private final int[] orcFieldIndex; + + /** + * @param readers readers for each field + * @param struct struct type + * @param idToConstant constant values by field id + * @deprecated Use {@link #StructReader(TypeDescription, List, Types.StructType, Map)} instead. + * This constructor uses position-based binding which may cause field misalignment in MOR + * scenarios. This doesn't work lineage scenarios. 
+ */ + @Deprecated protected StructReader( List> readers, Types.StructType struct, Map idToConstant) { List fields = struct.fields(); this.readers = new OrcValueReader[fields.size()]; this.isConstantOrMetadataField = new boolean[fields.size()]; + this.orcFieldIndex = null; + for (int pos = 0, readerIndex = 0; pos < fields.size(); pos += 1) { Types.NestedField field = fields.get(pos); if (idToConstant.containsKey(field.fieldId())) { @@ -154,7 +172,6 @@ protected StructReader( this.readers[pos] = constants(false); } else if (MetadataColumns.isMetadataColumn(field.name()) || field.type().typeId() == Type.TypeID.UNKNOWN) { - // in case of any other metadata field, fill with nulls this.isConstantOrMetadataField[pos] = true; this.readers[pos] = constants(null); } else { @@ -163,6 +180,122 @@ protected StructReader( } } + protected StructReader( + TypeDescription orcType, + List> readers, + Types.StructType struct, + Map idToConstant) { + List fields = struct.fields(); + this.readers = new OrcValueReader[fields.size()]; + this.isConstantOrMetadataField = new boolean[fields.size()]; + this.orcFieldIndex = new int[fields.size()]; + + Map> readersById = readersByFieldId(orcType, readers); + Map fieldIdToOrcIndex = buildFieldIdToOrcIndex(orcType); + + for (int pos = 0; pos < fields.size(); pos += 1) { + Types.NestedField field = fields.get(pos); + OrcValueReader fileReader = readersById.get(field.fieldId()); + int orcIndex = fieldIdToOrcIndex.getOrDefault(field.fieldId(), -1); + + if (field.equals(MetadataColumns.ROW_ID)) { + handleRowIdField(pos, field, fileReader, idToConstant, orcIndex); + } else if (field.equals(MetadataColumns.LAST_UPDATED_SEQUENCE_NUMBER)) { + handleLastUpdatedSeqField(pos, field, fileReader, idToConstant, orcIndex); + } else if (idToConstant.containsKey(field.fieldId())) { + this.isConstantOrMetadataField[pos] = true; + this.readers[pos] = constants(idToConstant.get(field.fieldId())); + } else if (field.equals(MetadataColumns.ROW_POSITION)) { + 
this.isConstantOrMetadataField[pos] = true; + this.readers[pos] = new RowPositionReader(); + } else if (field.equals(MetadataColumns.IS_DELETED)) { + this.isConstantOrMetadataField[pos] = true; + this.readers[pos] = constants(false); + } else if (fileReader != null) { + this.isConstantOrMetadataField[pos] = false; + this.orcFieldIndex[pos] = fieldIdToOrcIndex.getOrDefault(field.fieldId(), -1); + this.readers[pos] = fileReader; + } else if (MetadataColumns.isMetadataColumn(field.name()) + || field.type().typeId() == Type.TypeID.UNKNOWN) { + this.isConstantOrMetadataField[pos] = true; + this.readers[pos] = constants(null); + } else { + throw new IllegalArgumentException( + String.format("Missing ORC reader for field %s (%s)", field.name(), field.fieldId())); + } + } + } + + private Map buildFieldIdToOrcIndex(TypeDescription orcType) { + List children = orcType.getChildren(); + Map mapping = Maps.newHashMap(); + for (int i = 0; i < children.size(); i++) { + mapping.put(ORCSchemaUtil.fieldId(children.get(i)), i); + } + + return mapping; + } + + private Map> readersByFieldId( + TypeDescription orcType, List> readerList) { + List children = orcType.getChildren(); + Preconditions.checkState( + children.size() == readerList.size(), + "Invalid ORC reader binding: children=%s readers=%s", + children.size(), + readerList.size()); + + Map> readersById = Maps.newHashMap(); + for (int i = 0; i < children.size(); i += 1) { + readersById.put(ORCSchemaUtil.fieldId(children.get(i)), readerList.get(i)); + } + + return readersById; + } + + @SuppressWarnings("unchecked") + private void handleRowIdField( + int pos, + Types.NestedField field, + OrcValueReader fileReader, + Map idToConstant, + int orcIndex) { + Long firstRowId = (Long) idToConstant.get(field.fieldId()); + if (firstRowId != null) { + OrcValueReader fileIdReader = (OrcValueReader) fileReader; + this.readers[pos] = new RowIdReader(firstRowId, fileIdReader); + this.isConstantOrMetadataField[pos] = fileIdReader == null; + if 
(fileIdReader != null) { + this.orcFieldIndex[pos] = orcIndex; + } + } else { + this.isConstantOrMetadataField[pos] = true; + this.readers[pos] = constants(null); + } + } + + @SuppressWarnings("unchecked") + private void handleLastUpdatedSeqField( + int pos, + Types.NestedField field, + OrcValueReader fileReader, + Map idToConstant, + int orcIndex) { + Long fileLastUpdated = (Long) idToConstant.get(field.fieldId()); + Long firstRowId = (Long) idToConstant.get(MetadataColumns.ROW_ID.fieldId()); + if (fileLastUpdated != null && firstRowId != null) { + OrcValueReader fileSeqReader = (OrcValueReader) fileReader; + this.readers[pos] = new LastUpdatedSeqReader(fileLastUpdated, fileSeqReader); + this.isConstantOrMetadataField[pos] = fileSeqReader == null; + if (fileSeqReader != null) { + this.orcFieldIndex[pos] = orcIndex; + } + } else { + this.isConstantOrMetadataField[pos] = true; + this.readers[pos] = constants(null); + } + } + protected abstract T create(); protected abstract void set(T struct, int pos, Object value); @@ -178,14 +311,17 @@ public T nonNullRead(ColumnVector vector, int row) { } private T readInternal(T struct, ColumnVector[] columnVectors, int row) { - for (int c = 0, vectorIndex = 0; c < readers.length; ++c) { + int vectorIndex = 0; + for (int c = 0; c < readers.length; ++c) { ColumnVector vector; if (isConstantOrMetadataField[c]) { vector = null; + } else if (orcFieldIndex != null) { + vector = columnVectors[orcFieldIndex[c]]; } else { - vector = columnVectors[vectorIndex]; - vectorIndex++; + vector = columnVectors[vectorIndex++]; } + set(struct, c, reader(c).read(vector, row)); } return struct; @@ -235,4 +371,76 @@ public void setBatchContext(long newBatchOffsetInFile) { this.batchOffsetInFile = newBatchOffsetInFile; } } + + private static class RowIdReader implements OrcValueReader { + private final long firstRowId; + private final OrcValueReader fileIdReader; + private final RowPositionReader posReader; + + RowIdReader(long firstRowId, 
OrcValueReader fileIdReader) { + this.firstRowId = firstRowId; + this.fileIdReader = fileIdReader; + this.posReader = new RowPositionReader(); + } + + @Override + public Long read(ColumnVector vector, int row) { + if (fileIdReader != null) { + Long idFromFile = fileIdReader.read(vector, row); + if (idFromFile != null) { + return idFromFile; + } + } + + long pos = posReader.read(null, row); + return firstRowId + pos; + } + + @Override + public Long nonNullRead(ColumnVector vector, int row) { + return read(vector, row); + } + + @Override + public void setBatchContext(long batchOffsetInFile) { + posReader.setBatchContext(batchOffsetInFile); + if (fileIdReader != null) { + fileIdReader.setBatchContext(batchOffsetInFile); + } + } + } + + private static class LastUpdatedSeqReader implements OrcValueReader { + private final long fileLastUpdated; + private final OrcValueReader fileSeqReader; + + LastUpdatedSeqReader(long fileLastUpdated, OrcValueReader fileSeqReader) { + this.fileLastUpdated = fileLastUpdated; + this.fileSeqReader = fileSeqReader; + } + + @Override + public Long read(ColumnVector vector, int row) { + if (fileSeqReader != null) { + Long seqFromFile = fileSeqReader.read(vector, row); + if (seqFromFile != null) { + return seqFromFile; + } + } + + return fileLastUpdated; + } + + @Override + public Long nonNullRead(ColumnVector vector, int row) { + return read(vector, row); + } + + @Override + public void setBatchContext(long batchOffsetInFile) { + if (fileSeqReader != null) { + fileSeqReader.setBatchContext(batchOffsetInFile); + } + } + } } diff --git a/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRowLevelOperationsWithLineage.java b/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRowLevelOperationsWithLineage.java index 77303685235d..f38178a8e883 100644 --- a/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRowLevelOperationsWithLineage.java +++ 
b/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestRowLevelOperationsWithLineage.java @@ -95,6 +95,18 @@ record -> + " fanout = {6}, branch = {7}, planningMode = {8}, formatVersion = {9}") public static Object[][] parameters() { return new Object[][] { + { + "testhadoop", + SparkCatalog.class.getName(), + ImmutableMap.of("type", "hadoop"), + FileFormat.ORC, + false, + WRITE_DISTRIBUTION_MODE_HASH, + true, + null, + LOCAL, + 3 + }, { "testhadoop", SparkCatalog.class.getName(), diff --git a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcReader.java b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcReader.java index c20be44f6735..c0d3d3efe026 100644 --- a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcReader.java +++ b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcReader.java @@ -77,7 +77,7 @@ public OrcValueReader record( TypeDescription record, List names, List> fields) { - return SparkOrcValueReaders.struct(fields, expected, idToConstant); + return SparkOrcValueReaders.struct(record, fields, expected, idToConstant); } @Override diff --git a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueReaders.java b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueReaders.java index 670537fbf872..67664ac6c753 100644 --- a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueReaders.java +++ b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcValueReaders.java @@ -28,6 +28,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.types.Types; import org.apache.iceberg.util.UUIDUtil; +import org.apache.orc.TypeDescription; import org.apache.orc.storage.ql.exec.vector.BytesColumnVector; import org.apache.orc.storage.ql.exec.vector.ColumnVector; import org.apache.orc.storage.ql.exec.vector.DecimalColumnVector; @@ -70,8 +71,11 @@ 
public static OrcValueReader decimals(int precision, int scale) { } static OrcValueReader struct( - List> readers, Types.StructType struct, Map idToConstant) { - return new StructReader(readers, struct, idToConstant); + TypeDescription record, + List> readers, + Types.StructType struct, + Map idToConstant) { + return new StructReader(record, readers, struct, idToConstant); } static OrcValueReader array(OrcValueReader elementReader) { @@ -143,8 +147,11 @@ static class StructReader extends OrcValueReaders.StructReader { private final int numFields; protected StructReader( - List> readers, Types.StructType struct, Map idToConstant) { - super(readers, struct, idToConstant); + TypeDescription record, + List> readers, + Types.StructType struct, + Map idToConstant) { + super(record, readers, struct, idToConstant); this.numFields = struct.fields().size(); } From b7e65c902936141666d56acad62c0eacf6aea3c5 Mon Sep 17 00:00:00 2001 From: Mukund Thakur Date: Fri, 8 May 2026 10:16:52 -0500 Subject: [PATCH 181/197] Core: Add test to validate we can't delete map value during schema evolution (#15767) --- .../java/org/apache/iceberg/TestSchemaUpdate.java | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/core/src/test/java/org/apache/iceberg/TestSchemaUpdate.java b/core/src/test/java/org/apache/iceberg/TestSchemaUpdate.java index fb942dde2aa2..5325e4013c68 100644 --- a/core/src/test/java/org/apache/iceberg/TestSchemaUpdate.java +++ b/core/src/test/java/org/apache/iceberg/TestSchemaUpdate.java @@ -1181,6 +1181,17 @@ public void testDeleteMapKey() { .hasMessageStartingWith("Cannot delete map keys"); } + @Test + public void testDeleteMapValue() { + assertThatThrownBy( + () -> + new SchemaUpdate(SCHEMA, SCHEMA_LAST_COLUMN_ID) + .deleteColumn("locations.value") + .apply()) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageStartingWith("Cannot delete value type from map"); + } + @Test public void testAddFieldToMapKey() { assertThatThrownBy( From 
299f7be3987c87c8812234f85131d594b5553db6 Mon Sep 17 00:00:00 2001 From: gaborkaszab Date: Fri, 8 May 2026 19:29:29 +0200 Subject: [PATCH 182/197] OpenAPI, Core: Disambiguate the intent of REFS snapshot mode (#16252) * Spec, Core: Disambiguate the intent of REFS snapshot mode Spell out that it has an effect on the 'snapshots' and not the 'snapshot-log' part of the response. Some implementations already got it wrong. * Update core/src/test/java/org/apache/iceberg/rest/TestRESTCatalog.java Co-authored-by: Eduard Tudenhoefner --------- Co-authored-by: Eduard Tudenhoefner --- .../apache/iceberg/rest/TestRESTCatalog.java | 25 +++++++++++++++++++ open-api/rest-catalog-open-api.yaml | 6 ++--- 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/core/src/test/java/org/apache/iceberg/rest/TestRESTCatalog.java b/core/src/test/java/org/apache/iceberg/rest/TestRESTCatalog.java index e4fa156059d8..017f400f860f 100644 --- a/core/src/test/java/org/apache/iceberg/rest/TestRESTCatalog.java +++ b/core/src/test/java/org/apache/iceberg/rest/TestRESTCatalog.java @@ -62,6 +62,7 @@ import org.apache.iceberg.DataFile; import org.apache.iceberg.DataFiles; import org.apache.iceberg.HasTableOperations; +import org.apache.iceberg.HistoryEntry; import org.apache.iceberg.MetadataUpdate; import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Schema; @@ -1086,6 +1087,14 @@ public void testTableSnapshotLoading() { .asInstanceOf(InstanceOfAssertFactories.list(Snapshot.class)) .hasSize(1); + // snapshot log is complete regardless REFS mode + assertThat(((BaseTable) refsTable).operations().current()) + .extracting("snapshotLog") + .asInstanceOf(InstanceOfAssertFactories.list(HistoryEntry.class)) + .hasSize(2) + .containsExactlyInAnyOrderElementsOf( + ((BaseTable) table).operations().current().snapshotLog()); + assertThat(refsTable.currentSnapshot()).isEqualTo(table.currentSnapshot()); // verify that the table was loaded with the refs argument @@ -1180,6 +1189,14 @@ public void 
testTableSnapshotLoadingWithDivergedBranches(String formatVersion) { .asInstanceOf(InstanceOfAssertFactories.list(Snapshot.class)) .hasSize(2); + // snapshot log is complete regardless REFS mode + assertThat(((BaseTable) refsTable).operations().current()) + .extracting("snapshotLog") + .asInstanceOf(InstanceOfAssertFactories.list(HistoryEntry.class)) + .hasSize(1) // main branch has a single snapshot + .containsExactlyInAnyOrderElementsOf( + ((BaseTable) table).operations().current().snapshotLog()); + assertThat(refsTable.currentSnapshot()).isEqualTo(table.currentSnapshot()); // verify that the table was loaded with the refs argument @@ -1265,6 +1282,14 @@ public void lazySnapshotLoadingWithDivergedHistory() { .asInstanceOf(InstanceOfAssertFactories.list(Snapshot.class)) .hasSize(1); + // snapshot log is complete regardless REFS mode + assertThat(((BaseTable) refsTable).operations().current()) + .extracting("snapshotLog") + .asInstanceOf(InstanceOfAssertFactories.list(HistoryEntry.class)) + .hasSize(numSnapshots) + .containsExactlyInAnyOrderElementsOf( + ((BaseTable) table).operations().current().snapshotLog()); + assertThat(refsTable.currentSnapshot()).isEqualTo(table.currentSnapshot()); assertThat(refsTable.snapshots()).hasSize(numSnapshots); assertThat(refsTable.history()).hasSize(numSnapshots); diff --git a/open-api/rest-catalog-open-api.yaml b/open-api/rest-catalog-open-api.yaml index 2435cd43f0e5..06d13ec133d9 100644 --- a/open-api/rest-catalog-open-api.yaml +++ b/open-api/rest-catalog-open-api.yaml @@ -983,9 +983,9 @@ paths: - in: query name: snapshots description: - The snapshots to return in the body of the metadata. Setting the value to `all` would - return the full set of snapshots currently valid for the table. Setting the value to - `refs` would load all snapshots referenced by branches or tags. + The snapshots to return in the body of the metadata via the `snapshots` field. 
Setting + the value to `all` would return the full set of snapshots currently valid for the table. + Setting the value to `refs` would load all snapshots referenced by branches or tags. Default if no param is provided is `all`. required: false From 4efbda3e45c86223d229af46e67e14a6232b5d10 Mon Sep 17 00:00:00 2001 From: Alex Miller Date: Fri, 8 May 2026 10:30:23 -0700 Subject: [PATCH 183/197] Add Oracle as an Iceberg vendor (#16251) --- site/docs/vendors.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/site/docs/vendors.md b/site/docs/vendors.md index 4260553e96cd..c1ca0e7eb23d 100644 --- a/site/docs/vendors.md +++ b/site/docs/vendors.md @@ -141,6 +141,10 @@ IOMETE is a fully-managed ready to use, batteries included Data Platform. IOMETE ### [Microsoft OneLake](https://learn.microsoft.com/en-us/fabric/onelake/) [Microsoft OneLake](https://learn.microsoft.com/en-us/fabric/onelake/) is a single unified data lake that brings together your entire data estate into an open, secure foundation for analytics across the organization. Built into Microsoft Fabric, OneLake delivers two powerful APIs: the Tables API and the Files API. The [OneLake Tables API](https://aka.ms/onelakeircdocs) supports the Apache Iceberg REST Catalog (IRC) specification, making it simple to create, manage, and integrate Iceberg tables with existing tools and workflows. The OneLake Files API offers full Azure Data Lake Storage (ADLS) compatibility, enabling seamless file operations and interoperability with familiar ADLS tools. Together, these APIs make OneLake a truly open and interoperable data lake, delivering flexibility and connectivity for modern analytics and AI-driven pipelines. +### [Oracle](https://oracle.com/) + +Oracle [Autonomous AI Lakehouse](https://www.oracle.com/autonomous-database/autonomous-ai-lakehouse/) combines the openness of Apache Iceberg with the performance, automation, and security of Oracle Autonomous Database and Exadata. 
Available across Oracle Cloud Infrastructure (OCI), Microsoft Azure, Google Cloud, and AWS, Oracle provides a multicloud, open lakehouse architecture with high-performance access to Iceberg tables through integration with existing catalogs and support for the Apache Iceberg REST Catalog specification. Oracle enables interoperability across engines such as Apache Spark, Trino, and Apache Flink while minimizing data movement and preserving vendor independence. Built-in AI, vector search, graph analytics, and JSON-relational capabilities allow organizations to run advanced analytics and AI workloads directly on Iceberg data with enterprise-grade governance, availability, and serverless scalability. + ### [PuppyGraph](https://puppygraph.com) PuppyGraph is a cloud-native graph analytics engine that enables users to query one or more relational data stores as a unified graph model. This eliminates the overhead of deploying and maintaining a siloed graph database system, with no ETL required. [PuppyGraph’s native Apache Iceberg integration](https://docs.puppygraph.com/user-manual/getting-started/iceberg) adds native graph capabilities to your existing data lake in an easy and performant way. From b84d4468979bb29350cd0ab4246a2c56787f8998 Mon Sep 17 00:00:00 2001 From: Amogh Jahagirdar Date: Fri, 8 May 2026 11:32:37 -0600 Subject: [PATCH 184/197] Spec: Update formatting in tables to use material content tabs (#14656) * Spec: Udpate formatting to use material content tabs * Collapse v1-v3 into a single tab * Spec: Restore content dropped during tab formatting refactor Restore four pieces of content that were accidentally removed in the formatting-only tab refactor, as flagged by Steven's review: - column_sizes: restore "Does not include bytes necessary to read other columns, like footers." 
sentence - partitions: restore "(see below)" cross-reference to field_summary table - partition-spec: restore note that writers use this field but readers use specs from manifest files - properties: restore commit.retry.num-retries example Co-Authored-By: Claude Sonnet 4.6 * add back (see below) --------- Co-authored-by: Claude Sonnet 4.6 Co-authored-by: Kevin Liu --- format/spec.md | 322 +++++++++++++++++++++++++------------------------ 1 file changed, 166 insertions(+), 156 deletions(-) diff --git a/format/spec.md b/format/spec.md index 0d3c79762c6c..94651da0fa86 100644 --- a/format/spec.md +++ b/format/spec.md @@ -600,14 +600,15 @@ A manifest stores files for a single partition spec. When a table’s partition A manifest file must store the partition spec and other metadata as properties in the Avro file's key-value metadata: -| v1 | v2 | Key | Value | -|------------|------------|---------------------|---------------------------------------------------------------------------------------------------------------------------------------------| -| _required_ | _required_ | `schema` | JSON representation of the table schema at the time the manifest was written | -| _optional_ | _required_ | `schema-id` | ID of the schema used to write the manifest as a string | -| _required_ | _required_ | `partition-spec` | JSON representation of only the partition fields array of the partition spec used to write the manifest. 
See [Appendix C](#partition-specs) | -| _optional_ | _required_ | `partition-spec-id` | ID of the partition spec used to write the manifest as a string | -| _optional_ | _required_ | `format-version` | Table format version number of the manifest as a string | -| | _required_ | `content` | Type of content files tracked by the manifest: "data" or "deletes" | +=== "v1 - v3" + | v1 | v2 and v3 | Key | Value | + |------------|------------|---------------------|---------------------------------------------------------------------------------------------------------------------------------------------| + | _required_ | _required_ | `schema` | JSON representation of the table schema at the time the manifest was written | + | _optional_ | _required_ | `schema-id` | ID of the schema used to write the manifest as a string | + | _required_ | _required_ | `partition-spec` | JSON representation of only the partition fields array of the partition spec used to write the manifest. See [Appendix C](#partition-specs) | + | _optional_ | _required_ | `partition-spec-id` | ID of the partition spec used to write the manifest as a string | + | _optional_ | _required_ | `format-version` | Table format version number of the manifest as a string | + | | _required_ | `content` | Type of content files tracked by the manifest: "data" or "deletes" | The schema of a manifest file is defined by the `manifest_entry` struct, described in the following section. @@ -615,13 +616,14 @@ The schema of a manifest file is defined by the `manifest_entry` struct, describ The `manifest_entry` struct consists of the following fields: -| v1 | v2 | Field id, name | Type | Description | -| ---------- | ---------- |-------------------------------|-----------------------------------------------------------|-------------| -| _required_ | _required_ | **`0 status`** | `int` with meaning: `0: EXISTING` `1: ADDED` `2: DELETED` | Used to track additions and deletions. Deletes are informational only and not used in scans. 
| -| _required_ | _optional_ | **`1 snapshot_id`** | `long` | Snapshot id where the file was added, or deleted if status is 2. Inherited when null. | -| | _optional_ | **`3 sequence_number`** | `long` | Data sequence number of the file. Inherited when null and status is 1 (added). | -| | _optional_ | **`4 file_sequence_number`** | `long` | File sequence number indicating when the file was added. Inherited when null and status is 1 (added). | -| _required_ | _required_ | **`2 data_file`** | `data_file` `struct` (see below) | File path, partition tuple, metrics, ... | +=== "v1 - v3" + | v1 | v2 and v3 | Field id, name | Type | Description | + | ---------- | ---------- |-------------------------------|-----------------------------------------------------------|-------------| + | _required_ | _required_ | **`0 status`** | `int` with meaning: `0: EXISTING` `1: ADDED` `2: DELETED` | Used to track additions and deletions. Deletes are informational only and not used in scans. | + | _required_ | _optional_ | **`1 snapshot_id`** | `long` | Snapshot id where the file was added, or deleted if status is 2. Inherited when null. | + | | _optional_ | **`3 sequence_number`** | `long` | Data sequence number of the file. Inherited when null and status is 1 (added). | + | | _optional_ | **`4 file_sequence_number`** | `long` | File sequence number indicating when the file was added. Inherited when null and status is 1 (added). | + | _required_ | _required_ | **`2 data_file`** | `data_file` `struct` (see below) | File path, partition tuple, metrics, ... | The manifest entry fields are used to keep track of the snapshot in which files were added or logically deleted. The `data_file` struct, defined below, is nested inside the manifest entry so that it can be easily passed to job planning without the manifest entry fields. 
@@ -643,32 +645,33 @@ Notes: The `data_file` struct consists of the following fields: -| v1 | v2 | v3 | Field id, name | Type | Description | -| ---------- |------------|------------|-----------------------------------|-----------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| | _required_ | _required_ | **`134 content`** | `int` with meaning: `0: DATA`, `1: POSITION DELETES`, `2: EQUALITY DELETES` | Type of content stored by the data file: data, equality deletes, or position deletes (all v1 files are data files) | -| _required_ | _required_ | _required_ | **`100 file_path`** | `string` | Full URI for the file with FS scheme | -| _required_ | _required_ | _required_ | **`101 file_format`** | `string` | String file format name, `avro`, `orc`, `parquet`, or `puffin` | -| _required_ | _required_ | _required_ | **`102 partition`** | `struct<...>` | Partition data tuple, schema based on the partition spec output using partition field ids for the struct field ids | -| _required_ | _required_ | _required_ | **`103 record_count`** | `long` | Number of records in this file, or the cardinality of a deletion vector | -| _required_ | _required_ | _required_ | **`104 file_size_in_bytes`** | `long` | Total file size in bytes | -| _required_ | | | ~~**`105 block_size_in_bytes`**~~ | `long` | **Deprecated. Always write a default in v1. Do not write in v2 or v3.** | -| _optional_ | | | ~~**`106 file_ordinal`**~~ | `int` | **Deprecated. Do not write.** | -| _optional_ | | | ~~**`107 sort_columns`**~~ | `list<112: int>` | **Deprecated. Do not write.** | -| _optional_ | _optional_ | _optional_ | **`108 column_sizes`** | `map<117: int, 118: long>` | Map from column id to the total size on disk of all regions that store the column. 
Does not include bytes necessary to read other columns, like footers. Leave null for row-oriented formats (Avro) | -| _optional_ | _optional_ | _optional_ | **`109 value_counts`** | `map<119: int, 120: long>` | Map from column id to number of values in the column (including null and NaN values) | -| _optional_ | _optional_ | _optional_ | **`110 null_value_counts`** | `map<121: int, 122: long>` | Map from column id to number of null values in the column | -| _optional_ | _optional_ | _optional_ | **`137 nan_value_counts`** | `map<138: int, 139: long>` | Map from column id to number of NaN values in the column | -| _optional_ | _optional_ | | ~~**`111 distinct_counts`**~~ | `map<123: int, 124: long>` | **Deprecated. Do not write.** | -| _optional_ | _optional_ | _optional_ | **`125 lower_bounds`** | `map<126: int, 127: binary>` | Map from column id to lower bound in the column serialized as binary [1]. Each value must be less than or equal to all non-null, non-NaN values in the column for the file [2] | -| _optional_ | _optional_ | _optional_ | **`128 upper_bounds`** | `map<129: int, 130: binary>` | Map from column id to upper bound in the column serialized as binary [1]. Each value must be greater than or equal to all non-null, non-Nan values in the column for the file [2] | -| _optional_ | _optional_ | _optional_ | **`131 key_metadata`** | `binary` | Implementation-specific key metadata for encryption | -| _optional_ | _optional_ | _optional_ | **`132 split_offsets`** | `list<133: long>` | Split offsets for the data file. For example, all row group offsets in a Parquet file. Must be sorted ascending | -| | _optional_ | _optional_ | **`135 equality_ids`** | `list<136: int>` | Field ids used to determine row equality in equality delete files. Required when `content=2` and should be null otherwise. 
Fields with ids listed in this column must be present in the delete file | -| _optional_ | _optional_ | _optional_ | **`140 sort_order_id`** | `int` | ID representing sort order for this file [3]. | -| | | _optional_ | **`142 first_row_id`** | `long` | The `_row_id` for the first row in the data file. See [First Row ID Inheritance](#first-row-id-inheritance) | -| | _optional_ | _optional_ | **`143 referenced_data_file`** | `string` | Fully qualified location (URI with FS scheme) of a data file that all deletes reference [4] | -| | | _optional_ | **`144 content_offset`** | `long` | The offset in the file where the content starts [5] | -| | | _optional_ | **`145 content_size_in_bytes`** | `long` | The length of a referenced content stored in the file; required if `content_offset` is present [5] | +=== "v1 - v3" + | v1 | v2 | v3 | Field id, name | Type | Description | + | ---------- |------------|------------|-----------------------------------|-----------------------------------------------------------------------------|-------------| + | | _required_ | _required_ | **`134 content`** | `int` with meaning: `0: DATA`, `1: POSITION DELETES`, `2: EQUALITY DELETES` | Type of content stored by the data file: data, equality deletes, or position deletes (all v1 files are data files) | + | _required_ | _required_ | _required_ | **`100 file_path`** | `string` | Full URI for the file with FS scheme | + | _required_ | _required_ | _required_ | **`101 file_format`** | `string` | String file format name, `avro`, `orc`, `parquet`, or `puffin` | + | _required_ | _required_ | _required_ | **`102 partition`** | `struct<...>` | Partition data tuple, schema based on the partition spec output using partition field ids for the struct field ids | + | _required_ | _required_ | _required_ | **`103 record_count`** | `long` | Number of records in this file, or the cardinality of a deletion vector | + | _required_ | _required_ | _required_ | **`104 file_size_in_bytes`** | `long` | Total file 
size in bytes | + | _required_ | | | ~~**`105 block_size_in_bytes`**~~ | `long` | **Deprecated. Always write a default in v1. Do not write in v2 or v3.** | + | _optional_ | | | ~~**`106 file_ordinal`**~~ | `int` | **Deprecated. Do not write.** | + | _optional_ | | | ~~**`107 sort_columns`**~~ | `list<112: int>` | **Deprecated. Do not write.** | + | _optional_ | _optional_ | _optional_ | **`108 column_sizes`** | `map<117: int, 118: long>` | Map from column id to the total size on disk of all regions that store the column. **Does not include bytes necessary to read other columns, like footers.** Leave null for row-oriented formats (Avro) | + | _optional_ | _optional_ | _optional_ | **`109 value_counts`** | `map<119: int, 120: long>` | Map from column id to number of values in the column (including null and NaN values) | + | _optional_ | _optional_ | _optional_ | **`110 null_value_counts`** | `map<121: int, 122: long>` | Map from column id to number of null values in the column | + | _optional_ | _optional_ | _optional_ | **`137 nan_value_counts`** | `map<138: int, 139: long>` | Map from column id to number of NaN values in the column | + | _optional_ | _optional_ | | ~~**`111 distinct_counts`**~~ | `map<123: int, 124: long>` | **Deprecated. Do not write.** | + | _optional_ | _optional_ | _optional_ | **`125 lower_bounds`** | `map<126: int, 127: binary>` | Map from column id to lower bound in the column serialized as binary [1]. Each value must be less than or equal to all non-null, non-NaN values in the column for the file [2] | + | _optional_ | _optional_ | _optional_ | **`128 upper_bounds`** | `map<129: int, 130: binary>` | Map from column id to upper bound in the column serialized as binary [1]. 
Each value must be greater than or equal to all non-null, non-NaN values in the column for the file [2] | + | _optional_ | _optional_ | _optional_ | **`131 key_metadata`** | `binary` | Implementation-specific key metadata for encryption | + | _optional_ | _optional_ | _optional_ | **`132 split_offsets`** | `list<133: long>` | Split offsets for the data file. For example, all row group offsets in a Parquet file. Must be sorted ascending | + | | _optional_ | _optional_ | **`135 equality_ids`** | `list<136: int>` | Field ids used to determine row equality in equality delete files. Required when `content=2` and should be null otherwise. Fields with ids listed in this column must be present in the delete file | + | _optional_ | _optional_ | _optional_ | **`140 sort_order_id`** | `int` | ID representing sort order for this file [3]. | + | | | _optional_ | **`142 first_row_id`** | `long` | The `_row_id` for the first row in the data file. See [First Row ID Inheritance](#first-row-id-inheritance) | + | | _optional_ | _optional_ | **`143 referenced_data_file`** | `string` | Fully qualified location (URI with FS scheme) of a data file that all deletes reference [4] | + | | | _optional_ | **`144 content_offset`** | `long` | The offset in the file where the content starts [5] | + | | | _optional_ | **`145 content_size_in_bytes`** | `long` | The length of a referenced content stored in the file; required if `content_offset` is present [5] | The `partition` struct stores the tuple of partition values for each file. Its type is derived from the partition fields of the partition spec used to write the manifest file. In v2, the partition struct's field ids must match the ids from the partition spec.
@@ -733,20 +736,20 @@ Any null (unassigned) `first_row_id` must be assigned via inheritance, even if t ### Snapshots A snapshot consists of the following fields: - -| v1 | v2 | v3 | Field | Description | -| ---------- | ---------- |------------|------------------------------|------------------------------------------------------------------------------------------------------------------------------------| -| _required_ | _required_ | _required_ | **`snapshot-id`** | A unique long ID | -| _optional_ | _optional_ | _optional_ | **`parent-snapshot-id`** | The snapshot ID of the snapshot's parent. Omitted for any snapshot with no parent | -| | _required_ | _required_ | **`sequence-number`** | A monotonically increasing long that tracks the order of changes to a table | -| _required_ | _required_ | _required_ | **`timestamp-ms`** | A timestamp when the snapshot was created, used for garbage collection and table inspection | -| _optional_ | _required_ | _required_ | **`manifest-list`** | The location of a manifest list for this snapshot that tracks manifest files with additional metadata | -| _optional_ | | | **`manifests`** | A list of manifest file locations. 
Must be omitted if `manifest-list` is present | -| _optional_ | _required_ | _required_ | **`summary`** | A string map that summarizes the snapshot changes, including `operation` as a _required_ field (see below) | -| _optional_ | _optional_ | _optional_ | **`schema-id`** | ID of the table's current schema when the snapshot was created | -| | | _required_ | **`first-row-id`** | The first `_row_id` assigned to the first row in the first data file in the first manifest, see [Row Lineage](#row-lineage) | -| | | _required_ | **`added-rows`** | The upper bound of the number of rows with assigned row IDs, see [Row Lineage](#row-lineage) | -| | | _optional_ | **`key-id`** | ID of the encryption key that encrypts the manifest list key metadata | +=== "v1 - v3" + | v1 | v2 | v3 | Field | Description | + | ---------- | ---------- |------------|------------------------------|-------------| + | _required_ | _required_ | _required_ | **`snapshot-id`** | A unique long ID | + | _optional_ | _optional_ | _optional_ | **`parent-snapshot-id`** | The snapshot ID of the snapshot's parent. Omitted for any snapshot with no parent | + | | _required_ | _required_ | **`sequence-number`** | A monotonically increasing long that tracks the order of changes to a table | + | _required_ | _required_ | _required_ | **`timestamp-ms`** | A timestamp when the snapshot was created, used for garbage collection and table inspection | + | _optional_ | _required_ | _required_ | **`manifest-list`** | The location of a manifest list for this snapshot that tracks manifest files with additional metadata | + | _optional_ | | | **`manifests`** | A list of manifest file locations. 
Must be omitted if `manifest-list` is present | + | _optional_ | _required_ | _required_ | **`summary`** | A string map that summarizes the snapshot changes, including `operation` as a _required_ field (see below) | + | _optional_ | _optional_ | _optional_ | **`schema-id`** | ID of the table's current schema when the snapshot was created | + | | | _required_ | **`first-row-id`** | The first `_row_id` assigned to the first row in the first data file in the first manifest, see [Row Lineage](#row-lineage) | + | | | _required_ | **`added-rows`** | The upper bound of the number of rows with assigned row IDs, see [Row Lineage](#row-lineage) | + | | | _optional_ | **`key-id`** | ID of the encryption key that encrypts the manifest list key metadata | The snapshot summary's `operation` field is used by some operations, like snapshot expiration, to skip processing certain snapshots. Possible `operation` values are: @@ -790,33 +793,34 @@ A manifest list is a valid Iceberg data file: files must use valid Iceberg forma Manifest list files store `manifest_file`, a struct with the following fields: -| v1 | v2 | v3 | Field id, name | Type | Description | -| ---------- | ---------- |------------|----------------------------------|---------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------| -| _required_ | _required_ | _required_ | **`500 manifest_path`** | `string` | Location of the manifest file | -| _required_ | _required_ | _required_ | **`501 manifest_length`** | `long` | Length of the manifest file in bytes | -| _required_ | _required_ | _required_ | **`502 partition_spec_id`** | `int` | ID of a partition spec used to write the manifest; must be listed in table metadata `partition-specs` | -| | _required_ | _required_ | **`517 content`** | `int` with meaning: `0: data`, `1: deletes` | The type of files tracked by the manifest, either data or 
delete files; 0 for all v1 manifests | -| | _required_ | _required_ | **`515 sequence_number`** | `long` | The sequence number when the manifest was added to the table; use 0 when reading v1 manifest lists | -| | _required_ | _required_ | **`516 min_sequence_number`** | `long` | The minimum data sequence number of all live data or delete files in the manifest; use 0 when reading v1 manifest lists | -| _required_ | _required_ | _required_ | **`503 added_snapshot_id`** | `long` | ID of the snapshot where the manifest file was added | -| _optional_ | _required_ | _required_ | **`504 added_files_count`** | `int` | Number of entries in the manifest that have status `ADDED` (1), when `null` this is assumed to be non-zero | -| _optional_ | _required_ | _required_ | **`505 existing_files_count`** | `int` | Number of entries in the manifest that have status `EXISTING` (0), when `null` this is assumed to be non-zero | -| _optional_ | _required_ | _required_ | **`506 deleted_files_count`** | `int` | Number of entries in the manifest that have status `DELETED` (2), when `null` this is assumed to be non-zero | -| _optional_ | _required_ | _required_ | **`512 added_rows_count`** | `long` | Number of rows in all of files in the manifest that have status `ADDED`, when `null` this is assumed to be non-zero | -| _optional_ | _required_ | _required_ | **`513 existing_rows_count`** | `long` | Number of rows in all of files in the manifest that have status `EXISTING`, when `null` this is assumed to be non-zero | -| _optional_ | _required_ | _required_ | **`514 deleted_rows_count`** | `long` | Number of rows in all of files in the manifest that have status `DELETED`, when `null` this is assumed to be non-zero | -| _optional_ | _optional_ | _optional_ | **`507 partitions`** | `list<508: field_summary>` (see below) | A list of field summaries for each partition field in the spec. Each field in the list corresponds to a field in the manifest file’s partition spec. 
| -| _optional_ | _optional_ | _optional_ | **`519 key_metadata`** | `binary` | Implementation-specific key metadata for encryption | -| | | _optional_ | **`520 first_row_id`** | `long` | The starting `_row_id` to assign to rows added by `ADDED` data files [First Row ID Assignment](#first-row-id-assignment) | +=== "v1 - v3" + | v1 | v2 | v3 | Field id, name | Type | Description | + | ---------- | ---------- |------------|-------------------------------------|---------------------------------------------|-------------| + | _required_ | _required_ | _required_ | **`500 manifest_path`** | `string` | Location of the manifest file | + | _required_ | _required_ | _required_ | **`501 manifest_length`** | `long` | Length of the manifest file in bytes | + | _required_ | _required_ | _required_ | **`502 partition_spec_id`** | `int` | ID of a partition spec used to write the manifest; must be listed in table metadata `partition-specs` | + | | _required_ | _required_ | **`517 content`** | `int` with meaning: `0: data`, `1: deletes` | The type of files tracked by the manifest, either data or delete files; 0 for all v1 manifests | + | | _required_ | _required_ | **`515 sequence_number`** | `long` | The sequence number when the manifest was added to the table; use 0 when reading v1 manifest lists | + | | _required_ | _required_ | **`516 min_sequence_number`** | `long` | The minimum data sequence number of all live data or delete files in the manifest; use 0 when reading v1 manifest lists | + | _required_ | _required_ | _required_ | **`503 added_snapshot_id`** | `long` | ID of the snapshot where the manifest file was added | + | _optional_ | _required_ | _required_ | **`504 added_files_count`** | `int` | Number of entries in the manifest that have status `ADDED` (1), when `null` this is assumed to be non-zero | + | _optional_ | _required_ | _required_ | **`505 existing_files_count`** | `int` | Number of entries in the manifest that have status `EXISTING` (0), when `null` this is 
assumed to be non-zero | + | _optional_ | _required_ | _required_ | **`506 deleted_files_count`** | `int` | Number of entries in the manifest that have status `DELETED` (2), when `null` this is assumed to be non-zero | + | _optional_ | _required_ | _required_ | **`512 added_rows_count`** | `long` | Number of rows in all of files in the manifest that have status `ADDED`, when `null` this is assumed to be non-zero | + | _optional_ | _required_ | _required_ | **`513 existing_rows_count`** | `long` | Number of rows in all of files in the manifest that have status `EXISTING`, when `null` this is assumed to be non-zero | + | _optional_ | _required_ | _required_ | **`514 deleted_rows_count`** | `long` | Number of rows in all of files in the manifest that have status `DELETED`, when `null` this is assumed to be non-zero | + | _optional_ | _optional_ | _optional_ | **`507 partitions`** | `list<508: field_summary>` **(see below)** | A list of field summaries for each partition field in the spec. Each field in the list corresponds to a field in the manifest file’s partition spec. 
| + | _optional_ | _optional_ | _optional_ | **`519 key_metadata`** | `binary` | Implementation-specific key metadata for encryption | + | | | _optional_ | **`520 first_row_id`** | `long` | The starting `_row_id` to assign to rows added by `ADDED` data files [First Row ID Assignment](#first-row-id-assignment) | `field_summary` is a struct with the following fields: - -| v1 | v2 | Field id, name | Type | Description | -| ---------- | ---------- |-------------------------|---------------|-------------| -| _required_ | _required_ | **`509 contains_null`** | `boolean` | Whether the manifest contains at least one partition with a null value for the field | -| _optional_ | _optional_ | **`518 contains_nan`** | `boolean` | Whether the manifest contains at least one partition with a NaN value for the field | -| _optional_ | _optional_ | **`510 lower_bound`** | `bytes` [1] | Lower bound for the non-null, non-NaN values in the partition field, or null if all values are null or NaN [2] | -| _optional_ | _optional_ | **`511 upper_bound`** | `bytes` [1] | Upper bound for the non-null, non-NaN values in the partition field, or null if all values are null or NaN [2] | +=== "v1 - v3" + | v1 | v2 and v3 | Field id, name | Type | Description | + | ---------- | ---------- |-------------------------|---------------|-------------| + | _required_ | _required_ | **`509 contains_null`** | `boolean` | Whether the manifest contains at least one partition with a null value for the field | + | _optional_ | _optional_ | **`518 contains_nan`** | `boolean` | Whether the manifest contains at least one partition with a NaN value for the field | + | _optional_ | _optional_ | **`510 lower_bound`** | `bytes` [1] | Lower bound for the non-null, non-NaN values in the partition field, or null if all values are null or NaN [2] | + | _optional_ | _optional_ | **`511 upper_bound`** | `bytes` [1] | Upper bound for the non-null, non-NaN values in the partition field, or null if all values are null or NaN [2] 
| Notes: @@ -885,13 +889,14 @@ Tags are labels for individual snapshots. Branches are mutable named references The snapshot reference object records all the information of a reference including snapshot ID, reference type and [Snapshot Retention Policy](#snapshot-retention-policy). -| v1 | v2 | Field name | Type | Description | -| ---------- | ---------- | ---------------------------- | --------- | ----------- | -| _required_ | _required_ | **`snapshot-id`** | `long` | A reference's snapshot ID. The tagged snapshot or latest snapshot of a branch. | -| _required_ | _required_ | **`type`** | `string` | Type of the reference, `tag` or `branch` | -| _optional_ | _optional_ | **`min-snapshots-to-keep`** | `int` | For `branch` type only, a positive number for the minimum number of snapshots to keep in a branch while expiring snapshots. Defaults to table property `history.expire.min-snapshots-to-keep`. | -| _optional_ | _optional_ | **`max-snapshot-age-ms`** | `long` | For `branch` type only, a positive number for the max age of snapshots to keep when expiring, including the latest snapshot. Defaults to table property `history.expire.max-snapshot-age-ms`. | -| _optional_ | _optional_ | **`max-ref-age-ms`** | `long` | For snapshot references except the `main` branch, a positive number for the max age of the snapshot reference to keep while expiring snapshots. Defaults to table property `history.expire.max-ref-age-ms`. The `main` branch never expires. | +=== "v1 - v3" + | v1 | v2 and v3 | Field name | Type | Description | + | ---------- | ---------- | ---------------------------- | --------- | ----------- | + | _required_ | _required_ | **`snapshot-id`** | `long` | A reference's snapshot ID. The tagged snapshot or latest snapshot of a branch. 
| + | _required_ | _required_ | **`type`** | `string` | Type of the reference, `tag` or `branch` | + | _optional_ | _optional_ | **`min-snapshots-to-keep`** | `int` | For `branch` type only, a positive number for the minimum number of snapshots to keep in a branch while expiring snapshots. Defaults to table property `history.expire.min-snapshots-to-keep`. | + | _optional_ | _optional_ | **`max-snapshot-age-ms`** | `long` | For `branch` type only, a positive number for the max age of snapshots to keep when expiring, including the latest snapshot. Defaults to table property `history.expire.max-snapshot-age-ms`. | + | _optional_ | _optional_ | **`max-ref-age-ms`** | `long` | For snapshot references except the `main` branch, a positive number for the max age of the snapshot reference to keep while expiring snapshots. Defaults to table property `history.expire.max-ref-age-ms`. The `main` branch never expires. | Valid snapshot references are stored as the values of the `refs` map in table metadata. For serialization, see Appendix C. @@ -921,33 +926,34 @@ The atomic operation used to commit metadata depends on how tables are tracked a Table metadata consists of the following fields: -| v1 | v2 | v3 | Field | Description | -| ---------- | ---------- |------------|-----------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| _required_ | _required_ | _required_ | **`format-version`** | An integer version number for the format. Implementations must throw an exception if a table's version is higher than the supported version. 
| -| _optional_ | _required_ | _required_ | **`table-uuid`** | A UUID that identifies the table, generated when the table is created. Implementations must throw an exception if a table's UUID does not match the expected UUID after refreshing metadata. | -| _required_ | _required_ | _required_ | **`location`** | The table's base location. This is used by writers to determine where to store data files, manifest files, and table metadata files. | -| | _required_ | _required_ | **`last-sequence-number`** | The table's highest assigned sequence number, a monotonically increasing long that tracks the order of snapshots in a table. | -| _required_ | _required_ | _required_ | **`last-updated-ms`** | Timestamp in milliseconds from the unix epoch when the table was last updated. Each table metadata file should update this field just before writing. | -| _required_ | _required_ | _required_ | **`last-column-id`** | An integer; the highest assigned column ID for the table. This is used to ensure columns are always assigned an unused ID when evolving schemas. | -| _required_ | | | **`schema`** | The table’s current schema. (**Deprecated**: use `schemas` and `current-schema-id` instead) | -| _optional_ | _required_ | _required_ | **`schemas`** | A list of schemas, stored as objects with `schema-id`. | -| _optional_ | _required_ | _required_ | **`current-schema-id`** | ID of the table's current schema. | -| _required_ | | | **`partition-spec`** | The table’s current partition spec, stored as only fields. Note that this is used by writers to partition data, but is not used when reading because reads use the specs stored in manifest files. (**Deprecated**: use `partition-specs` and `default-spec-id` instead) | -| _optional_ | _required_ | _required_ | **`partition-specs`** | A list of partition specs, stored as full partition spec objects. | -| _optional_ | _required_ | _required_ | **`default-spec-id`** | ID of the "current" spec that writers should use by default. 
| -| _optional_ | _required_ | _required_ | **`last-partition-id`** | An integer; the highest assigned partition field ID across all partition specs for the table. This is used to ensure partition fields are always assigned an unused ID when evolving specs. | -| _optional_ | _optional_ | _optional_ | **`properties`** | A string to string map of table properties. This is used to control settings that affect reading and writing and is not intended to be used for arbitrary metadata. For example, `commit.retry.num-retries` is used to control the number of commit retries. | -| _optional_ | _optional_ | _optional_ | **`current-snapshot-id`** | `long` ID of the current table snapshot; must be the same as the current ID of the `main` branch in `refs`. | -| _optional_ | _optional_ | _optional_ | **`snapshots`** | A list of valid snapshots. Valid snapshots are snapshots for which all data files exist in the file system. A data file must not be deleted from the file system until the last snapshot in which it was listed is garbage collected. | -| _optional_ | _optional_ | _optional_ | **`snapshot-log`** | A list (optional) of timestamp and snapshot ID pairs that encodes changes to the current snapshot for the table. Each time the current-snapshot-id is changed, a new entry should be added with the last-updated-ms and the new current-snapshot-id. When snapshots are expired from the list of valid snapshots, all entries before a snapshot that has expired should be removed. | -| _optional_ | _optional_ | _optional_ | **`metadata-log`** | A list (optional) of timestamp and metadata file location pairs that encodes changes to the previous metadata files for the table. Each time a new metadata file is created, a new entry of the previous metadata file location should be added to the list. Tables can be configured to remove oldest metadata log entries and keep a fixed-size log of the most recent entries after a commit. 
| -| _optional_ | _required_ | _required_ | **`sort-orders`** | A list of sort orders, stored as full sort order objects. | -| _optional_ | _required_ | _required_ | **`default-sort-order-id`** | Default sort order id of the table. Note that this could be used by writers, but is not used when reading because reads use the specs stored in manifest files. | -| | _optional_ | _optional_ | **`refs`** | A map of snapshot references. The map keys are the unique snapshot reference names in the table, and the map values are snapshot reference objects. There is always a `main` branch reference pointing to the `current-snapshot-id` even if the `refs` map is null. | -| _optional_ | _optional_ | _optional_ | **`statistics`** | A list (optional) of [table statistics](#table-statistics). | -| _optional_ | _optional_ | _optional_ | **`partition-statistics`** | A list (optional) of [partition statistics](#partition-statistics). | -| | | _required_ | **`next-row-id`** | A `long` higher than all assigned row IDs; the next snapshot's `first-row-id`. See [Row Lineage](#row-lineage). | -| | | _optional_ | **`encryption-keys`** | A list (optional) of [encryption keys](#encryption-keys) used for table encryption. | +=== "v1 - v3" + | v1 | v2 | v3 | Field | Description | + | ---------- | ---------- |------------|-----------------------------| ------------| + | _required_ | _required_ | _required_ | **`format-version`** | An integer version number for the format. Implementations must throw an exception if a table’s version is higher than the supported version. | + | _optional_ | _required_ | _required_ | **`table-uuid`** | A UUID that identifies the table, generated when the table is created. Implementations must throw an exception if a table’s UUID does not match the expected UUID after refreshing metadata. | + | _required_ | _required_ | _required_ | **`location`** | The table’s base location. 
This is used by writers to determine where to store data files, manifest files, and table metadata files. | + | | _required_ | _required_ | **`last-sequence-number`** | The table’s highest assigned sequence number, a monotonically increasing long that tracks the order of snapshots in a table. | + | _required_ | _required_ | _required_ | **`last-updated-ms`** | Timestamp in milliseconds from the unix epoch when the table was last updated. Each table metadata file should update this field just before writing. | + | _required_ | _required_ | _required_ | **`last-column-id`** | An integer; the highest assigned column ID for the table. This is used to ensure columns are always assigned an unused ID when evolving schemas. | + | _required_ | | | **`schema`** | The table’s current schema. (**Deprecated**: use `schemas` and `current-schema-id` instead) | + | _optional_ | _required_ | _required_ | **`schemas`** | A list of schemas, stored as objects with `schema-id`. | + | _optional_ | _required_ | _required_ | **`current-schema-id`** | ID of the table’s current schema. | + | _required_ | | | **`partition-spec`** | The table’s current partition spec, stored as only fields. (**Deprecated**: use `partition-specs` and `default-spec-id` instead) Note that this is used by writers to partition data, but is not used when reading because reads use the specs stored in manifest files. | + | _optional_ | _required_ | _required_ | **`partition-specs`** | A list of partition specs, stored as full partition spec objects. | + | _optional_ | _required_ | _required_ | **`default-spec-id`** | ID of the "current" spec that writers should use by default. | + | _optional_ | _required_ | _required_ | **`last-partition-id`** | An integer; the highest assigned partition field ID across all partition specs for the table. This is used to ensure partition fields are always assigned an unused ID when evolving specs. 
| + | _optional_ | _optional_ | _optional_ | **`properties`** | A string to string map of table properties. This is used to control settings that affect reading and writing and is not intended to be used for arbitrary metadata. For example, `commit.retry.num-retries` is used to control the number of commit retries. | + | _optional_ | _optional_ | _optional_ | **`current-snapshot-id`** | `long` ID of the current table snapshot; must be the same as the current ID of the `main` branch in `refs`. | + | _optional_ | _optional_ | _optional_ | **`snapshots`** | A list of valid snapshots. Valid snapshots are snapshots for which all data files exist in the file system. A data file must not be deleted from the file system until the last snapshot in which it was listed is garbage collected. | + | _optional_ | _optional_ | _optional_ | **`snapshot-log`** | A list (optional) of timestamp and snapshot ID pairs that encodes changes to the current snapshot for the table. Each time the current-snapshot-id is changed, a new entry should be added with the last-updated-ms and the new current-snapshot-id. When snapshots are expired from the list of valid snapshots, all entries before a snapshot that has expired should be removed. | + | _optional_ | _optional_ | _optional_ | **`metadata-log`** | A list (optional) of timestamp and metadata file location pairs that encodes changes to the previous metadata files for the table. Each time a new metadata file is created, a new entry of the previous metadata file location should be added to the list. Tables can be configured to remove oldest metadata log entries and keep a fixed-size log of the most recent entries after a commit. | + | _optional_ | _required_ | _required_ | **`sort-orders`** | A list of sort orders, stored as full sort order objects. | + | _optional_ | _required_ | _required_ | **`default-sort-order-id`** | Default sort order id of the table. 
Note that this could be used by writers, but is not used when reading because reads use the specs stored in manifest files. | + | | _optional_ | _optional_ | **`refs`** | A map of snapshot references. The map keys are the unique snapshot reference names in the table, and the map values are snapshot reference objects. There is always a `main` branch reference pointing to the `current-snapshot-id` even if the `refs` map is null. | + | _optional_ | _optional_ | _optional_ | **`statistics`** | A list (optional) of [table statistics](#table-statistics). | + | _optional_ | _optional_ | _optional_ | **`partition-statistics`** | A list (optional) of [partition statistics](#partition-statistics). | + | | | _required_ | **`next-row-id`** | A `long` higher than all assigned row IDs; the next snapshot’s `first-row-id`. See [Row Lineage](#row-lineage). | + | | | _optional_ | **`encryption-keys`** | A list (optional) of [encryption keys](#encryption-keys) used for table encryption. | For serialization details, see Appendix C. @@ -963,24 +969,26 @@ many statistics files associated with different table snapshots. Statistics files metadata within `statistics` table metadata field is a struct with the following fields: -| v1 | v2 | Field name | Type | Description | -|----|----|------------|------|-------------| -| _required_ | _required_ | **`snapshot-id`** | `long` | ID of the Iceberg table's snapshot the statistics file is associated with. | -| _required_ | _required_ | **`statistics-path`** | `string` | Path of the statistics file. See [Puffin file format](puffin-spec.md). | -| _required_ | _required_ | **`file-size-in-bytes`** | `long` | Size of the statistics file. | -| _required_ | _required_ | **`file-footer-size-in-bytes`** | `long` | Total size of the statistics file's footer (not the footer payload size). See [Puffin file format](puffin-spec.md) for footer definition. 
| -| _optional_ | _optional_ | **`key-metadata`** | Base64-encoded implementation-specific key metadata for encryption. | -| _required_ | _required_ | **`blob-metadata`** | `list` (see below) | A list of the blob metadata for statistics contained in the file with structure described below. | +=== "v1 - v3" + | v1 | v2 and v3 | Field name | Type | Description | + | ---------- | ---------- |---------------------------------|-----------------------|-------------| + | _required_ | _required_ | **`snapshot-id`** | `long` | ID of the Iceberg table's snapshot the statistics file is associated with. | + | _required_ | _required_ | **`statistics-path`** | `string` | Path of the statistics file. See [Puffin file format](puffin-spec.md). | + | _required_ | _required_ | **`file-size-in-bytes`** | `long` | Size of the statistics file. | + | _required_ | _required_ | **`file-footer-size-in-bytes`** | `long` | Total size of the statistics file's footer (not the footer payload size). See [Puffin file format](puffin-spec.md) for footer definition. | + | _optional_ | _optional_ | **`key-metadata`** | | Base64-encoded implementation-specific key metadata for encryption. | + | _required_ | _required_ | **`blob-metadata`** | `list` (see below) | A list of the blob metadata for statistics contained in the file with structure described below. | Blob metadata is a struct with the following fields: -| v1 | v2 | Field name | Type | Description | -|----|----|------------|------|-------------| -| _required_ | _required_ | **`type`** | `string` | Type of the blob. Matches Blob type in the Puffin file. | -| _required_ | _required_ | **`snapshot-id`** | `long` | ID of the Iceberg table's snapshot the blob was computed from. | -| _required_ | _required_ | **`sequence-number`** | `long` | Sequence number of the Iceberg table's snapshot the blob was computed from. | -| _required_ | _required_ | **`fields`** | `list` | Ordered list of fields, given by field ID, on which the statistic was calculated. 
| -| _optional_ | _optional_ | **`properties`** | `map` | Additional properties associated with the statistic. Subset of Blob properties in the Puffin file. | +=== "v1 - v3" + | v1 | v2 and v3 | Field name | Type | Description | + | ---------- | ---------- |-----------------------|-----------------------|-------------| + | _required_ | _required_ | **`type`** | `string` | Type of the blob. Matches Blob type in the Puffin file. | + | _required_ | _required_ | **`snapshot-id`** | `long` | ID of the Iceberg table's snapshot the blob was computed from. | + | _required_ | _required_ | **`sequence-number`** | `long` | Sequence number of the Iceberg table's snapshot the blob was computed from. | + | _required_ | _required_ | **`fields`** | `list` | Ordered list of fields, given by field ID, on which the statistic was calculated. | + | _optional_ | _optional_ | **`properties`** | `map` | Additional properties associated with the statistic. Subset of Blob properties in the Puffin file. | #### Partition Statistics @@ -992,11 +1000,12 @@ Partition statistics file must be registered in the table metadata file to be co `partition-statistics` field of table metadata is an optional list of structs with the following fields: -| v1 | v2 | v3 | Field name | Type | Description | -|----|----|----|------------|------|-------------| -| _required_ | _required_ | _required_ | **`snapshot-id`** | `long` | ID of the Iceberg table's snapshot the partition statistics file is associated with. | -| _required_ | _required_ | _required_ | **`statistics-path`** | `string` | Path of the partition statistics file. See [Partition statistics file](#partition-statistics-file). | -| _required_ | _required_ | _required_ | **`file-size-in-bytes`** | `long` | Size of the partition statistics file. 
| +=== "v1 - v3" + | v1 | v2 | v3 | Field name | Type | Description | + | ---------- | ---------- |------------|--------------------------|----------|-------------| + | _required_ | _required_ | _required_ | **`snapshot-id`** | `long` | ID of the Iceberg table's snapshot the partition statistics file is associated with. | + | _required_ | _required_ | _required_ | **`statistics-path`** | `string` | Path of the partition statistics file. See [Partition statistics file](#partition-statistics-file). | + | _required_ | _required_ | _required_ | **`file-size-in-bytes`** | `long` | Size of the partition statistics file. | ##### Partition Statistics File @@ -1005,21 +1014,22 @@ These rows must be sorted (in ascending manner with NULL FIRST) by `partition` f The schema of the partition statistics file is as follows: -| v1 | v2 | v3 | Field id, name | Type | Description | -|----|----|----|----------------|------|-------------| -| _required_ | _required_ | _required_ | **`1 partition`** | `struct<..>` | Partition data tuple, schema based on the unified partition type considering all specs in a table | -| _required_ | _required_ | _required_ | **`2 spec_id`** | `int` | Partition spec id | -| _required_ | _required_ | _required_ | **`3 data_record_count`** | `long` | Count of records in data files | -| _required_ | _required_ | _required_ | **`4 data_file_count`** | `int` | Count of data files | -| _required_ | _required_ | _required_ | **`5 total_data_file_size_in_bytes`** | `long` | Total size of data files in bytes | -| _optional_ | _optional_ | _required_ | **`6 position_delete_record_count`** | `long` | Count of position deletes across position delete files and deletion vectors | -| _optional_ | _optional_ | _required_ | **`7 position_delete_file_count`** | `int` | Count of position delete files ignoring deletion vectors | -| | | _required_ | **`13 dv_count`** | `int` | Count of deletion vectors | -| _optional_ | _optional_ | _required_ | **`8 
equality_delete_record_count`** | `long` | Count of records in equality delete files | -| _optional_ | _optional_ | _required_ | **`9 equality_delete_file_count`** | `int` | Count of equality delete files | -| _optional_ | _optional_ | _optional_ | **`10 total_record_count`** | `long` | Accurate count of records in a partition after applying deletes if any | -| _optional_ | _optional_ | _optional_ | **`11 last_updated_at`** | `long` | Timestamp in milliseconds from the unix epoch when the partition was last updated | -| _optional_ | _optional_ | _optional_ | **`12 last_updated_snapshot_id`** | `long` | ID of snapshot that last updated this partition | +=== "v1 - v3" + | v1 | v2 | v3 | Field id, name | Type | Description | + | ---------- | ---------- |------------|------------------------------------------|--------------|-------------| + | _required_ | _required_ | _required_ | **`1 partition`** | `struct<..>` | Partition data tuple, schema based on the unified partition type considering all specs in a table | + | _required_ | _required_ | _required_ | **`2 spec_id`** | `int` | Partition spec id | + | _required_ | _required_ | _required_ | **`3 data_record_count`** | `long` | Count of records in data files | + | _required_ | _required_ | _required_ | **`4 data_file_count`** | `int` | Count of data files | + | _required_ | _required_ | _required_ | **`5 total_data_file_size_in_bytes`** | `long` | Total size of data files in bytes | + | _optional_ | _optional_ | _required_ | **`6 position_delete_record_count`** | `long` | Count of position deletes across position delete files and deletion vectors | + | _optional_ | _optional_ | _required_ | **`7 position_delete_file_count`** | `int` | Count of position delete files ignoring deletion vectors | + | | | _required_ | **`13 dv_count`** | `int` | Count of deletion vectors | + | _optional_ | _optional_ | _required_ | **`8 equality_delete_record_count`** | `long` | Count of records in equality delete files | + | _optional_ | 
_optional_ | _required_ | **`9 equality_delete_file_count`** | `int` | Count of equality delete files | + | _optional_ | _optional_ | _optional_ | **`10 total_record_count`** | `long` | Accurate count of records in a partition after applying deletes if any | + | _optional_ | _optional_ | _optional_ | **`11 last_updated_at`** | `long` | Timestamp in milliseconds from the unix epoch when the partition was last updated | + | _optional_ | _optional_ | _optional_ | **`12 last_updated_snapshot_id`** | `long` | ID of snapshot that last updated this partition | Note that partition data tuple's schema is based on the partition spec output using partition field ids for the struct field ids. The unified partition type is a struct containing all fields that have ever been a part of any spec in the table @@ -1044,13 +1054,13 @@ If a table has no deletes or only deletion vectors, implementations are encourag #### Encryption Keys Keys used for table encryption can be tracked in table metadata as a list named `encryption-keys`. The schema of each key is a struct with the following fields: - -| v1 | v2 | v3 | Field name | Type. 
| Description | -|----|----|------------|------------------------------|-----------------------|-------------| -| | | _required_ | **`key-id`** | `string` | ID of the encryption key | -| | | _required_ | **`encrypted-key-metadata`** | `string` | Encrypted key and metadata, base64 encoded [1] | -| | | _optional_ | **`encrypted-by-id`** | `string` | Optional ID of the key used to encrypt or wrap `key-metadata` | -| | | _optional_ | **`properties`** | `map` | A string to string map of additional metadata used by the table's encryption scheme | +=== "v1 - v3" + | v1 | v2 | v3 | Field name | Type | Description | + |----|----|------------|-------------------------------|-----------------------|-------------| + | | | _required_ | **`key-id`** | `string` | ID of the encryption key | + | | | _required_ | **`encrypted-key-metadata`** | `string` | Encrypted key and metadata, base64 encoded [1] | + | | | _optional_ | **`encrypted-by-id`** | `string` | Optional ID of the key used to encrypt or wrap `key-metadata` | + | | | _optional_ | **`properties`** | `map` | A string to string map of additional metadata used by the table's encryption scheme | Notes: From 9b8bde411f81af88fb87457ebe3b78c62faee2eb Mon Sep 17 00:00:00 2001 From: pvary Date: Fri, 8 May 2026 20:29:35 +0200 Subject: [PATCH 185/197] ORC: Backport add _row_id and _last_updated_sequence_number reader in Orc to support lineage (#16256) backports #15776 --- .../iceberg/flink/data/FlinkOrcReader.java | 2 +- .../iceberg/flink/data/FlinkOrcReaders.java | 15 ++++-- .../maintenance/api/TestRewriteDataFiles.java | 51 +++++++++++-------- .../operator/OperatorTestBase.java | 35 ++++++++++--- .../iceberg/flink/data/FlinkOrcReader.java | 2 +- .../iceberg/flink/data/FlinkOrcReaders.java | 15 ++++-- .../maintenance/api/TestRewriteDataFiles.java | 51 +++++++++++-------- .../operator/OperatorTestBase.java | 35 ++++++++++--- 8 files changed, 144 insertions(+), 62 deletions(-) diff --git
a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java index 3e3a29112cf4..77f16bfdb2ab 100644 --- a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java +++ b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java @@ -70,7 +70,7 @@ public OrcValueReader record( TypeDescription record, List names, List> fields) { - return FlinkOrcReaders.struct(fields, iStruct, idToConstant); + return FlinkOrcReaders.struct(record, fields, iStruct, idToConstant); } @Override diff --git a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java index 7a4a15c7e600..c5c958fbdb04 100644 --- a/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java +++ b/flink/v1.20/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java @@ -39,6 +39,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.types.Types; +import org.apache.orc.TypeDescription; import org.apache.orc.storage.ql.exec.vector.BytesColumnVector; import org.apache.orc.storage.ql.exec.vector.ColumnVector; import org.apache.orc.storage.ql.exec.vector.DecimalColumnVector; @@ -91,8 +92,11 @@ public static OrcValueReader map( } public static OrcValueReader struct( - List> readers, Types.StructType struct, Map idToConstant) { - return new StructReader(readers, struct, idToConstant); + TypeDescription record, + List> readers, + Types.StructType struct, + Map idToConstant) { + return new StructReader(record, readers, struct, idToConstant); } private static class StringReader implements OrcValueReader { @@ -265,8 +269,11 @@ private static class StructReader extends OrcValueReaders.StructReader private final int 
numFields; StructReader( - List> readers, Types.StructType struct, Map idToConstant) { - super(readers, struct, idToConstant); + TypeDescription record, + List> readers, + Types.StructType struct, + Map idToConstant) { + super(record, readers, struct, idToConstant); this.numFields = struct.fields().size(); } diff --git a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestRewriteDataFiles.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestRewriteDataFiles.java index 97b8b6786545..88b949a9a7f8 100644 --- a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestRewriteDataFiles.java +++ b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestRewriteDataFiles.java @@ -33,6 +33,7 @@ import java.util.List; import java.util.stream.StreamSupport; import org.apache.flink.streaming.api.graph.StreamGraphGenerator; +import org.apache.iceberg.FileFormat; import org.apache.iceberg.ManifestFiles; import org.apache.iceberg.MetadataColumns; import org.apache.iceberg.Schema; @@ -44,8 +45,14 @@ import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.FieldSource; class TestRewriteDataFiles extends MaintenanceTaskTestBase { + + private static final FileFormat[] FILE_FORMATS = + new FileFormat[] {FileFormat.AVRO, FileFormat.PARQUET, FileFormat.ORC}; + @Test void testRewriteUnpartitioned() throws Exception { Table table = createTable(); @@ -83,13 +90,14 @@ void testRewriteUnpartitioned() throws Exception { createRecord(4, "d"))); } - @Test - void testRewriteUnpartitionedPreserveLineage() throws Exception { - Table table = createTable(3); - insert(table, 1, "a"); - insert(table, 2, "b"); - insert(table, 3, "c"); - insert(table, 4, "d"); + @ParameterizedTest + 
@FieldSource("FILE_FORMATS") + void testRewriteUnpartitionedPreserveLineage(FileFormat fileFormat) throws Exception { + Table table = createTable(3, fileFormat); + insert(table, 1, "a", fileFormat); + insert(table, 2, "b", fileFormat); + insert(table, 3, "c", fileFormat); + insert(table, 4, "d", fileFormat); assertFileNum(table, 4, 0); @@ -123,15 +131,17 @@ void testRewriteUnpartitionedPreserveLineage() throws Exception { schema); } - @Test - void testRewriteTheSameFilePreserveLineage() throws Exception { - Table table = createTable(3); - insert(table, 1, "a"); - insert(table, 2, "b"); + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testRewriteTheSameFilePreserveLineage(FileFormat fileFormat) throws Exception { + Table table = createTable(3, fileFormat); + insert(table, 1, "a", fileFormat); + insert(table, 2, "b", fileFormat); // Create a file with two lines of data to verify that the rowid is read correctly. insert( table, - ImmutableList.of(SimpleDataUtil.createRecord(3, "c"), SimpleDataUtil.createRecord(4, "d"))); + ImmutableList.of(SimpleDataUtil.createRecord(3, "c"), SimpleDataUtil.createRecord(4, "d")), + fileFormat); assertFileNum(table, 3, 0); @@ -167,13 +177,14 @@ void testRewriteTheSameFilePreserveLineage() throws Exception { schema); } - @Test - void testRewritePartitionedPreserveLineage() throws Exception { - Table table = createPartitionedTable(3); - insertPartitioned(table, 1, "p1"); - insertPartitioned(table, 2, "p1"); - insertPartitioned(table, 3, "p2"); - insertPartitioned(table, 4, "p2"); + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testRewritePartitionedPreserveLineage(FileFormat fileFormat) throws Exception { + Table table = createPartitionedTable(3, fileFormat); + insertPartitioned(table, 1, "p1", fileFormat); + insertPartitioned(table, 2, "p1", fileFormat); + insertPartitioned(table, 3, "p2", fileFormat); + insertPartitioned(table, 4, "p2", fileFormat); assertFileNum(table, 4, 0); diff --git 
a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/OperatorTestBase.java b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/OperatorTestBase.java index 93291e8cc29a..06ab7861c0f5 100644 --- a/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/OperatorTestBase.java +++ b/flink/v1.20/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/OperatorTestBase.java @@ -133,10 +133,14 @@ void after() throws IOException { } protected static Table createTable() { - return createTable(2); + return createTable(2, FileFormat.PARQUET); } protected static Table createTable(int formatVersion) { + return createTable(formatVersion, FileFormat.PARQUET); + } + + protected static Table createTable(int formatVersion, FileFormat fileFormat) { return CATALOG_EXTENSION .catalog() .createTable( @@ -145,6 +149,8 @@ protected static Table createTable(int formatVersion) { PartitionSpec.unpartitioned(), null, ImmutableMap.of( + "write.format.default", + fileFormat.name(), TableProperties.FORMAT_VERSION, String.valueOf(formatVersion), "flink.max-continuous-empty-commits", @@ -182,7 +188,7 @@ protected static Table createTableWithDelete(int formatVersion) { "format-version", String.valueOf(formatVersion), "write.upsert.enabled", "true")); } - protected static Table createPartitionedTable(int formatVersion) { + protected static Table createPartitionedTable(int formatVersion, FileFormat fileFormat) { return CATALOG_EXTENSION .catalog() .createTable( @@ -191,6 +197,8 @@ protected static Table createPartitionedTable(int formatVersion) { PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).identity("data").build(), null, ImmutableMap.of( + "write.format.default", + fileFormat.name(), "format-version", String.valueOf(formatVersion), "flink.max-continuous-empty-commits", @@ -198,17 +206,27 @@ protected static Table createPartitionedTable(int formatVersion) { } protected static Table
createPartitionedTable() { - return createPartitionedTable(2); + return createPartitionedTable(2, FileFormat.PARQUET); } protected void insert(Table table, Integer id, String data) throws IOException { - new GenericAppenderHelper(table, FileFormat.PARQUET, warehouseDir) + insert(table, id, data, FileFormat.PARQUET); + } + + protected void insert(Table table, Integer id, String data, FileFormat fileFormat) + throws IOException { + new GenericAppenderHelper(table, fileFormat, warehouseDir) .appendToTable(Lists.newArrayList(SimpleDataUtil.createRecord(id, data))); table.refresh(); } protected void insert(Table table, List records) throws IOException { - new GenericAppenderHelper(table, FileFormat.PARQUET, warehouseDir).appendToTable(records); + insert(table, records, FileFormat.PARQUET); + } + + protected void insert(Table table, List records, FileFormat fileFormat) + throws IOException { + new GenericAppenderHelper(table, fileFormat, warehouseDir).appendToTable(records); table.refresh(); } @@ -309,7 +327,12 @@ protected void update(Table table, Integer id, String oldData, String tempData, } protected void insertPartitioned(Table table, Integer id, String data) throws IOException { - new GenericAppenderHelper(table, FileFormat.PARQUET, warehouseDir) + insertPartitioned(table, id, data, FileFormat.PARQUET); + } + + protected void insertPartitioned(Table table, Integer id, String data, FileFormat fileFormat) + throws IOException { + new GenericAppenderHelper(table, fileFormat, warehouseDir) .appendToTable( TestHelpers.Row.of(data), Lists.newArrayList(SimpleDataUtil.createRecord(id, data))); table.refresh(); diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java index 3e3a29112cf4..77f16bfdb2ab 100644 --- a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java +++ 
b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReader.java @@ -70,7 +70,7 @@ public OrcValueReader record( TypeDescription record, List names, List> fields) { - return FlinkOrcReaders.struct(fields, iStruct, idToConstant); + return FlinkOrcReaders.struct(record, fields, iStruct, idToConstant); } @Override diff --git a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java index 7a4a15c7e600..c5c958fbdb04 100644 --- a/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java +++ b/flink/v2.0/flink/src/main/java/org/apache/iceberg/flink/data/FlinkOrcReaders.java @@ -39,6 +39,7 @@ import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.types.Types; +import org.apache.orc.TypeDescription; import org.apache.orc.storage.ql.exec.vector.BytesColumnVector; import org.apache.orc.storage.ql.exec.vector.ColumnVector; import org.apache.orc.storage.ql.exec.vector.DecimalColumnVector; @@ -91,8 +92,11 @@ public static OrcValueReader map( } public static OrcValueReader struct( - List> readers, Types.StructType struct, Map idToConstant) { - return new StructReader(readers, struct, idToConstant); + TypeDescription record, + List> readers, + Types.StructType struct, + Map idToConstant) { + return new StructReader(record, readers, struct, idToConstant); } private static class StringReader implements OrcValueReader { @@ -265,8 +269,11 @@ private static class StructReader extends OrcValueReaders.StructReader private final int numFields; StructReader( - List> readers, Types.StructType struct, Map idToConstant) { - super(readers, struct, idToConstant); + TypeDescription record, + List> readers, + Types.StructType struct, + Map idToConstant) { + super(record, readers, struct, idToConstant); this.numFields = 
struct.fields().size(); } diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestRewriteDataFiles.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestRewriteDataFiles.java index 97b8b6786545..88b949a9a7f8 100644 --- a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestRewriteDataFiles.java +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/api/TestRewriteDataFiles.java @@ -33,6 +33,7 @@ import java.util.List; import java.util.stream.StreamSupport; import org.apache.flink.streaming.api.graph.StreamGraphGenerator; +import org.apache.iceberg.FileFormat; import org.apache.iceberg.ManifestFiles; import org.apache.iceberg.MetadataColumns; import org.apache.iceberg.Schema; @@ -44,8 +45,14 @@ import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.FieldSource; class TestRewriteDataFiles extends MaintenanceTaskTestBase { + + private static final FileFormat[] FILE_FORMATS = + new FileFormat[] {FileFormat.AVRO, FileFormat.PARQUET, FileFormat.ORC}; + @Test void testRewriteUnpartitioned() throws Exception { Table table = createTable(); @@ -83,13 +90,14 @@ void testRewriteUnpartitioned() throws Exception { createRecord(4, "d"))); } - @Test - void testRewriteUnpartitionedPreserveLineage() throws Exception { - Table table = createTable(3); - insert(table, 1, "a"); - insert(table, 2, "b"); - insert(table, 3, "c"); - insert(table, 4, "d"); + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testRewriteUnpartitionedPreserveLineage(FileFormat fileFormat) throws Exception { + Table table = createTable(3, fileFormat); + insert(table, 1, "a", fileFormat); + insert(table, 2, "b", fileFormat); + insert(table, 3, "c", fileFormat); + insert(table, 4, 
"d", fileFormat); assertFileNum(table, 4, 0); @@ -123,15 +131,17 @@ void testRewriteUnpartitionedPreserveLineage() throws Exception { schema); } - @Test - void testRewriteTheSameFilePreserveLineage() throws Exception { - Table table = createTable(3); - insert(table, 1, "a"); - insert(table, 2, "b"); + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testRewriteTheSameFilePreserveLineage(FileFormat fileFormat) throws Exception { + Table table = createTable(3, fileFormat); + insert(table, 1, "a", fileFormat); + insert(table, 2, "b", fileFormat); // Create a file with two lines of data to verify that the rowid is read correctly. insert( table, - ImmutableList.of(SimpleDataUtil.createRecord(3, "c"), SimpleDataUtil.createRecord(4, "d"))); + ImmutableList.of(SimpleDataUtil.createRecord(3, "c"), SimpleDataUtil.createRecord(4, "d")), + fileFormat); assertFileNum(table, 3, 0); @@ -167,13 +177,14 @@ void testRewriteTheSameFilePreserveLineage() throws Exception { schema); } - @Test - void testRewritePartitionedPreserveLineage() throws Exception { - Table table = createPartitionedTable(3); - insertPartitioned(table, 1, "p1"); - insertPartitioned(table, 2, "p1"); - insertPartitioned(table, 3, "p2"); - insertPartitioned(table, 4, "p2"); + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testRewritePartitionedPreserveLineage(FileFormat fileFormat) throws Exception { + Table table = createPartitionedTable(3, fileFormat); + insertPartitioned(table, 1, "p1", fileFormat); + insertPartitioned(table, 2, "p1", fileFormat); + insertPartitioned(table, 3, "p2", fileFormat); + insertPartitioned(table, 4, "p2", fileFormat); assertFileNum(table, 4, 0); diff --git a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/OperatorTestBase.java b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/OperatorTestBase.java index 6dd6cda84f27..d6563e782e43 100644 --- 
a/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/OperatorTestBase.java +++ b/flink/v2.0/flink/src/test/java/org/apache/iceberg/flink/maintenance/operator/OperatorTestBase.java @@ -133,10 +133,14 @@ void after() throws IOException { } protected static Table createTable() { - return createTable(2); + return createTable(2, FileFormat.PARQUET); } protected static Table createTable(int formatVersion) { + return createTable(formatVersion, FileFormat.PARQUET); + } + + protected static Table createTable(int formatVersion, FileFormat fileFormat) { return CATALOG_EXTENSION .catalog() .createTable( @@ -145,6 +149,8 @@ protected static Table createTable(int formatVersion) { PartitionSpec.unpartitioned(), null, ImmutableMap.of( + "write.format.default", + fileFormat.name(), TableProperties.FORMAT_VERSION, String.valueOf(formatVersion), "flink.max-continuous-empty-commits", @@ -182,7 +188,7 @@ protected static Table createTableWithDelete(int formatVersion) { "format-version", String.valueOf(formatVersion), "write.upsert.enabled", "true")); } - protected static Table createPartitionedTable(int formatVersion) { + protected static Table createPartitionedTable(int formatVersion, FileFormat fileFormat) { return CATALOG_EXTENSION .catalog() .createTable( @@ -191,6 +197,8 @@ protected static Table createPartitionedTable(int formatVersion) { PartitionSpec.builderFor(SimpleDataUtil.SCHEMA).identity("data").build(), null, ImmutableMap.of( + "write.format.default", + fileFormat.name(), "format-version", String.valueOf(formatVersion), "flink.max-continuous-empty-commits", @@ -198,17 +206,27 @@ protected static Table createPartitionedTable(int formatVersion) { } protected static Table createPartitionedTable() { - return createPartitionedTable(2); + return createPartitionedTable(2, FileFormat.PARQUET); } protected void insert(Table table, Integer id, String data) throws IOException { - new GenericAppenderHelper(table, FileFormat.PARQUET, warehouseDir)
+ insert(table, id, data, FileFormat.PARQUET); + } + + protected void insert(Table table, Integer id, String data, FileFormat fileFormat) + throws IOException { + new GenericAppenderHelper(table, fileFormat, warehouseDir) .appendToTable(Lists.newArrayList(SimpleDataUtil.createRecord(id, data))); table.refresh(); } protected void insert(Table table, List records) throws IOException { - new GenericAppenderHelper(table, FileFormat.PARQUET, warehouseDir).appendToTable(records); + insert(table, records, FileFormat.PARQUET); + } + + protected void insert(Table table, List records, FileFormat fileFormat) + throws IOException { + new GenericAppenderHelper(table, fileFormat, warehouseDir).appendToTable(records); table.refresh(); } @@ -309,7 +327,12 @@ protected void update(Table table, Integer id, String oldData, String tempData, } protected void insertPartitioned(Table table, Integer id, String data) throws IOException { - new GenericAppenderHelper(table, FileFormat.PARQUET, warehouseDir) + insertPartitioned(table, id, data, FileFormat.PARQUET); + } + + protected void insertPartitioned(Table table, Integer id, String data, FileFormat fileFormat) + throws IOException { + new GenericAppenderHelper(table, fileFormat, warehouseDir) .appendToTable( TestHelpers.Row.of(data), Lists.newArrayList(SimpleDataUtil.createRecord(id, data))); table.refresh(); From e7a5a87f26f9de5b200254155aa037368b13a29c Mon Sep 17 00:00:00 2001 From: Yuya Ebihara Date: Sat, 9 May 2026 03:39:59 +0900 Subject: [PATCH 186/197] Azure: Avoid depending on KeyWrapAlgorithm in AzureProperties (#16186) * Azure: Avoid depending on KeyWrapAlgorithm in AzureProperties * fixup! 
Azure: Avoid depending on KeyWrapAlgorithm in AzureProperties --- .../org/apache/iceberg/azure/AzureProperties.java | 11 ++++++----- .../azure/keymanagement/AzureKeyManagementClient.java | 3 ++- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/azure/src/main/java/org/apache/iceberg/azure/AzureProperties.java b/azure/src/main/java/org/apache/iceberg/azure/AzureProperties.java index 73e99e029221..383bec30111b 100644 --- a/azure/src/main/java/org/apache/iceberg/azure/AzureProperties.java +++ b/azure/src/main/java/org/apache/iceberg/azure/AzureProperties.java @@ -21,7 +21,6 @@ import com.azure.core.credential.AccessToken; import com.azure.core.credential.TokenCredential; import com.azure.core.credential.TokenRequestContext; -import com.azure.security.keyvault.keys.cryptography.models.KeyWrapAlgorithm; import com.azure.storage.common.StorageSharedKeyCredential; import com.azure.storage.file.datalake.DataLakeFileSystemClientBuilder; import java.io.Serializable; @@ -53,6 +52,9 @@ public class AzureProperties implements Serializable { public static final String AZURE_KEYVAULT_KEY_WRAP_ALGORITHM = "azure.keyvault.key-wrap-algorithm"; + // Must match KeyWrapAlgorithm.RSA_OAEP_256.getValue() from azure-security-keyvault-keys + private static final String DEFAULT_KEY_WRAP_ALGORITHM = "RSA-OAEP-256"; + /** * Configure the ADLS token credential provider used to get {@link TokenCredential}. 
A fully * qualified concrete class with package that implements the {@link AdlsTokenCredentialProvider} @@ -136,8 +138,7 @@ public AzureProperties(Map properties) { this.keyWrapAlgorithm = properties.getOrDefault( - AzureProperties.AZURE_KEYVAULT_KEY_WRAP_ALGORITHM, - KeyWrapAlgorithm.RSA_OAEP_256.getValue()); + AzureProperties.AZURE_KEYVAULT_KEY_WRAP_ALGORITHM, DEFAULT_KEY_WRAP_ALGORITHM); } public Optional adlsReadBlockSize() { @@ -204,8 +205,8 @@ public Mono getToken(TokenRequestContext request) { } } - public KeyWrapAlgorithm keyWrapAlgorithm() { - return KeyWrapAlgorithm.fromString(this.keyWrapAlgorithm); + public String keyWrapAlgorithm() { + return this.keyWrapAlgorithm; } public Optional keyVaultUrl() { diff --git a/azure/src/main/java/org/apache/iceberg/azure/keymanagement/AzureKeyManagementClient.java b/azure/src/main/java/org/apache/iceberg/azure/keymanagement/AzureKeyManagementClient.java index 66bf0678bce9..498c432212c5 100644 --- a/azure/src/main/java/org/apache/iceberg/azure/keymanagement/AzureKeyManagementClient.java +++ b/azure/src/main/java/org/apache/iceberg/azure/keymanagement/AzureKeyManagementClient.java @@ -80,7 +80,8 @@ private ClientState state() { keyClientBuilder .credential(AdlsTokenCredentialProviders.from(allProperties).credential()) .buildClient(); - KeyWrapAlgorithm keyWrapAlgorithm = azureProperties.keyWrapAlgorithm(); + KeyWrapAlgorithm keyWrapAlgorithm = + KeyWrapAlgorithm.fromString(azureProperties.keyWrapAlgorithm()); state = new ClientState(keyClient, keyWrapAlgorithm); } } From fce25041755c53e85ecaab982556b18d7001818c Mon Sep 17 00:00:00 2001 From: Manu Zhang Date: Sun, 10 May 2026 01:15:11 +0800 Subject: [PATCH 187/197] CI: Add PR title check workflow (#16101) --- .github/workflows/pr-title-check.yml | 44 ++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 .github/workflows/pr-title-check.yml diff --git a/.github/workflows/pr-title-check.yml b/.github/workflows/pr-title-check.yml new file mode 
100644 index 000000000000..48c4b652ce2b --- /dev/null +++ b/.github/workflows/pr-title-check.yml @@ -0,0 +1,44 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +name: PR Title Check + +on: + pull_request: + types: [opened, edited, reopened] + +concurrency: + group: pr-title-${{ github.event.pull_request.number }} + cancel-in-progress: true + +permissions: {} + +jobs: + check-pr-title: + runs-on: ubuntu-slim + steps: + - name: Check PR Title + env: + PR_TITLE: ${{ github.event.pull_request.title }} + run: | + PATTERN='^[A-Za-z][A-Za-z0-9._+/&-]*: .+' + if ! echo "$PR_TITLE" | grep -Eq "$PATTERN"; then + echo "::error::PR title must follow 'Module: Description' format. 
Got: '$PR_TITLE'" + echo "Examples: 'Core: Fix ...', 'Spark: Add ...', 'API: Remove ...', 'Docs: Update ...'" + exit 1 + fi + + echo "PR title is valid: '$PR_TITLE'" From 1a8fa1e56bf81ed1780817bd63cdccfe1a71281f Mon Sep 17 00:00:00 2001 From: Rexwell Minnis Date: Sat, 9 May 2026 13:19:34 -0400 Subject: [PATCH 188/197] Docs: Document CATALOG_* env vars in iceberg-rest-fixture README (#16007) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The REST fixture supports configuration via CATALOG_* environment variables through the standard prefix translation (CATALOG_ stripped, single _ → ., double __ → -, lowercased). Without docs, users discover this only by reading source. This adds a Configuration section that: - Spells out the CATALOG_* convention with a small mapping table - Shows the working form to override the catalog name (CATALOG_CATALOG_NAME=mycatalog) - Notes the in-memory SQLite default when catalog-impl + uri are unset Docs-only — no code change. Refs #14972 (closed). --- docker/iceberg-rest-fixture/README.md | 33 +++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/docker/iceberg-rest-fixture/README.md b/docker/iceberg-rest-fixture/README.md index 3805cc2468cb..5e02a2b4712a 100644 --- a/docker/iceberg-rest-fixture/README.md +++ b/docker/iceberg-rest-fixture/README.md @@ -23,6 +23,39 @@ For converting different catalog implementations into a rest one. Adapter for wrapping the existing catalog backends over REST. +## Configuration + +All configuration is provided via environment variables. + +### Backend catalog properties + +Catalog properties can be set via `CATALOG_*` environment variables. The +`CATALOG_` prefix is stripped; single underscores become dots (`.`); double +underscores become dashes (`-`). Names are lowercased. 
+ +| Env var | Catalog property | +|---|---| +| `CATALOG_CATALOG_NAME` | `catalog.name` | +| `CATALOG_WAREHOUSE` | `warehouse` | +| `CATALOG_URI` | `uri` | +| `CATALOG_CATALOG__IMPL` | `catalog-impl` | +| `CATALOG_IO__IMPL` | `io-impl` | +| `CATALOG_JDBC_USER` | `jdbc.user` | + +If `catalog-impl` and `uri` are unset, the fixture defaults to an in-memory +SQLite `JdbcCatalog`. + +### Catalog name + +By default, the fixture serves a catalog named `rest_backend`. To match a +name expected by a specific engine (for example, a catalog created via Trino +or PyIceberg), override the `catalog.name` property: + +```bash +docker run -e CATALOG_CATALOG_NAME=mycatalog -p 8181:8181 apache/iceberg-rest-fixture +``` + + ## Build the Docker Image When making changes to the local files and test them out, you can build the image locally: From c1477969731e9c65b0ee3a57176f3baa9ff5c61f Mon Sep 17 00:00:00 2001 From: Alex Miller Date: Sat, 9 May 2026 10:25:03 -0700 Subject: [PATCH 189/197] Docs: Update Oracle vendor description (#16261) --- site/docs/vendors.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/site/docs/vendors.md b/site/docs/vendors.md index c1ca0e7eb23d..8ea8021a4d13 100644 --- a/site/docs/vendors.md +++ b/site/docs/vendors.md @@ -143,7 +143,7 @@ IOMETE is a fully-managed ready to use, batteries included Data Platform. IOMETE ### [Oracle](https://oracle.com/) -Oracle [Autonomous AI Lakehouse](https://www.oracle.com/autonomous-database/autonomous-ai-lakehouse/) combines the openness of Apache Iceberg with the performance, automation, and security of Oracle Autonomous Database and Exadata. Available across Oracle Cloud Infrastructure (OCI), Microsoft Azure, Google Cloud, and AWS, Oracle provides a multicloud, open lakehouse architecture with high-performance access to Iceberg tables through integration with existing catalogs and support for the Apache Iceberg REST Catalog specification. 
Oracle enables interoperability across engines such as Apache Spark, Trino, and Apache Flink while minimizing data movement and preserving vendor independence. Built-in AI, vector search, graph analytics, and JSON-relational capabilities allow organizations to run advanced analytics and AI workloads directly on Iceberg data with enterprise-grade governance, availability, and serverless scalability. +As a fully-managed Oracle AI Database service, Oracle [Autonomous AI Lakehouse](https://www.oracle.com/autonomous-database/autonomous-ai-lakehouse/) combines the openness of Apache Iceberg with the performance, automation, and security of Oracle Autonomous Database and Oracle Exadata. Available across Oracle Cloud Infrastructure (OCI), Microsoft Azure, Google Cloud, AWS, and on-premises, Oracle AI Database provides a multicloud and hybrid open lakehouse architecture with high-performance access to Iceberg tables through integration with existing catalogs and support for the Apache Iceberg REST Catalog specification. Oracle enables interoperability across engines such as Apache Spark, Trino, and Apache Flink while minimizing data movement and preserving vendor independence. Built-in AI, vector search, graph analytics, and JSON-relational capabilities allow organizations to run advanced analytics and AI workloads directly on Iceberg data with enterprise-grade governance, availability, and serverless scalability. ### [PuppyGraph](https://puppygraph.com) From 1edde6c93652d71f997b970ac925ba0723db40fc Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 9 May 2026 23:56:16 -0700 Subject: [PATCH 190/197] Build: Bump jackson-bom from 2.21.2 to 2.21.3 (#16269) Bumps `jackson-bom` from 2.21.2 to 2.21.3. 
Updates `com.fasterxml.jackson:jackson-bom` from 2.21.2 to 2.21.3 - [Commits](https://github.com/FasterXML/jackson-bom/compare/jackson-bom-2.21.2...jackson-bom-2.21.3) Updates `com.fasterxml.jackson.core:jackson-core` from 2.21.2 to 2.21.3 - [Commits](https://github.com/FasterXML/jackson-core/compare/jackson-core-2.21.2...jackson-core-2.21.3) Updates `com.fasterxml.jackson.core:jackson-databind` from 2.21.2 to 2.21.3 - [Commits](https://github.com/FasterXML/jackson/commits) --- updated-dependencies: - dependency-name: com.fasterxml.jackson:jackson-bom dependency-version: 2.21.3 dependency-type: direct:production update-type: version-update:semver-patch - dependency-name: com.fasterxml.jackson.core:jackson-core dependency-version: 2.21.3 dependency-type: direct:production update-type: version-update:semver-patch - dependency-name: com.fasterxml.jackson.core:jackson-databind dependency-version: 2.21.3 dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- gradle/libs.versions.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index c43f805fd1d7..c384a78a2a47 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -59,7 +59,7 @@ httpcomponents-httpclient5 = "5.6.1" hive2 = { strictly = "2.3.10"} # see rich version usage explanation above immutables-value = "2.12.1" jackson-annotations = "2.21" -jackson-bom = "2.21.2" +jackson-bom = "2.21.3" jackson214 = { strictly = "2.14.2"} jackson215 = { strictly = "2.15.2"} # see rich version usage explanation above jakarta-el-api = "3.0.3" From d9b6f00a98dbe9318c390951c64d70ba076c0f96 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 9 May 2026 23:56:37 -0700 Subject: [PATCH 191/197] Build: Bump joda-time:joda-time from 2.5 to 2.14.2 
(#16270) Bumps [joda-time:joda-time](https://github.com/JodaOrg/joda-time) from 2.5 to 2.14.2. - [Release notes](https://github.com/JodaOrg/joda-time/releases) - [Changelog](https://github.com/JodaOrg/joda-time/blob/main/RELEASE-NOTES.txt) - [Commits](https://github.com/JodaOrg/joda-time/compare/v2.5...v2.14.2) --- updated-dependencies: - dependency-name: joda-time:joda-time dependency-version: 2.14.2 dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- gradle/libs.versions.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index c384a78a2a47..97026f4fa8ac 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -67,7 +67,7 @@ jakarta-servlet-api = "6.1.0" jaxb-api = "2.3.1" jaxb-runtime = "2.3.9" jetty = "12.1.8" -joda = "2.5" +joda = "2.14.2" junit = "5.14.3" junit-platform = "1.14.3" junit-pioneer = "2.3.0" From 70ed5d9566709450c60088bf61b4672f33d71278 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 9 May 2026 23:57:51 -0700 Subject: [PATCH 192/197] Build: Bump junit-platform from 1.14.3 to 1.14.4 (#16272) Bumps `junit-platform` from 1.14.3 to 1.14.4. 
Updates `org.junit.platform:junit-platform-launcher` from 1.14.3 to 1.14.4 - [Release notes](https://github.com/junit-team/junit-framework/releases) - [Commits](https://github.com/junit-team/junit-framework/commits) Updates `org.junit.platform:junit-platform-suite-api` from 1.14.3 to 1.14.4 - [Release notes](https://github.com/junit-team/junit-framework/releases) - [Commits](https://github.com/junit-team/junit-framework/commits) Updates `org.junit.platform:junit-platform-suite-engine` from 1.14.3 to 1.14.4 - [Release notes](https://github.com/junit-team/junit-framework/releases) - [Commits](https://github.com/junit-team/junit-framework/commits) --- updated-dependencies: - dependency-name: org.junit.platform:junit-platform-launcher dependency-version: 1.14.4 dependency-type: direct:production update-type: version-update:semver-patch - dependency-name: org.junit.platform:junit-platform-suite-api dependency-version: 1.14.4 dependency-type: direct:production update-type: version-update:semver-patch - dependency-name: org.junit.platform:junit-platform-suite-engine dependency-version: 1.14.4 dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- gradle/libs.versions.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index 97026f4fa8ac..2cdf18c98114 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -69,7 +69,7 @@ jaxb-runtime = "2.3.9" jetty = "12.1.8" joda = "2.14.2" junit = "5.14.3" -junit-platform = "1.14.3" +junit-platform = "1.14.4" junit-pioneer = "2.3.0" kafka = "3.9.2" kryo-shaded = "4.0.3" From 34511542e6764fdbb23e2a60caf34a898b8ad76f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 9 May 2026 23:58:25 -0700 Subject: [PATCH 193/197] Build: Bump github/codeql-action from 4.35.2 to 4.35.3 (#16275) Bumps [github/codeql-action](https://github.com/github/codeql-action) from 4.35.2 to 4.35.3. - [Release notes](https://github.com/github/codeql-action/releases) - [Changelog](https://github.com/github/codeql-action/blob/main/CHANGELOG.md) - [Commits](https://github.com/github/codeql-action/compare/95e58e9a2cdfd71adc6e0353d5c52f41a045d225...e46ed2cbd01164d986452f91f178727624ae40d7) --- updated-dependencies: - dependency-name: github/codeql-action dependency-version: 4.35.3 dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/codeql.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 49212916a3f2..98685f3ced21 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -46,11 +46,11 @@ jobs: persist-credentials: false - name: Initialize CodeQL - uses: github/codeql-action/init@95e58e9a2cdfd71adc6e0353d5c52f41a045d225 # v4.35.2 + uses: github/codeql-action/init@e46ed2cbd01164d986452f91f178727624ae40d7 # v4.35.3 with: languages: actions - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@95e58e9a2cdfd71adc6e0353d5c52f41a045d225 # v4.35.2 + uses: github/codeql-action/analyze@e46ed2cbd01164d986452f91f178727624ae40d7 # v4.35.3 with: category: "/language:actions" From a6a0b8131691b089dd5ff9909d7f871bc8f3a996 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 10 May 2026 11:20:09 -0700 Subject: [PATCH 194/197] Build: Bump junit from 5.14.3 to 5.14.4 (#16271) Bumps `junit` from 5.14.3 to 5.14.4. Updates `org.junit.jupiter:junit-jupiter` from 5.14.3 to 5.14.4 - [Release notes](https://github.com/junit-team/junit-framework/releases) - [Commits](https://github.com/junit-team/junit-framework/compare/r5.14.3...r5.14.4) Updates `org.junit.jupiter:junit-jupiter-engine` from 5.14.3 to 5.14.4 - [Release notes](https://github.com/junit-team/junit-framework/releases) - [Commits](https://github.com/junit-team/junit-framework/compare/r5.14.3...r5.14.4) --- updated-dependencies: - dependency-name: org.junit.jupiter:junit-jupiter dependency-version: 5.14.4 dependency-type: direct:production update-type: version-update:semver-patch - dependency-name: org.junit.jupiter:junit-jupiter-engine dependency-version: 5.14.4 dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- gradle/libs.versions.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index 2cdf18c98114..41b620164a80 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -68,7 +68,7 @@ jaxb-api = "2.3.1" jaxb-runtime = "2.3.9" jetty = "12.1.8" joda = "2.14.2" -junit = "5.14.3" +junit = "5.14.4" junit-platform = "1.14.4" junit-pioneer = "2.3.0" kafka = "3.9.2" From 68bab74c771e560fe775cca62a90c1e74a2260b1 Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Sun, 10 May 2026 20:27:50 -0700 Subject: [PATCH 195/197] Build: Bump io.grpc:grpc-netty-shaded from 1.80.0 to 1.81.0 (#16277) Co-authored-by: Cursor --- kafka-connect/build.gradle | 2 +- kafka-connect/kafka-connect-runtime/runtime-deps.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kafka-connect/build.gradle b/kafka-connect/build.gradle index 009ae719bac5..43eb245d93a3 100644 --- a/kafka-connect/build.gradle +++ b/kafka-connect/build.gradle @@ -81,7 +81,7 @@ project(':iceberg-kafka-connect:iceberg-kafka-connect-runtime') { force 'org.apache.hadoop.thirdparty:hadoop-shaded-guava:1.5.0' force 'com.fasterxml.woodstox:woodstox-core:6.7.0' force 'commons-beanutils:commons-beanutils:1.11.0' - force 'io.grpc:grpc-netty-shaded:1.80.0' + force 'io.grpc:grpc-netty-shaded:1.81.0' } } } diff --git a/kafka-connect/kafka-connect-runtime/runtime-deps.txt b/kafka-connect/kafka-connect-runtime/runtime-deps.txt index 56a880cb6494..3e9819af10ce 100644 --- a/kafka-connect/kafka-connect-runtime/runtime-deps.txt +++ b/kafka-connect/kafka-connect-runtime/runtime-deps.txt @@ -90,7 +90,7 @@ io.grpc:grpc-core:1.80.0 io.grpc:grpc-googleapis:1.80.0 io.grpc:grpc-grpclb:1.80.0 io.grpc:grpc-inprocess:1.80.0 -io.grpc:grpc-netty-shaded:1.80.0 +io.grpc:grpc-netty-shaded:1.81.0 io.grpc:grpc-opentelemetry:1.80.0 
io.grpc:grpc-protobuf-lite:1.80.0 io.grpc:grpc-protobuf:1.80.0 From 6364aaae20c50b8bfeb550ec89b3b14feedd51b0 Mon Sep 17 00:00:00 2001 From: GuoYu <511955993@qq.com> Date: Mon, 11 May 2026 18:36:36 +0800 Subject: [PATCH 196/197] Data: Add TCK tests for Schema Evolution in BaseFormatModelTests (#15843) --- build.gradle | 9 + .../apache/iceberg/avro/AvroTestHelpers.java | 8 + .../iceberg/data/BaseFormatModelTests.java | 450 ++++++++++++++++++ flink/v1.20/build.gradle | 2 + flink/v2.0/build.gradle | 2 + flink/v2.1/build.gradle | 2 + .../iceberg/orc/OrcWritingTestUtils.java | 35 ++ .../apache/iceberg/orc/TestORCSchemaUtil.java | 8 + .../parquet/ParquetWritingTestUtils.java | 2 +- .../iceberg/parquet/ParquetFileTestUtils.java | 36 ++ spark/v3.4/build.gradle | 2 + spark/v3.5/build.gradle | 1 + spark/v4.0/build.gradle | 1 + spark/v4.1/build.gradle | 1 + 14 files changed, 558 insertions(+), 1 deletion(-) create mode 100644 orc/src/test/java/org/apache/iceberg/orc/OrcWritingTestUtils.java create mode 100644 parquet/src/testFixtures/java/org/apache/iceberg/parquet/ParquetFileTestUtils.java diff --git a/build.gradle b/build.gradle index 261dfabf0412..fca32be9dc66 100644 --- a/build.gradle +++ b/build.gradle @@ -457,6 +457,8 @@ project(':iceberg-data') { testImplementation project(path: ':iceberg-api', configuration: 'testArtifacts') testImplementation project(path: ':iceberg-core', configuration: 'testArtifacts') + testImplementation project(path: ':iceberg-orc', configuration: 'testArtifacts') + testImplementation(testFixtures(project(':iceberg-parquet'))) } test { @@ -939,6 +941,13 @@ project(':iceberg-parquet') { exclude group: 'org.apache.avro', module: 'avro' } + testFixturesApi(libs.parquet.hadoop) { + exclude group: 'org.apache.avro', module: 'avro' + // already shaded by Parquet + exclude group: 'it.unimi.dsi' + exclude group: 'org.codehaus.jackson' + } + testImplementation project(path: ':iceberg-api', configuration: 'testArtifacts') testImplementation project(path: 
':iceberg-core', configuration: 'testArtifacts') } diff --git a/core/src/test/java/org/apache/iceberg/avro/AvroTestHelpers.java b/core/src/test/java/org/apache/iceberg/avro/AvroTestHelpers.java index 0a1cf43f4fb5..fd73706ce082 100644 --- a/core/src/test/java/org/apache/iceberg/avro/AvroTestHelpers.java +++ b/core/src/test/java/org/apache/iceberg/avro/AvroTestHelpers.java @@ -177,4 +177,12 @@ public static String readAvroCodec(File file) throws IOException { return reader.getMetaString("avro.codec"); } } + + public static boolean hasIds(Schema schema) { + return AvroSchemaUtil.hasIds(schema); + } + + public static Schema removeIds(org.apache.iceberg.Schema schema) { + return RemoveIds.removeIds(schema); + } } diff --git a/data/src/test/java/org/apache/iceberg/data/BaseFormatModelTests.java b/data/src/test/java/org/apache/iceberg/data/BaseFormatModelTests.java index d0b8e3161bdf..a38b025e0f05 100644 --- a/data/src/test/java/org/apache/iceberg/data/BaseFormatModelTests.java +++ b/data/src/test/java/org/apache/iceberg/data/BaseFormatModelTests.java @@ -28,6 +28,7 @@ import java.io.File; import java.io.IOException; +import java.io.OutputStream; import java.nio.ByteBuffer; import java.util.Arrays; import java.util.Comparator; @@ -38,6 +39,14 @@ import java.util.function.BiFunction; import java.util.function.Function; import java.util.stream.IntStream; +import org.apache.avro.file.DataFileStream; +import org.apache.avro.file.DataFileWriter; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericDatumReader; +import org.apache.avro.generic.GenericDatumWriter; +import org.apache.avro.io.DatumWriter; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; import org.apache.iceberg.ContentFile; import org.apache.iceberg.DataFile; import org.apache.iceberg.DeleteFile; @@ -52,6 +61,8 @@ import org.apache.iceberg.StructLike; import org.apache.iceberg.TableProperties; import org.apache.iceberg.TestTables; +import 
org.apache.iceberg.avro.AvroTestHelpers; +import org.apache.iceberg.data.orc.GenericOrcWriter; import org.apache.iceberg.deletes.EqualityDeleteWriter; import org.apache.iceberg.deletes.PositionDelete; import org.apache.iceberg.deletes.PositionDeleteWriter; @@ -70,6 +81,15 @@ import org.apache.iceberg.io.DataWriter; import org.apache.iceberg.io.DeleteSchemaUtil; import org.apache.iceberg.io.InputFile; +import org.apache.iceberg.io.OutputFile; +import org.apache.iceberg.mapping.MappingUtil; +import org.apache.iceberg.mapping.NameMapping; +import org.apache.iceberg.orc.ORCSchemaUtil; +import org.apache.iceberg.orc.OrcRowWriter; +import org.apache.iceberg.orc.OrcWritingTestUtils; +import org.apache.iceberg.orc.TestORCSchemaUtil; +import org.apache.iceberg.parquet.ParquetFileTestUtils; +import org.apache.iceberg.parquet.ParquetSchemaUtil; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.iceberg.relocated.com.google.common.collect.Lists; @@ -77,6 +97,14 @@ import org.apache.iceberg.types.Conversions; import org.apache.iceberg.types.Type; import org.apache.iceberg.types.Types; +import org.apache.orc.OrcFile; +import org.apache.orc.Reader; +import org.apache.orc.TypeDescription; +import org.apache.orc.Writer; +import org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch; +import org.apache.parquet.avro.AvroParquetWriter; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.ParquetWriter; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.io.TempDir; @@ -1330,6 +1358,280 @@ private void readAndAssertGenericRecords( readAndAssertGenericRecords(fileFormat, schema, sourceRecords.stream().map(transform).toList()); } + /** + * Schema evolution: Adding column (reading with wider schema). Write with DefaultSchema, read + * with additional optional columns. 
The new columns should be filled with null values. + */ + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testSchemaEvolutionAddColumn(FileFormat fileFormat) throws IOException { + DataGenerator dataGenerator = new DataGenerators.DefaultSchema(); + Schema writeSchema = dataGenerator.schema(); + + List genericRecords = dataGenerator.generateRecords(); + writeGenericRecords(fileFormat, writeSchema, genericRecords); + + List evolvedColumns = Lists.newArrayList(writeSchema.columns()); + + int maxFieldId = + writeSchema.columns().stream().mapToInt(Types.NestedField::fieldId).max().orElse(0); + evolvedColumns.add( + Types.NestedField.optional("new_string_col") + .withId(maxFieldId + 1) + .ofType(Types.StringType.get()) + .build()); + evolvedColumns.add( + Types.NestedField.optional("new_int_col") + .withId(maxFieldId + 2) + .ofType(Types.IntegerType.get()) + .build()); + Schema readSchema = new Schema(evolvedColumns); + readAndAssertEngineRecords( + fileFormat, + readSchema, + genericRecords, + record -> { + Record expected = copy(record, writeSchema, readSchema); + + expected.setField("new_string_col", null); + expected.setField("new_int_col", null); + return expected; + }); + } + + /** + * Schema evolution: Projection / Removing column (reading with narrower schema). Write with + * DefaultSchema, read with only a subset of columns (skipping middle columns). 
+ */ + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testSchemaEvolutionProjection(FileFormat fileFormat) throws IOException { + DataGenerator dataGenerator = new DataGenerators.DefaultSchema(); + Schema writeSchema = dataGenerator.schema(); + + List genericRecords = dataGenerator.generateRecords(); + writeGenericRecords(fileFormat, writeSchema, genericRecords); + + List writeColumns = writeSchema.columns(); + assumeThat(writeColumns).hasSizeGreaterThanOrEqualTo(2); + Schema projectedSchema = + new Schema(writeColumns.get(0), writeColumns.get(writeColumns.size() - 1)); + + readAndAssertEngineRecords( + fileFormat, + projectedSchema, + genericRecords, + record -> copy(record, projectedSchema, projectedSchema)); + } + + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testSchemaEvolutionDropAndReAddSameNameColumn(FileFormat fileFormat) throws IOException { + + DataGenerator dataGenerator = new DataGenerators.DefaultSchema(); + Schema writeSchema = dataGenerator.schema(); + + List genericRecords = dataGenerator.generateRecords(); + writeGenericRecords(fileFormat, writeSchema, genericRecords); + + // Remove col_b and add a new col_b with a different field ID + Schema readSchema = + new Schema( + Types.NestedField.required(1, "col_a", Types.StringType.get()), + Types.NestedField.optional(6, "col_b", Types.IntegerType.get()), + Types.NestedField.required(3, "col_c", Types.LongType.get()), + Types.NestedField.required(4, "col_d", Types.FloatType.get()), + Types.NestedField.required(5, "col_e", Types.DoubleType.get())); + + readAndAssertEngineRecords( + fileFormat, + readSchema, + genericRecords, + record -> { + Record expected = GenericRecord.create(readSchema); + expected.setField("col_a", record.getField("col_a")); + expected.setField("col_b", null); + expected.setField("col_c", record.getField("col_c")); + expected.setField("col_d", record.getField("col_d")); + expected.setField("col_e", record.getField("col_e")); + return expected; + }); + } + + 
@ParameterizedTest + @FieldSource("FILE_FORMATS") + void testSchemaEvolutionTypePromotionIntToLong(FileFormat fileFormat) throws IOException { + runTypePromotionCheck( + fileFormat, + Types.IntegerType.get(), + Types.LongType.get(), + value -> value == null ? null : ((Integer) value).longValue()); + } + + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testSchemaEvolutionTypePromotionFloatToDouble(FileFormat fileFormat) throws IOException { + runTypePromotionCheck( + fileFormat, + Types.FloatType.get(), + Types.DoubleType.get(), + value -> value == null ? null : ((Float) value).doubleValue()); + } + + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testSchemaEvolutionTypePromotionDecimalPrecision(FileFormat fileFormat) throws IOException { + runTypePromotionCheck( + fileFormat, Types.DecimalType.of(9, 2), Types.DecimalType.of(18, 2), Function.identity()); + } + + /** + * Schema evolution: Reorder columns. Write with DefaultSchema {col_a, col_b, col_c, col_d, + * col_e}, read with reordered schema {col_e, col_c, col_a, col_d, col_b}. 
+ */ + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testSchemaEvolutionReorderColumns(FileFormat fileFormat) throws IOException { + DataGenerator dataGenerator = new DataGenerators.DefaultSchema(); + Schema writeSchema = dataGenerator.schema(); + + List genericRecords = dataGenerator.generateRecords(); + writeGenericRecords(fileFormat, writeSchema, genericRecords); + + Schema reorderedSchema = + new Schema( + Types.NestedField.required(5, "col_e", Types.DoubleType.get()), + Types.NestedField.required(3, "col_c", Types.LongType.get()), + Types.NestedField.required(1, "col_a", Types.StringType.get()), + Types.NestedField.required(4, "col_d", Types.FloatType.get()), + Types.NestedField.required(2, "col_b", Types.IntegerType.get())); + + readAndAssertEngineRecords( + fileFormat, + reorderedSchema, + genericRecords, + record -> copy(record, reorderedSchema, reorderedSchema)); + } + + /** + * Schema evolution: Rename column. Write with DefaultSchema where col_b has field ID 2. Read with + * a schema where the same field ID 2 is renamed to "column_b". Since Iceberg binds by field ID, + * the renamed column should still read the original data correctly. 
+ */ + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testSchemaEvolutionRenameColumn(FileFormat fileFormat) throws IOException { + DataGenerator dataGenerator = new DataGenerators.DefaultSchema(); + Schema writeSchema = dataGenerator.schema(); + + List genericRecords = dataGenerator.generateRecords(); + writeGenericRecords(fileFormat, writeSchema, genericRecords); + + // rename col_b(id=2) -> column_b, col_d(id=4) -> column_d + Schema renamedSchema = + new Schema( + Types.NestedField.required(1, "col_a", Types.StringType.get()), + Types.NestedField.required(2, "column_b", Types.IntegerType.get()), + Types.NestedField.required(3, "col_c", Types.LongType.get()), + Types.NestedField.required(4, "column_d", Types.FloatType.get()), + Types.NestedField.required(5, "col_e", Types.DoubleType.get())); + + readAndAssertEngineRecords( + fileFormat, + renamedSchema, + genericRecords, + record -> { + Record expected = GenericRecord.create(renamedSchema); + expected.setField("col_a", record.getField("col_a")); + expected.setField("column_b", record.getField("col_b")); + expected.setField("col_c", record.getField("col_c")); + expected.setField("column_d", record.getField("col_d")); + expected.setField("col_e", record.getField("col_e")); + return expected; + }); + } + + /** + * Schema evolution: Required → Optional. Write with DefaultSchema where all columns are required. + * Read with a schema where some columns are changed to optional. Iceberg allows widening required + * to optional. The data should still be read correctly. 
+ */ + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testSchemaEvolutionRequiredToOptional(FileFormat fileFormat) throws IOException { + DataGenerator dataGenerator = new DataGenerators.DefaultSchema(); + Schema writeSchema = dataGenerator.schema(); + + List genericRecords = dataGenerator.generateRecords(); + writeGenericRecords(fileFormat, writeSchema, genericRecords); + + // change col_b and col_d to optional + Schema readSchema = + new Schema( + Types.NestedField.required(1, "col_a", Types.StringType.get()), + Types.NestedField.optional(2, "col_b", Types.IntegerType.get()), + Types.NestedField.required(3, "col_c", Types.LongType.get()), + Types.NestedField.optional(4, "col_d", Types.FloatType.get()), + Types.NestedField.required(5, "col_e", Types.DoubleType.get())); + + readAndAssertEngineRecords( + fileFormat, readSchema, genericRecords, record -> copy(record, readSchema, readSchema)); + } + + /** + * Schema evolution: Read with empty projection. Write with DefaultSchema, read with an empty + * schema (no columns). The reader should return the correct number of rows but with no data + * columns. 
+ */ + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testSchemaEvolutionEmptyProjection(FileFormat fileFormat) throws IOException { + DataGenerator dataGenerator = new DataGenerators.DefaultSchema(); + Schema writeSchema = dataGenerator.schema(); + + List genericRecords = dataGenerator.generateRecords(); + writeGenericRecords(fileFormat, writeSchema, genericRecords); + + Schema emptySchema = new Schema(); + + InputFile inputFile = encryptedFile.encryptingOutputFile().toInputFile(); + List readRecords; + try (CloseableIterable reader = + FormatModelRegistry.readBuilder(fileFormat, engineType(), inputFile) + .project(emptySchema) + .build()) { + readRecords = ImmutableList.copyOf(reader); + } + + assertThat(readRecords).hasSameSizeAs(genericRecords); + } + + @ParameterizedTest + @FieldSource("FILE_FORMATS") + void testReadFileWithoutFieldIdsUsingNameMapping(FileFormat fileFormat) throws IOException { + DataGenerator dataGenerator = new DataGenerators.DefaultSchema(); + Schema icebergSchema = dataGenerator.schema(); + + List genericRecords = dataGenerator.generateRecords(); + + // Write the file WITHOUT Iceberg field IDs (as an external writer would). 
+ writeRecordsWithoutFieldIds(fileFormat, icebergSchema, genericRecords); + + NameMapping nameMapping = MappingUtil.create(icebergSchema); + + InputFile inputFile = encryptedFile.encryptingOutputFile().toInputFile(); + List readRecords; + try (CloseableIterable reader = + FormatModelRegistry.readBuilder(fileFormat, engineType(), inputFile) + .project(icebergSchema) + .withNameMapping(nameMapping) + .build()) { + readRecords = ImmutableList.copyOf(reader); + } + + assertEquals(icebergSchema, convertToEngineRecords(genericRecords, icebergSchema), readRecords); + } + private void readAndAssertGenericRecords( FileFormat fileFormat, Schema schema, List expected) throws IOException { InputFile inputFile = encryptedFile.encryptingOutputFile().toInputFile(); @@ -1777,4 +2079,152 @@ private static Record copy(Record source, Schema sourceSchema, Schema targetSche return result; } + + private void writeRecordsWithoutFieldIds( + FileFormat fileFormat, Schema schema, List records) throws IOException { + switch (fileFormat) { + case PARQUET -> writeParquetWithoutFieldIds(schema, records); + case AVRO -> writeAvroWithoutFieldIds(schema, records); + case ORC -> writeOrcWithoutFieldIds(schema, records); + default -> throw new UnsupportedOperationException("Unsupported file format: " + fileFormat); + } + } + + private void writeAvroWithoutFieldIds(Schema schema, List records) throws IOException { + org.apache.avro.Schema avroSchemaWithoutIds = AvroTestHelpers.removeIds(schema); + + OutputFile outputFile = encryptedFile.encryptingOutputFile(); + DatumWriter datumWriter = new GenericDatumWriter<>(avroSchemaWithoutIds); + try (OutputStream out = outputFile.create(); + DataFileWriter writer = new DataFileWriter<>(datumWriter)) { + writer.create(avroSchemaWithoutIds, out); + for (Record record : records) { + GenericData.Record avroRecord = new GenericData.Record(avroSchemaWithoutIds); + for (Types.NestedField field : schema.columns()) { + avroRecord.put(field.name(), 
record.getField(field.name())); + } + + writer.append(avroRecord); + } + } + + try (DataFileStream reader = + new DataFileStream<>(outputFile.toInputFile().newStream(), new GenericDatumReader<>())) { + assertThat(AvroTestHelpers.hasIds(reader.getSchema())).isFalse(); + } + } + + private void writeParquetWithoutFieldIds(Schema schema, List records) throws IOException { + org.apache.avro.Schema avroSchemaWithoutIds = AvroTestHelpers.removeIds(schema); + + OutputFile outputFile = encryptedFile.encryptingOutputFile(); + + try (ParquetWriter writer = + AvroParquetWriter.builder(ParquetFileTestUtils.file(outputFile)) + .withDataModel(GenericData.get()) + .withSchema(avroSchemaWithoutIds) + .withConf(new Configuration()) + .build()) { + for (Record record : records) { + GenericData.Record avroRecord = new GenericData.Record(avroSchemaWithoutIds); + for (Types.NestedField field : schema.columns()) { + avroRecord.put(field.name(), record.getField(field.name())); + } + + writer.write(avroRecord); + } + } + + try (ParquetFileReader reader = + ParquetFileReader.open(ParquetFileTestUtils.file(outputFile.toInputFile()))) { + assertThat(ParquetSchemaUtil.hasIds(reader.getFooter().getFileMetaData().getSchema())) + .isFalse(); + } + } + + private void writeOrcWithoutFieldIds(Schema schema, List records) throws IOException { + TypeDescription typeWithIds = ORCSchemaUtil.convert(schema); + TypeDescription typeWithoutIds = TestORCSchemaUtil.removeIds(typeWithIds); + + OutputFile outputFile = encryptedFile.encryptingOutputFile(); + Path hadoopPath = new Path(outputFile.location()); + + Configuration conf = new Configuration(); + OrcFile.WriterOptions options = + OrcFile.writerOptions(conf) + .useUTCTimestamp(true) + .setSchema(typeWithoutIds) + .fileSystem(OrcWritingTestUtils.outputFileSystem(outputFile)); + + OrcRowWriter rowWriter = GenericOrcWriter.buildWriter(schema, typeWithIds); + + try (Writer orcWriter = OrcFile.createWriter(hadoopPath, options)) { + VectorizedRowBatch batch = 
typeWithoutIds.createRowBatch(); + for (Record record : records) { + rowWriter.write(record, batch); + if (batch.size == batch.getMaxSize()) { + orcWriter.addRowBatch(batch); + batch.reset(); + } + } + + if (batch.size > 0) { + orcWriter.addRowBatch(batch); + batch.reset(); + } + } + + InputFile inputFile = outputFile.toInputFile(); + OrcFile.ReaderOptions readerOptions = + OrcFile.readerOptions(conf) + .useUTCTimestamp(true) + .filesystem(OrcWritingTestUtils.inputFileSystem(inputFile)) + .maxLength(inputFile.getLength()); + + try (Reader reader = OrcFile.createReader(hadoopPath, readerOptions)) { + assertThat(TestORCSchemaUtil.hasIds(reader.getSchema())).isFalse(); + } + } + + private void runTypePromotionCheck( + FileFormat fileFormat, Type fromType, Type toType, Function promoteValue) + throws IOException { + String columnName = "col"; + Schema writeSchema = new Schema(Types.NestedField.required(1, columnName, fromType)); + Schema readSchema = new Schema(Types.NestedField.required(1, columnName, toType)); + + List genericRecords = RandomGenericData.generate(writeSchema, 10, 1L); + writeGenericRecords(fileFormat, writeSchema, genericRecords); + + readAndAssertEngineRecords( + fileFormat, + readSchema, + genericRecords, + record -> { + Record expected = GenericRecord.create(readSchema); + expected.setField(columnName, promoteValue.apply(record.getField(columnName))); + return expected; + }); + } + + private void readAndAssertEngineRecords( + FileFormat fileFormat, + Schema readSchema, + List sourceRecords, + Function converter) + throws IOException { + List expectedGenericRecords = sourceRecords.stream().map(converter).toList(); + InputFile inputFile = encryptedFile.encryptingOutputFile().toInputFile(); + List readRecords; + try (CloseableIterable reader = + FormatModelRegistry.readBuilder(fileFormat, engineType(), inputFile) + .project(readSchema) + .build()) { + readRecords = ImmutableList.copyOf(reader); + } + + 
assertThat(readRecords).hasSize(expectedGenericRecords.size()); + assertEquals( + readSchema, convertToEngineRecords(expectedGenericRecords, readSchema), readRecords); + } } diff --git a/flink/v1.20/build.gradle b/flink/v1.20/build.gradle index 467b0fa8c9be..41f2489c8038 100644 --- a/flink/v1.20/build.gradle +++ b/flink/v1.20/build.gradle @@ -84,6 +84,8 @@ project(":iceberg-flink:iceberg-flink-${flinkMajorVersion}") { testImplementation project(path: ':iceberg-api', configuration: 'testArtifacts') testImplementation project(path: ':iceberg-core', configuration: 'testArtifacts') testImplementation project(path: ':iceberg-data', configuration: 'testArtifacts') + testImplementation project(path: ':iceberg-orc', configuration: 'testArtifacts') + testImplementation(testFixtures(project(':iceberg-parquet'))) // By default, hive-exec is a fat/uber jar and it exports a guava library // that's really old. We use the core classifier to be able to override our guava diff --git a/flink/v2.0/build.gradle b/flink/v2.0/build.gradle index f80a31242112..7bc37b30e5a1 100644 --- a/flink/v2.0/build.gradle +++ b/flink/v2.0/build.gradle @@ -84,6 +84,8 @@ project(":iceberg-flink:iceberg-flink-${flinkMajorVersion}") { testImplementation project(path: ':iceberg-api', configuration: 'testArtifacts') testImplementation project(path: ':iceberg-core', configuration: 'testArtifacts') testImplementation project(path: ':iceberg-data', configuration: 'testArtifacts') + testImplementation project(path: ':iceberg-orc', configuration: 'testArtifacts') + testImplementation(testFixtures(project(':iceberg-parquet'))) // By default, hive-exec is a fat/uber jar and it exports a guava library // that's really old. 
We use the core classifier to be able to override our guava diff --git a/flink/v2.1/build.gradle b/flink/v2.1/build.gradle index 451f14414772..f93b61646e7c 100644 --- a/flink/v2.1/build.gradle +++ b/flink/v2.1/build.gradle @@ -84,6 +84,8 @@ project(":iceberg-flink:iceberg-flink-${flinkMajorVersion}") { testImplementation project(path: ':iceberg-api', configuration: 'testArtifacts') testImplementation project(path: ':iceberg-core', configuration: 'testArtifacts') testImplementation project(path: ':iceberg-data', configuration: 'testArtifacts') + testImplementation project(path: ':iceberg-orc', configuration: 'testArtifacts') + testImplementation(testFixtures(project(':iceberg-parquet'))) // By default, hive-exec is a fat/uber jar and it exports a guava library // that's really old. We use the core classifier to be able to override our guava diff --git a/orc/src/test/java/org/apache/iceberg/orc/OrcWritingTestUtils.java b/orc/src/test/java/org/apache/iceberg/orc/OrcWritingTestUtils.java new file mode 100644 index 000000000000..72ed03ce2c80 --- /dev/null +++ b/orc/src/test/java/org/apache/iceberg/orc/OrcWritingTestUtils.java @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.orc; + +import org.apache.hadoop.fs.FileSystem; +import org.apache.iceberg.io.InputFile; +import org.apache.iceberg.io.OutputFile; + +public class OrcWritingTestUtils { + private OrcWritingTestUtils() {} + + public static FileSystem outputFileSystem(OutputFile file) { + return new FileIOFSUtil.OutputFileSystem(file); + } + + public static FileSystem inputFileSystem(InputFile file) { + return new FileIOFSUtil.InputFileSystem(file); + } +} diff --git a/orc/src/test/java/org/apache/iceberg/orc/TestORCSchemaUtil.java b/orc/src/test/java/org/apache/iceberg/orc/TestORCSchemaUtil.java index c19e36be3ac1..e331ca94a211 100644 --- a/orc/src/test/java/org/apache/iceberg/orc/TestORCSchemaUtil.java +++ b/orc/src/test/java/org/apache/iceberg/orc/TestORCSchemaUtil.java @@ -560,4 +560,12 @@ private static boolean equalsWithIds(TypeDescription first, TypeDescription seco return true; } + + public static TypeDescription removeIds(TypeDescription type) { + return ORCSchemaUtil.removeIds(type); + } + + public static boolean hasIds(TypeDescription orcSchema) { + return ORCSchemaUtil.hasIds(orcSchema); + } } diff --git a/parquet/src/test/java/org/apache/iceberg/parquet/ParquetWritingTestUtils.java b/parquet/src/test/java/org/apache/iceberg/parquet/ParquetWritingTestUtils.java index b8cd38f56dfe..441073d34a4e 100644 --- a/parquet/src/test/java/org/apache/iceberg/parquet/ParquetWritingTestUtils.java +++ b/parquet/src/test/java/org/apache/iceberg/parquet/ParquetWritingTestUtils.java @@ -35,7 +35,7 @@ import org.apache.parquet.schema.MessageType; /** Utilities for tests that need to write Parquet files. 
*/ -class ParquetWritingTestUtils { +public class ParquetWritingTestUtils { private ParquetWritingTestUtils() {} diff --git a/parquet/src/testFixtures/java/org/apache/iceberg/parquet/ParquetFileTestUtils.java b/parquet/src/testFixtures/java/org/apache/iceberg/parquet/ParquetFileTestUtils.java new file mode 100644 index 000000000000..a6055424c0a6 --- /dev/null +++ b/parquet/src/testFixtures/java/org/apache/iceberg/parquet/ParquetFileTestUtils.java @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.parquet; + +import org.apache.parquet.io.InputFile; +import org.apache.parquet.io.OutputFile; + +/** Utilities for tests that need to write Parquet files. 
*/ +public class ParquetFileTestUtils { + + private ParquetFileTestUtils() {} + + public static OutputFile file(org.apache.iceberg.io.OutputFile file) { + return ParquetIO.file(file); + } + + public static InputFile file(org.apache.iceberg.io.InputFile file) { + return ParquetIO.file(file); + } +} diff --git a/spark/v3.4/build.gradle b/spark/v3.4/build.gradle index ead4a32f49b0..57e485317762 100644 --- a/spark/v3.4/build.gradle +++ b/spark/v3.4/build.gradle @@ -105,8 +105,10 @@ project(":iceberg-spark:iceberg-spark-${sparkMajorVersion}_${scalaVersion}") { testImplementation project(path: ':iceberg-api', configuration: 'testArtifacts') testImplementation project(path: ':iceberg-core', configuration: 'testArtifacts') testImplementation project(path: ':iceberg-data', configuration: 'testArtifacts') + testImplementation project(path: ':iceberg-orc', configuration: 'testArtifacts') testImplementation (project(path: ':iceberg-open-api', configuration: 'testFixturesRuntimeElements')) testImplementation libs.awaitility + testImplementation(testFixtures(project(':iceberg-parquet'))) } test { diff --git a/spark/v3.5/build.gradle b/spark/v3.5/build.gradle index a69b78e5ad8e..68bdb1c21a98 100644 --- a/spark/v3.5/build.gradle +++ b/spark/v3.5/build.gradle @@ -105,6 +105,7 @@ project(":iceberg-spark:iceberg-spark-${sparkMajorVersion}_${scalaVersion}") { testImplementation project(path: ':iceberg-api', configuration: 'testArtifacts') testImplementation project(path: ':iceberg-core', configuration: 'testArtifacts') testImplementation project(path: ':iceberg-data', configuration: 'testArtifacts') + testImplementation project(path: ':iceberg-orc', configuration: 'testArtifacts') testImplementation (project(path: ':iceberg-open-api', configuration: 'testFixturesRuntimeElements')) testImplementation libs.awaitility testImplementation(testFixtures(project(':iceberg-parquet'))) diff --git a/spark/v4.0/build.gradle b/spark/v4.0/build.gradle index ba2e0fd4bae2..3707e01e4865 100644 --- 
a/spark/v4.0/build.gradle +++ b/spark/v4.0/build.gradle @@ -105,6 +105,7 @@ project(":iceberg-spark:iceberg-spark-${sparkMajorVersion}_${scalaVersion}") { testImplementation project(path: ':iceberg-api', configuration: 'testArtifacts') testImplementation project(path: ':iceberg-core', configuration: 'testArtifacts') testImplementation project(path: ':iceberg-data', configuration: 'testArtifacts') + testImplementation project(path: ':iceberg-orc', configuration: 'testArtifacts') testImplementation (project(path: ':iceberg-open-api', configuration: 'testFixturesRuntimeElements')) testImplementation libs.awaitility testImplementation(testFixtures(project(':iceberg-parquet'))) diff --git a/spark/v4.1/build.gradle b/spark/v4.1/build.gradle index 02e4323e709e..e6455fa34f88 100644 --- a/spark/v4.1/build.gradle +++ b/spark/v4.1/build.gradle @@ -105,6 +105,7 @@ project(":iceberg-spark:iceberg-spark-${sparkMajorVersion}_${scalaVersion}") { testImplementation project(path: ':iceberg-api', configuration: 'testArtifacts') testImplementation project(path: ':iceberg-core', configuration: 'testArtifacts') testImplementation project(path: ':iceberg-data', configuration: 'testArtifacts') + testImplementation project(path: ':iceberg-orc', configuration: 'testArtifacts') testImplementation (project(path: ':iceberg-open-api', configuration: 'testFixturesRuntimeElements')) testImplementation libs.awaitility testImplementation(testFixtures(project(':iceberg-parquet'))) From 0e0e795197a268ce651f86d2573b5d70d5facd1b Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Mon, 11 May 2026 04:57:38 -0700 Subject: [PATCH 197/197] Build: Bump org.openapitools:openapi-generator-gradle-plugin from 7.21.0 to 7.22.0 (#16278) Co-authored-by: Cursor --- build.gradle | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/build.gradle b/build.gradle index fca32be9dc66..048de63ae76c 100644 --- a/build.gradle +++ b/build.gradle @@ -36,7 +36,7 @@ buildscript { classpath 
'org.revapi:gradle-revapi:1.8.0' classpath 'com.gorylenko.gradle-git-properties:gradle-git-properties:2.5.7' classpath 'com.palantir.gradle.gitversion:gradle-git-version:4.3.0' - classpath 'org.openapitools:openapi-generator-gradle-plugin:7.21.0' + classpath 'org.openapitools:openapi-generator-gradle-plugin:7.22.0' } } @@ -583,7 +583,7 @@ project(':iceberg-aws') { } // TODO delete once s3-signer-open-api.yaml is removed - def s3SignerSpec = "$projectDir/src/main/resources/s3-signer-open-api.yaml" + def s3SignerSpec = layout.projectDirectory.file("src/main/resources/s3-signer-open-api.yaml") tasks.register('validateS3SignerSpec', org.openapitools.generator.gradle.plugin.tasks.ValidateTask) { inputSpec.set(s3SignerSpec) recommend.set(true) @@ -1149,7 +1149,7 @@ project(':iceberg-open-api') { .collectEntries { k, v -> { [(k):v, (k.replaceFirst("rck.", "")):v] }} // strip prefix } - def restCatalogSpec = "$projectDir/rest-catalog-open-api.yaml" + def restCatalogSpec = layout.projectDirectory.file("rest-catalog-open-api.yaml") tasks.register('validateRESTCatalogSpec', org.openapitools.generator.gradle.plugin.tasks.ValidateTask) { inputSpec.set(restCatalogSpec) recommend.set(true)