From 48666375ebfa00cfa12d129af5d1c95eb40b4ff3 Mon Sep 17 00:00:00 2001 From: Robert Kruszewski Date: Mon, 8 Jun 2026 14:50:57 +0100 Subject: [PATCH 1/9] Add support for position deletes in Vortex Pushes position deletes into the Vortex scan so deleted rows are excluded natively instead of being read and filtered out afterwards. DeleteFilter exposes the deleted positions for pushdown (skipped when the _is_deleted column is projected, since those rows must be marked rather than removed). GenericReader forwards them only when the reader advertises support via the new ReadBuilder.supportsPositionDeletes(), so Parquet/ORC/Avro keep applying deletes post-scan. VortexIterable serializes the positions as a portable 64-bit Roaring bitmap and applies EXCLUDE_ROARING row selection. Also adds a Vortex position-delete writer (PositionDeleteVortexWriter, VortexFormatModel.forPositionDeletes) for writing path/pos delete files, plus TestVortexPositionDeletes. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../apache/iceberg/formats/ReadBuilder.java | 26 +++ .../org/apache/iceberg/data/DeleteFilter.java | 28 +++- .../apache/iceberg/data/GenericReader.java | 17 +- .../vortex/PositionDeleteVortexWriter.java | 50 ++++++ .../iceberg/vortex/VortexFormatModel.java | 94 ++++++++++- .../iceberg/vortex/VortexFormatModels.java | 2 + .../apache/iceberg/vortex/VortexIterable.java | 12 ++ .../vortex/TestVortexPositionDeletes.java | 155 ++++++++++++++++++ 8 files changed, 377 insertions(+), 7 deletions(-) create mode 100644 vortex/src/main/java/org/apache/iceberg/data/vortex/PositionDeleteVortexWriter.java create mode 100644 vortex/src/test/java/org/apache/iceberg/vortex/TestVortexPositionDeletes.java diff --git a/core/src/main/java/org/apache/iceberg/formats/ReadBuilder.java b/core/src/main/java/org/apache/iceberg/formats/ReadBuilder.java index 2809750970a7..f506ed433e39 100644 --- a/core/src/main/java/org/apache/iceberg/formats/ReadBuilder.java +++ 
b/core/src/main/java/org/apache/iceberg/formats/ReadBuilder.java @@ -20,6 +20,7 @@ import java.util.Map; import org.apache.iceberg.Schema; +import org.apache.iceberg.deletes.PositionDeleteIndex; import org.apache.iceberg.expressions.Expression; import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.mapping.NameMapping; @@ -119,6 +120,31 @@ default ReadBuilder setAll(Map properties) { /** Sets a mapping from external schema names to Iceberg type IDs. */ ReadBuilder withNameMapping(NameMapping nameMapping); + /** + * Whether this reader applies position deletes supplied through {@link + * #positionDeletes(PositionDeleteIndex)} during the scan. Callers must check this before relying + * on pushdown: a reader that returns {@code false} ignores pushed deletes, so the deletes still + * have to be applied after reading. + * + * @return true if pushed position deletes are honored by the scan + */ + default boolean supportsPositionDeletes() { + return false; + } + + /** + * Pushes position deletes into the reader so that deleted rows are excluded during scanning, + * rather than being read and filtered out afterwards. Positions in the index are relative to the + * start of the file. Only meaningful when {@link #supportsPositionDeletes()} returns true; other + * readers ignore the index and rely on post-scan filtering. + * + * @param deletes the deleted row positions for the file being read + * @return this for method chaining + */ + default ReadBuilder positionDeletes(PositionDeleteIndex deletes) { + return this; + } + /** Builds the reader. 
*/ CloseableIterable build(); } diff --git a/data/src/main/java/org/apache/iceberg/data/DeleteFilter.java b/data/src/main/java/org/apache/iceberg/data/DeleteFilter.java index 69236cee8f9b..96911eb920b2 100644 --- a/data/src/main/java/org/apache/iceberg/data/DeleteFilter.java +++ b/data/src/main/java/org/apache/iceberg/data/DeleteFilter.java @@ -21,6 +21,7 @@ import java.util.Collection; import java.util.List; import java.util.Map; +import java.util.Optional; import java.util.Set; import java.util.function.Function; import java.util.function.Predicate; @@ -65,6 +66,7 @@ public abstract class DeleteFilter { private PositionDeleteIndex deleteRowPositions = null; private List> isInDeleteSets = null; private Predicate eqDeleteRows = null; + private boolean posDeletesPushedDown = false; protected DeleteFilter( String filePath, @@ -258,8 +260,32 @@ public PositionDeleteIndex deletedRowPositions() { return deleteRowPositions; } + /** + * Returns the deleted row positions for native pushdown into a format scanner so that deleted + * rows are excluded during the scan instead of being read and filtered out afterwards. When a + * non-empty index is returned it is marked as handled, and {@link #filter(CloseableIterable)} + * will not re-apply position deletes. + * + *

Returns empty when there are no position deletes, or when the {@code _is_deleted} metadata + * column is projected: in that case deleted rows must be retained and marked rather than removed, + * so they cannot be dropped at the scan level. + */ + public Optional pushablePosDeletes() { + if (posDeletes.isEmpty() || hasIsDeletedColumn) { + return Optional.empty(); + } + + PositionDeleteIndex positions = deletedRowPositions(); + if (positions == null || positions.isEmpty()) { + return Optional.empty(); + } + + this.posDeletesPushedDown = true; + return Optional.of(positions); + } + private CloseableIterable applyPosDeletes(CloseableIterable records) { - if (posDeletes.isEmpty()) { + if (posDeletes.isEmpty() || posDeletesPushedDown) { return records; } diff --git a/data/src/main/java/org/apache/iceberg/data/GenericReader.java b/data/src/main/java/org/apache/iceberg/data/GenericReader.java index f18f5785105f..778861487473 100644 --- a/data/src/main/java/org/apache/iceberg/data/GenericReader.java +++ b/data/src/main/java/org/apache/iceberg/data/GenericReader.java @@ -20,10 +20,12 @@ import java.io.Serializable; import java.util.Map; +import java.util.Optional; import org.apache.iceberg.CombinedScanTask; import org.apache.iceberg.FileScanTask; import org.apache.iceberg.Schema; import org.apache.iceberg.TableScan; +import org.apache.iceberg.deletes.PositionDeleteIndex; import org.apache.iceberg.expressions.Evaluator; import org.apache.iceberg.expressions.Expression; import org.apache.iceberg.expressions.Expressions; @@ -66,7 +68,7 @@ public CloseableIterable open(FileScanTask task) { DeleteFilter deletes = new GenericDeleteFilter(io, task, tableSchema, projection); Schema readSchema = deletes.requiredSchema(); - CloseableIterable records = openFile(task, readSchema); + CloseableIterable records = openFile(task, readSchema, deletes); records = deletes.filter(records); records = applyResidual(records, readSchema, task.residual()); @@ -84,7 +86,8 @@ private CloseableIterable 
applyResidual( return records; } - private CloseableIterable openFile(FileScanTask task, Schema fileProjection) { + private CloseableIterable openFile( + FileScanTask task, Schema fileProjection, DeleteFilter deletes) { InputFile input = io.newInputFile(task.file()); Map partition = PartitionUtil.constantsMap(task, IdentityPartitionConverters::convertConstant); @@ -95,6 +98,16 @@ private CloseableIterable openFile(FileScanTask task, Schema fileProject builder = builder.reuseContainers(); } + // Push position deletes into the scan when the reader applies them natively; the delete filter + // then skips re-applying them. Readers without pushdown support fall back to post-scan + // filtering. + if (builder.supportsPositionDeletes()) { + Optional pushable = deletes.pushablePosDeletes(); + if (pushable.isPresent()) { + builder = builder.positionDeletes(pushable.get()); + } + } + return builder .project(fileProjection) .idToConstant(partition) diff --git a/vortex/src/main/java/org/apache/iceberg/data/vortex/PositionDeleteVortexWriter.java b/vortex/src/main/java/org/apache/iceberg/data/vortex/PositionDeleteVortexWriter.java new file mode 100644 index 000000000000..994214d76b3c --- /dev/null +++ b/vortex/src/main/java/org/apache/iceberg/data/vortex/PositionDeleteVortexWriter.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.data.vortex; + +import java.nio.charset.StandardCharsets; +import java.util.stream.Stream; +import org.apache.arrow.vector.BigIntVector; +import org.apache.arrow.vector.VarCharVector; +import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.iceberg.FieldMetrics; +import org.apache.iceberg.deletes.PositionDelete; +import org.apache.iceberg.vortex.VortexValueWriter; + +/** + * Writes {@link PositionDelete} objects to Arrow vectors for Vortex position delete file output. + * + *

The output schema is [file_path: string, pos: long]. + */ +public class PositionDeleteVortexWriter implements VortexValueWriter> { + @Override + public void write(PositionDelete datum, VectorSchemaRoot root, int rowIndex) { + VarCharVector pathVector = (VarCharVector) root.getVector(0); + byte[] pathBytes = datum.path().toString().getBytes(StandardCharsets.UTF_8); + pathVector.setSafe(rowIndex, pathBytes); + + BigIntVector posVector = (BigIntVector) root.getVector(1); + posVector.setSafe(rowIndex, datum.pos()); + } + + @Override + public Stream> metrics() { + return Stream.empty(); + } +} diff --git a/vortex/src/main/java/org/apache/iceberg/vortex/VortexFormatModel.java b/vortex/src/main/java/org/apache/iceberg/vortex/VortexFormatModel.java index 4379d069b06f..f4f18af6ff31 100644 --- a/vortex/src/main/java/org/apache/iceberg/vortex/VortexFormatModel.java +++ b/vortex/src/main/java/org/apache/iceberg/vortex/VortexFormatModel.java @@ -21,7 +21,10 @@ import dev.vortex.api.Session; import dev.vortex.api.VortexWriter; import dev.vortex.jni.NativeRuntime; +import java.io.ByteArrayOutputStream; +import java.io.DataOutputStream; import java.io.IOException; +import java.io.UncheckedIOException; import java.nio.ByteBuffer; import java.util.Collections; import java.util.List; @@ -35,12 +38,16 @@ import org.apache.iceberg.MetadataColumns; import org.apache.iceberg.MetricsConfig; import org.apache.iceberg.TableProperties; +import org.apache.iceberg.data.vortex.PositionDeleteVortexWriter; +import org.apache.iceberg.deletes.PositionDelete; +import org.apache.iceberg.deletes.PositionDeleteIndex; import org.apache.iceberg.encryption.EncryptedOutputFile; import org.apache.iceberg.expressions.Expression; import org.apache.iceberg.formats.BaseFormatModel; import org.apache.iceberg.formats.ModelWriteBuilder; import org.apache.iceberg.formats.ReadBuilder; import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.io.DeleteSchemaUtil; import 
org.apache.iceberg.io.FileAppender; import org.apache.iceberg.io.InputFile; import org.apache.iceberg.io.OutputFile; @@ -48,6 +55,7 @@ import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.types.Types; +import org.roaringbitmap.longlong.Roaring64NavigableMap; public class VortexFormatModel extends BaseFormatModel, R, Schema> { @@ -93,6 +101,11 @@ public static VortexFormatModel> create( true); } + public static + VortexFormatModel, Void, VortexRowReader> forPositionDeletes() { + return new VortexFormatModel<>(PositionDelete.deleteClass(), Void.class, null, null, false); + } + private VortexFormatModel( Class type, Class schemaType, @@ -206,14 +219,17 @@ public ModelWriteBuilder withAADPrefix(ByteBuffer aadPrefix) { @Override public FileAppender build() throws IOException { - Preconditions.checkNotNull(schema, "Schema is required"); Preconditions.checkNotNull(content, "Content type is required"); return switch (content) { - case DATA, EQUALITY_DELETES -> buildAppender(schema); - case POSITION_DELETES, DATA_MANIFEST, DELETE_MANIFEST -> + case DATA, EQUALITY_DELETES -> { + Preconditions.checkNotNull(schema, "Schema is required"); + yield buildAppender(schema); + } + case POSITION_DELETES -> buildPosDeleteAppender(); + case DATA_MANIFEST, DELETE_MANIFEST -> throw new UnsupportedOperationException( - "Position deletes are not yet supported for Vortex format"); + "Manifest files are not supported for Vortex format"); }; } @@ -253,6 +269,42 @@ private FileAppender buildAppender(org.apache.iceberg.Schema writeSchema) writeSchema, metricsConfig); } + + @SuppressWarnings("unchecked") + private FileAppender buildPosDeleteAppender() throws IOException { + org.apache.iceberg.Schema posDeleteSchema = DeleteSchemaUtil.pathPosSchema(); + Schema arrowSchema = VortexSchemas.toArrowSchema(posDeleteSchema); + dev.vortex.relocated.org.apache.arrow.vector.types.pojo.Schema 
vortexSchema = + VortexSchemas.toVortexArrowSchema(posDeleteSchema); + + VortexValueWriter valueWriter = (VortexValueWriter) new PositionDeleteVortexWriter<>(); + + OutputFile rawOutputFile = outputFile.encryptingOutputFile(); + String uri = VortexFileUtil.resolveUri(rawOutputFile.location()); + Map properties = + Maps.newHashMap(VortexFileUtil.resolveOutputProperties(rawOutputFile)); + properties.putAll(writerProperties); + properties.putAll(metadata); + + // Apply worker-thread setting on this executor JVM before any Vortex native work begins. + NativeRuntime.setWorkerThreads(workerThreads); + BufferAllocator allocator = VortexArrowBridge.arrowAllocator(); + dev.vortex.relocated.org.apache.arrow.memory.BufferAllocator vortexAllocator = + VortexArrowBridge.vortexAllocator(); + Session session = Session.create(); + VortexWriter vortexWriter = + VortexWriter.create(session, uri, vortexSchema, properties, vortexAllocator); + + return new VortexFileAppender<>( + vortexWriter, + valueWriter, + arrowSchema, + allocator, + VortexFileAppender.DEFAULT_BATCH_SIZE, + rawOutputFile, + posDeleteSchema, + metricsConfig); + } } private static class ReadBuilderWrapper implements ReadBuilder { @@ -265,6 +317,7 @@ private static class ReadBuilderWrapper implements ReadBuilder { private Optional filterPredicate = Optional.empty(); private boolean caseSensitive = true; private long[] rowRange; + private PositionDeleteIndex posDeletes; private int workerThreads = TableProperties.VORTEX_WORKER_THREADS_DEFAULT; private ReadBuilderWrapper( @@ -306,6 +359,17 @@ public ReadBuilder filter(Expression filter) { return this; } + @Override + public boolean supportsPositionDeletes() { + return true; + } + + @Override + public ReadBuilder positionDeletes(PositionDeleteIndex deletes) { + this.posDeletes = deletes; + return this; + } + @Override public ReadBuilder set(String key, String value) { if (TableProperties.READ_VORTEX_WORKER_THREADS.equals(key)) { @@ -374,15 +438,37 @@ public 
CloseableIterable build() { .map(Types.NestedField::name) .toList(); + byte[] posDeleteBitmap = posDeletes == null ? null : toRoaringBitmap(posDeletes); + return new VortexIterable<>( inputFile, projection, filterPredicate, rowRange, + posDeleteBitmap, readerFunc, batchReaderFunc, caseSensitive, workerThreads); } + + /** + * Serializes the deleted row positions as a portable 64-bit Roaring bitmap, the form Vortex + * expects for {@code EXCLUDE_ROARING} row selection (matching {@code + * Roaring64NavigableMap.serializePortable}). + */ + private static byte[] toRoaringBitmap(PositionDeleteIndex deletes) { + Roaring64NavigableMap bitmap = new Roaring64NavigableMap(); + deletes.forEach(bitmap::addLong); + bitmap.runOptimize(); + try (ByteArrayOutputStream out = new ByteArrayOutputStream(); + DataOutputStream dataOut = new DataOutputStream(out)) { + bitmap.serializePortable(dataOut); + dataOut.flush(); + return out.toByteArray(); + } catch (IOException e) { + throw new UncheckedIOException(e); + } + } } } diff --git a/vortex/src/main/java/org/apache/iceberg/vortex/VortexFormatModels.java b/vortex/src/main/java/org/apache/iceberg/vortex/VortexFormatModels.java index 8d7f21309ba2..ec9b630420b6 100644 --- a/vortex/src/main/java/org/apache/iceberg/vortex/VortexFormatModels.java +++ b/vortex/src/main/java/org/apache/iceberg/vortex/VortexFormatModels.java @@ -32,6 +32,8 @@ public static void register() { (icebergSchema, fileSchema, engineSchema) -> GenericVortexWriter.buildWriter(icebergSchema), (VortexFormatModel.ReaderFunction) GenericVortexReader::buildReader)); + + FormatModelRegistry.register(VortexFormatModel.forPositionDeletes()); } private VortexFormatModels() {} diff --git a/vortex/src/main/java/org/apache/iceberg/vortex/VortexIterable.java b/vortex/src/main/java/org/apache/iceberg/vortex/VortexIterable.java index 4e4453a4bf24..a33de2f77fab 100644 --- a/vortex/src/main/java/org/apache/iceberg/vortex/VortexIterable.java +++ 
b/vortex/src/main/java/org/apache/iceberg/vortex/VortexIterable.java @@ -55,6 +55,7 @@ public class VortexIterable extends CloseableGroup implements CloseableIterab private final InputFile inputFile; private final Optional filterPredicate; private final long[] rowRange; + private final byte[] posDeleteBitmap; private final Function> rowReaderFunc; private final Function> @@ -68,6 +69,7 @@ public class VortexIterable extends CloseableGroup implements CloseableIterab List projection, Optional filterPredicate, long[] rowRange, + byte[] posDeleteBitmap, Function> readerFunction, Function> batchReaderFunction, boolean caseSensitive, @@ -76,6 +78,7 @@ public class VortexIterable extends CloseableGroup implements CloseableIterab this.projection = projection; this.filterPredicate = filterPredicate; this.rowRange = rowRange; + this.posDeleteBitmap = posDeleteBitmap; this.rowReaderFunc = readerFunction; this.batchReaderFunction = batchReaderFunction; this.caseSensitive = caseSensitive; @@ -146,6 +149,15 @@ public CloseableIterator iterator() { optionsBuilder.rowRangeBegin(rowRange[0]).rowRangeEnd(rowRange[1]); } + // Apply position deletes natively: the bitmap holds file-relative row positions of deleted rows + // (portable 64-bit Roaring), and Vortex drops them from the scan so they are never + // materialized. 
+ if (posDeleteBitmap != null) { + optionsBuilder + .selectionRoaringBitmap(posDeleteBitmap) + .selectionMode(ScanOptions.SelectionMode.EXCLUDE_ROARING); + } + Scan scan = dataSource.scan(optionsBuilder.build()); Preconditions.checkNotNull(scan, "scan"); diff --git a/vortex/src/test/java/org/apache/iceberg/vortex/TestVortexPositionDeletes.java b/vortex/src/test/java/org/apache/iceberg/vortex/TestVortexPositionDeletes.java new file mode 100644 index 000000000000..5000eb62a0ae --- /dev/null +++ b/vortex/src/test/java/org/apache/iceberg/vortex/TestVortexPositionDeletes.java @@ -0,0 +1,155 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.vortex; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.IOException; +import java.nio.file.Path; +import java.util.List; +import org.apache.iceberg.FileContent; +import org.apache.iceberg.Files; +import org.apache.iceberg.Schema; +import org.apache.iceberg.data.GenericRecord; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.data.vortex.GenericVortexReader; +import org.apache.iceberg.data.vortex.GenericVortexWriter; +import org.apache.iceberg.deletes.Deletes; +import org.apache.iceberg.deletes.PositionDeleteIndex; +import org.apache.iceberg.encryption.EncryptedFiles; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.io.FileAppender; +import org.apache.iceberg.io.InputFile; +import org.apache.iceberg.io.OutputFile; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.types.Types.StructType; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +/** Verifies that position deletes are pushed into the Vortex scan and exclude deleted rows. */ +public class TestVortexPositionDeletes { + private static final Schema SCHEMA = + new Schema( + Types.NestedField.required(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get())); + + @TempDir private Path temp; + + @Test + public void testPositionDeletesExcludeRows() throws IOException { + InputFile file = writeRows(10); + + // Delete row positions 2, 5 and 7 (ids 2, 5, 7). 
+ PositionDeleteIndex deletes = + Deletes.toPositionIndex(CloseableIterable.withNoopClose(Lists.newArrayList(2L, 5L, 7L))); + + assertThat(readIds(file, deletes)) + .as("Pushed-down position deletes should remove exactly the deleted rows") + .containsExactlyInAnyOrder(0, 1, 3, 4, 6, 8, 9); + } + + @Test + public void testNoPositionDeletesReadsAllRows() throws IOException { + InputFile file = writeRows(10); + + assertThat(readIds(file, null)) + .as("Without position deletes every row should be returned") + .containsExactlyInAnyOrder(0, 1, 2, 3, 4, 5, 6, 7, 8, 9); + } + + @Test + public void testFirstAndLastPositionsDeleted() throws IOException { + InputFile file = writeRows(5); + + PositionDeleteIndex deletes = + Deletes.toPositionIndex(CloseableIterable.withNoopClose(Lists.newArrayList(0L, 4L))); + + assertThat(readIds(file, deletes)).containsExactlyInAnyOrder(1, 2, 3); + } + + @Test + public void testPositionDeletesAppliedAlongsideRowRange() throws IOException { + // GenericReader sets a row range via split(); for the row-splittable Vortex format the planner + // reports these in rows (file.recordCount()), so this mirrors a whole-file task [0, rowCount). + // Positions in the bitmap are file-relative and must still hit the right rows. 
+ int rows = 10; + InputFile file = writeRows(rows); + + PositionDeleteIndex deletes = + Deletes.toPositionIndex(CloseableIterable.withNoopClose(Lists.newArrayList(2L, 5L, 7L))); + + List ids = Lists.newArrayList(); + try (CloseableIterable reader = + formatModel() + .readBuilder(file) + .project(SCHEMA) + .split(0, rows) + .positionDeletes(deletes) + .build()) { + for (Record record : reader) { + ids.add((Integer) record.getField("id")); + } + } + + assertThat(ids).containsExactlyInAnyOrder(0, 1, 3, 4, 6, 8, 9); + } + + private InputFile writeRows(int count) throws IOException { + OutputFile outputFile = + Files.localOutput(temp.resolve("data-" + System.nanoTime() + ".vortex").toFile()); + List records = Lists.newArrayListWithCapacity(count); + for (int i = 0; i < count; i++) { + GenericRecord record = GenericRecord.create(SCHEMA); + record.setField("id", i); + record.setField("data", "val-" + i); + records.add(record); + } + + try (FileAppender appender = + formatModel() + .writeBuilder(EncryptedFiles.plainAsEncryptedOutput(outputFile)) + .schema(SCHEMA) + .content(FileContent.DATA) + .build()) { + appender.addAll(records); + } + + return outputFile.toInputFile(); + } + + private List readIds(InputFile file, PositionDeleteIndex deletes) throws IOException { + List ids = Lists.newArrayList(); + try (CloseableIterable reader = + formatModel().readBuilder(file).project(SCHEMA).positionDeletes(deletes).build()) { + for (Record record : reader) { + ids.add((Integer) record.getField("id")); + } + } + return ids; + } + + private static VortexFormatModel> formatModel() { + return VortexFormatModel.create( + Record.class, + StructType.class, + (icebergSchema, fileSchema, engineSchema) -> GenericVortexWriter.buildWriter(icebergSchema), + (VortexFormatModel.ReaderFunction) GenericVortexReader::buildReader); + } +} From cacb29acf5e41299ad9287da6d813480f24f1224 Mon Sep 17 00:00:00 2001 From: Robert Kruszewski Date: Mon, 8 Jun 2026 17:46:52 +0100 Subject: [PATCH 2/9] Vortex: 
convert struct columns in Arrow-to-Iceberg schema conversion VortexSchemas.convert (used to bind scan filters in VortexIterable) threw on struct columns because toIcebergType had no Struct branch. Recurse into struct children, assigning unique field ids via a shared counter (also fixes the latent duplicate-id bug where list elements were hardcoded to id 0). Map stays unsupported. This unblocks reading Vortex tables whose schema contains structs through the generic reader (which always binds the residual filter), so the full DeleteReadTests suite now runs for Vortex via TestVortexReaderDeletes (data + v2 position-delete files + v3 DVs, written through the standard GenericFileWriterFactory/registry path). Co-Authored-By: Claude Opus 4.8 (1M context) --- .../iceberg/data/TestVortexReaderDeletes.java | 90 +++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 100644 data/src/test/java/org/apache/iceberg/data/TestVortexReaderDeletes.java diff --git a/data/src/test/java/org/apache/iceberg/data/TestVortexReaderDeletes.java b/data/src/test/java/org/apache/iceberg/data/TestVortexReaderDeletes.java new file mode 100644 index 000000000000..48a7dc416d2e --- /dev/null +++ b/data/src/test/java/org/apache/iceberg/data/TestVortexReaderDeletes.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.data; + +import java.io.File; +import java.io.IOException; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.ParameterizedTestExtension; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.TableProperties; +import org.apache.iceberg.TestTables; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Iterables; +import org.apache.iceberg.types.Types; +import org.apache.iceberg.util.StructLikeSet; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.jupiter.api.io.TempDir; + +/** + * Exercises the full delete-read path for Vortex tables: data and (v2) position-delete files are + * written through the standard {@link GenericFileWriterFactory} (which routes Vortex through the + * {@link org.apache.iceberg.formats.FormatModelRegistry}), and reads apply position deletes via + * native scan pushdown. 
+ */ +@ExtendWith(ParameterizedTestExtension.class) +public class TestVortexReaderDeletes extends DeleteReadTests { + @TempDir private File tableDir; + + @Parameters(name = "fileFormat = {0}, formatVersion = {1}") + public static Object[][] parameters() { + return new Object[][] { + new Object[] {FileFormat.VORTEX, 2}, + new Object[] {FileFormat.VORTEX, 3}, + }; + } + + @Override + protected Table createTable(String name, Schema schema, PartitionSpec spec) throws IOException { + return TestTables.create( + tableDir, + name, + schema, + spec, + formatVersion, + ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, FileFormat.VORTEX.name())); + } + + @Override + protected void dropTable(String name) { + TestTables.clearTables(); + } + + @Override + public StructLikeSet rowSet(String name, Table table, String... columns) throws IOException { + Types.StructType schema = table.schema().select(columns).asStruct(); + StructLikeSet set = StructLikeSet.create(schema); + try (CloseableIterable reader = IcebergGenerics.read(table).select(columns).build()) { + Iterables.addAll( + set, + CloseableIterable.transform( + reader, record -> new InternalRecordWrapper(schema).wrap(record))); + } + return set; + } + + @Override + protected boolean expectPruned() { + return false; + } +} From b9be505536900c30449773231835d336f4dd7e68 Mon Sep 17 00:00:00 2001 From: Robert Kruszewski Date: Mon, 15 Jun 2026 14:57:47 +0100 Subject: [PATCH 3/9] positional delete support in spark --- .../TestVortexMergeOnReadDelete.java | 91 ++++++++++ .../iceberg/spark/source/BaseBatchReader.java | 42 +++++ .../source/TestSparkVortexReaderDeletes.java | 153 ++++++++++++++++ .../TestVortexMergeOnReadDelete.java | 91 ++++++++++ .../iceberg/spark/source/BaseBatchReader.java | 42 +++++ .../source/TestSparkVortexReaderDeletes.java | 153 ++++++++++++++++ .../TestVortexMergeOnReadDelete.java | 91 ++++++++++ .../iceberg/spark/source/BaseBatchReader.java | 42 +++++ .../source/TestSparkVortexReaderDeletes.java | 165 
++++++++++++++++++ .../iceberg/vortex/VortexArrowBridge.java | 56 +++++- .../iceberg/vortex/VortexFormatModel.java | 9 +- .../apache/iceberg/vortex/VortexIterable.java | 25 ++- .../vortex/TestVortexPositionDeletes.java | 46 +++++ 13 files changed, 1001 insertions(+), 5 deletions(-) create mode 100644 spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestVortexMergeOnReadDelete.java create mode 100644 spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkVortexReaderDeletes.java create mode 100644 spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestVortexMergeOnReadDelete.java create mode 100644 spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkVortexReaderDeletes.java create mode 100644 spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestVortexMergeOnReadDelete.java create mode 100644 spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkVortexReaderDeletes.java diff --git a/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestVortexMergeOnReadDelete.java b/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestVortexMergeOnReadDelete.java new file mode 100644 index 000000000000..95d21908bf7b --- /dev/null +++ b/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestVortexMergeOnReadDelete.java @@ -0,0 +1,91 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.spark.extensions; + +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; +import static org.apache.iceberg.TableProperties.DELETE_MODE; +import static org.apache.iceberg.TableProperties.FORMAT_VERSION; + +import org.apache.iceberg.Parameters; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.spark.SparkCatalogConfig; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.TestTemplate; + +/** + * End-to-end merge-on-read DELETE coverage for Vortex tables. A DELETE reads the data with the + * synthetic {@code _pos} column (wired through Vortex's {@code row_idx} expression) to compute the + * positions to delete, writes a position-delete file (a Vortex delete file for v2, a deletion vector + * for v3) through the format-model registry, and the subsequent read excludes the deleted rows via + * native scan pushdown. 
+ */ +public class TestVortexMergeOnReadDelete extends ExtensionsTestBase { + + @Parameters(name = "catalogName = {0}, implementation = {1}, config = {2}") + public static Object[][] parameters() { + return new Object[][] { + { + SparkCatalogConfig.HADOOP.catalogName(), + SparkCatalogConfig.HADOOP.implementation(), + SparkCatalogConfig.HADOOP.properties() + } + }; + } + + @AfterEach + public void removeTable() { + sql("DROP TABLE IF EXISTS %s", tableName); + } + + private void runMergeOnReadDelete(int formatVersion) { + sql( + "CREATE TABLE %s (id INT, dep STRING) USING iceberg " + + "TBLPROPERTIES ('%s'='vortex', '%s'='%d', '%s'='merge-on-read')", + tableName, DEFAULT_FILE_FORMAT, FORMAT_VERSION, formatVersion, DELETE_MODE); + + sql( + "INSERT INTO %s VALUES (1, 'a'), (2, 'b'), (3, 'c'), (4, 'd'), (5, 'e')", + tableName); + + sql("DELETE FROM %s WHERE id IN (2, 4)", tableName); + + assertEquals( + "Merge-on-read DELETE should exclude exactly the deleted rows", + ImmutableList.of(row(1, "a"), row(3, "c"), row(5, "e")), + sql("SELECT * FROM %s ORDER BY id", tableName)); + + // A second DELETE produces another set of positions against the same data file. 
+ sql("DELETE FROM %s WHERE id = 5", tableName); + + assertEquals( + "Subsequent merge-on-read DELETE should remove additional rows", + ImmutableList.of(row(1, "a"), row(3, "c")), + sql("SELECT * FROM %s ORDER BY id", tableName)); + } + + @TestTemplate + public void testMergeOnReadDeleteFormatV2() { + runMergeOnReadDelete(2); + } + + @TestTemplate + public void testMergeOnReadDeleteFormatV3() { + runMergeOnReadDelete(3); + } +} diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java index 505e64989ef4..79d136a48b22 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java @@ -34,6 +34,7 @@ import org.apache.iceberg.io.FileIO; import org.apache.iceberg.io.InputFile; import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.spark.OrcBatchReadConf; import org.apache.iceberg.spark.ParquetBatchReadConf; import org.apache.iceberg.spark.VortexBatchReadConf; @@ -93,6 +94,14 @@ protected CloseableIterable newBatchIterable( readBuilder = readBuilder.recordsPerBatch(vortexConf.batchSize()); } + if (readBuilder.supportsPositionDeletes()) { + // Vortex applies position deletes and residual filters natively inside the scan, so the + // post-scan BatchDeleteFilter (which derives row positions from a contiguous _pos column and + // is unsound once rows are filtered out during the scan) is bypassed entirely. 
+ return newPushdownBatchIterable( + readBuilder, start, length, residual, idToConstant, deleteFilter); + } + CloseableIterable iterable = readBuilder .project(deleteFilter.requiredSchema()) @@ -111,6 +120,39 @@ protected CloseableIterable newBatchIterable( return CloseableIterable.transform(iterable, new BatchDeleteFilter(deleteFilter)::filterBatch); } + // Reads from a format that applies position deletes (and residual filters) natively in the scan. + // Position deletes are pushed down so deleted rows are never materialized; only the expected + // output columns are projected. Equality deletes and the _deleted metadata column combined with + // delete files require post-scan processing that this path does not perform and are rejected. + private CloseableIterable newPushdownBatchIterable( + ReadBuilder readBuilder, + long start, + long length, + Expression residual, + Map idToConstant, + SparkDeleteFilter deleteFilter) { + boolean isDeletedProjected = + deleteFilter.requiredSchema().findField(MetadataColumns.IS_DELETED.fieldId()) != null; + boolean hasDeletes = deleteFilter.hasPosDeletes() || deleteFilter.hasEqDeletes(); + Preconditions.checkArgument( + !deleteFilter.hasEqDeletes(), "Equality deletes are not supported for Vortex reads"); + Preconditions.checkArgument( + !(isDeletedProjected && hasDeletes), + "The _deleted metadata column with delete files is not supported for Vortex reads"); + + deleteFilter.pushablePosDeletes().ifPresent(readBuilder::positionDeletes); + + return readBuilder + .project(deleteFilter.expectedSchema()) + .idToConstant(idToConstant) + .split(start, length) + .filter(residual) + .caseSensitive(caseSensitive()) + .reuseContainers() + .withNameMapping(nameMapping()) + .build(); + } + @VisibleForTesting static class BatchDeleteFilter { private final DeleteFilter deletes; diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkVortexReaderDeletes.java 
b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkVortexReaderDeletes.java new file mode 100644 index 000000000000..c1b87cc652fd --- /dev/null +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkVortexReaderDeletes.java @@ -0,0 +1,153 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.spark.source; + +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.PlanningMode; +import org.junit.jupiter.api.Assumptions; +import org.junit.jupiter.api.TestTemplate; + +/** + * Exercises the Spark columnar read path for Vortex tables with position deletes. + * + *

Vortex applies position deletes and residual filters natively in the scan (see {@code + * BaseBatchReader.newPushdownBatchIterable}), so the position-delete cases inherited from {@link + * TestSparkReaderDeletes} run unchanged. The equality-delete and {@code _deleted}-with-delete-files + * cases are skipped: the native pushdown path intentionally does not perform the post-scan + * processing those require. + */ +public class TestSparkVortexReaderDeletes extends TestSparkReaderDeletes { + + @Parameters(name = "fileFormat = {0}, formatVersion = {1}, vectorized = {2}, planningMode = {3}") + public static Object[][] parameters() { + return new Object[][] { + new Object[] {FileFormat.VORTEX, 2, true, PlanningMode.DISTRIBUTED}, + new Object[] {FileFormat.VORTEX, 3, true, PlanningMode.LOCAL}, + }; + } + + // Position deletes are dropped inside the Vortex scan, so they never reach Spark and are not + // reflected in the NumDeletes metric. Disable delete-count assertions for this path. + @Override + protected boolean countDeletes() { + return false; + } + + private static void skipUnsupported() { + Assumptions.abort( + "Vortex columnar reads apply position deletes and filters via native scan pushdown; " + + "equality deletes and the _deleted metadata column with delete files are not " + + "supported on this path"); + } + + // --- equality-delete cases inherited from DeleteReadTests --- + + @TestTemplate + @Override + public void testEqualityDeletes() { + skipUnsupported(); + } + + @TestTemplate + @Override + public void testEqualityDateDeletes() { + skipUnsupported(); + } + + @TestTemplate + @Override + public void testEqualityDeletesWithRequiredEqColumn() { + skipUnsupported(); + } + + @TestTemplate + @Override + public void testEqualityDeletesSpanningMultipleDataFiles() { + skipUnsupported(); + } + + @TestTemplate + @Override + public void testMixedPositionAndEqualityDeletes() { + skipUnsupported(); + } + + @TestTemplate + @Override + public void 
testMultipleEqualityDeleteSchemas() { + skipUnsupported(); + } + + @TestTemplate + @Override + public void testEqualityDeleteByNull() { + skipUnsupported(); + } + + @TestTemplate + @Override + public void testEqualityDeleteBinaryColumn() { + skipUnsupported(); + } + + @TestTemplate + @Override + public void testEqualityDeleteStructColumn() { + skipUnsupported(); + } + + // --- equality-delete and _deleted cases from TestSparkReaderDeletes --- + + @TestTemplate + @Override + public void testEqualityDeleteWithFilter() { + skipUnsupported(); + } + + @TestTemplate + @Override + public void testReadEqualityDeleteRows() { + skipUnsupported(); + } + + @TestTemplate + @Override + public void testPosDeletesWithDeletedColumn() { + skipUnsupported(); + } + + @TestTemplate + @Override + public void testEqualityDeleteWithDeletedColumn() { + skipUnsupported(); + } + + @TestTemplate + @Override + public void testMixedPosAndEqDeletesWithDeletedColumn() { + skipUnsupported(); + } + + @TestTemplate + @Override + public void testFilterOnDeletedMetadataColumn() { + skipUnsupported(); + } +} diff --git a/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestVortexMergeOnReadDelete.java b/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestVortexMergeOnReadDelete.java new file mode 100644 index 000000000000..95d21908bf7b --- /dev/null +++ b/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestVortexMergeOnReadDelete.java @@ -0,0 +1,91 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.spark.extensions; + +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; +import static org.apache.iceberg.TableProperties.DELETE_MODE; +import static org.apache.iceberg.TableProperties.FORMAT_VERSION; + +import org.apache.iceberg.Parameters; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.spark.SparkCatalogConfig; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.TestTemplate; + +/** + * End-to-end merge-on-read DELETE coverage for Vortex tables. A DELETE reads the data with the + * synthetic {@code _pos} column (wired through Vortex's {@code row_idx} expression) to compute the + * positions to delete, writes a position-delete file (a Vortex delete file for v2, a deletion vector + * for v3) through the format-model registry, and the subsequent read excludes the deleted rows via + * native scan pushdown. 
+ */ +public class TestVortexMergeOnReadDelete extends ExtensionsTestBase { + + @Parameters(name = "catalogName = {0}, implementation = {1}, config = {2}") + public static Object[][] parameters() { + return new Object[][] { + { + SparkCatalogConfig.HADOOP.catalogName(), + SparkCatalogConfig.HADOOP.implementation(), + SparkCatalogConfig.HADOOP.properties() + } + }; + } + + @AfterEach + public void removeTable() { + sql("DROP TABLE IF EXISTS %s", tableName); + } + + private void runMergeOnReadDelete(int formatVersion) { + sql( + "CREATE TABLE %s (id INT, dep STRING) USING iceberg " + + "TBLPROPERTIES ('%s'='vortex', '%s'='%d', '%s'='merge-on-read')", + tableName, DEFAULT_FILE_FORMAT, FORMAT_VERSION, formatVersion, DELETE_MODE); + + sql( + "INSERT INTO %s VALUES (1, 'a'), (2, 'b'), (3, 'c'), (4, 'd'), (5, 'e')", + tableName); + + sql("DELETE FROM %s WHERE id IN (2, 4)", tableName); + + assertEquals( + "Merge-on-read DELETE should exclude exactly the deleted rows", + ImmutableList.of(row(1, "a"), row(3, "c"), row(5, "e")), + sql("SELECT * FROM %s ORDER BY id", tableName)); + + // A second DELETE produces another set of positions against the same data file. 
+ sql("DELETE FROM %s WHERE id = 5", tableName); + + assertEquals( + "Subsequent merge-on-read DELETE should remove additional rows", + ImmutableList.of(row(1, "a"), row(3, "c")), + sql("SELECT * FROM %s ORDER BY id", tableName)); + } + + @TestTemplate + public void testMergeOnReadDeleteFormatV2() { + runMergeOnReadDelete(2); + } + + @TestTemplate + public void testMergeOnReadDeleteFormatV3() { + runMergeOnReadDelete(3); + } +} diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java index 505e64989ef4..79d136a48b22 100644 --- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java @@ -34,6 +34,7 @@ import org.apache.iceberg.io.FileIO; import org.apache.iceberg.io.InputFile; import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.spark.OrcBatchReadConf; import org.apache.iceberg.spark.ParquetBatchReadConf; import org.apache.iceberg.spark.VortexBatchReadConf; @@ -93,6 +94,14 @@ protected CloseableIterable newBatchIterable( readBuilder = readBuilder.recordsPerBatch(vortexConf.batchSize()); } + if (readBuilder.supportsPositionDeletes()) { + // Vortex applies position deletes and residual filters natively inside the scan, so the + // post-scan BatchDeleteFilter (which derives row positions from a contiguous _pos column and + // is unsound once rows are filtered out during the scan) is bypassed entirely. 
+ return newPushdownBatchIterable( + readBuilder, start, length, residual, idToConstant, deleteFilter); + } + CloseableIterable iterable = readBuilder .project(deleteFilter.requiredSchema()) @@ -111,6 +120,39 @@ protected CloseableIterable newBatchIterable( return CloseableIterable.transform(iterable, new BatchDeleteFilter(deleteFilter)::filterBatch); } + // Reads from a format that applies position deletes (and residual filters) natively in the scan. + // Position deletes are pushed down so deleted rows are never materialized; only the expected + // output columns are projected. Equality deletes and the _deleted metadata column combined with + // delete files require post-scan processing that this path does not perform and are rejected. + private CloseableIterable newPushdownBatchIterable( + ReadBuilder readBuilder, + long start, + long length, + Expression residual, + Map idToConstant, + SparkDeleteFilter deleteFilter) { + boolean isDeletedProjected = + deleteFilter.requiredSchema().findField(MetadataColumns.IS_DELETED.fieldId()) != null; + boolean hasDeletes = deleteFilter.hasPosDeletes() || deleteFilter.hasEqDeletes(); + Preconditions.checkArgument( + !deleteFilter.hasEqDeletes(), "Equality deletes are not supported for Vortex reads"); + Preconditions.checkArgument( + !(isDeletedProjected && hasDeletes), + "The _deleted metadata column with delete files is not supported for Vortex reads"); + + deleteFilter.pushablePosDeletes().ifPresent(readBuilder::positionDeletes); + + return readBuilder + .project(deleteFilter.expectedSchema()) + .idToConstant(idToConstant) + .split(start, length) + .filter(residual) + .caseSensitive(caseSensitive()) + .reuseContainers() + .withNameMapping(nameMapping()) + .build(); + } + @VisibleForTesting static class BatchDeleteFilter { private final DeleteFilter deletes; diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkVortexReaderDeletes.java 
b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkVortexReaderDeletes.java new file mode 100644 index 000000000000..c1b87cc652fd --- /dev/null +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkVortexReaderDeletes.java @@ -0,0 +1,153 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.spark.source; + +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.PlanningMode; +import org.junit.jupiter.api.Assumptions; +import org.junit.jupiter.api.TestTemplate; + +/** + * Exercises the Spark columnar read path for Vortex tables with position deletes. + * + *

Vortex applies position deletes and residual filters natively in the scan (see {@code + * BaseBatchReader.newPushdownBatchIterable}), so the position-delete cases inherited from {@link + * TestSparkReaderDeletes} run unchanged. The equality-delete and {@code _deleted}-with-delete-files + * cases are skipped: the native pushdown path intentionally does not perform the post-scan + * processing those require. + */ +public class TestSparkVortexReaderDeletes extends TestSparkReaderDeletes { + + @Parameters(name = "fileFormat = {0}, formatVersion = {1}, vectorized = {2}, planningMode = {3}") + public static Object[][] parameters() { + return new Object[][] { + new Object[] {FileFormat.VORTEX, 2, true, PlanningMode.DISTRIBUTED}, + new Object[] {FileFormat.VORTEX, 3, true, PlanningMode.LOCAL}, + }; + } + + // Position deletes are dropped inside the Vortex scan, so they never reach Spark and are not + // reflected in the NumDeletes metric. Disable delete-count assertions for this path. + @Override + protected boolean countDeletes() { + return false; + } + + private static void skipUnsupported() { + Assumptions.abort( + "Vortex columnar reads apply position deletes and filters via native scan pushdown; " + + "equality deletes and the _deleted metadata column with delete files are not " + + "supported on this path"); + } + + // --- equality-delete cases inherited from DeleteReadTests --- + + @TestTemplate + @Override + public void testEqualityDeletes() { + skipUnsupported(); + } + + @TestTemplate + @Override + public void testEqualityDateDeletes() { + skipUnsupported(); + } + + @TestTemplate + @Override + public void testEqualityDeletesWithRequiredEqColumn() { + skipUnsupported(); + } + + @TestTemplate + @Override + public void testEqualityDeletesSpanningMultipleDataFiles() { + skipUnsupported(); + } + + @TestTemplate + @Override + public void testMixedPositionAndEqualityDeletes() { + skipUnsupported(); + } + + @TestTemplate + @Override + public void 
testMultipleEqualityDeleteSchemas() { + skipUnsupported(); + } + + @TestTemplate + @Override + public void testEqualityDeleteByNull() { + skipUnsupported(); + } + + @TestTemplate + @Override + public void testEqualityDeleteBinaryColumn() { + skipUnsupported(); + } + + @TestTemplate + @Override + public void testEqualityDeleteStructColumn() { + skipUnsupported(); + } + + // --- equality-delete and _deleted cases from TestSparkReaderDeletes --- + + @TestTemplate + @Override + public void testEqualityDeleteWithFilter() { + skipUnsupported(); + } + + @TestTemplate + @Override + public void testReadEqualityDeleteRows() { + skipUnsupported(); + } + + @TestTemplate + @Override + public void testPosDeletesWithDeletedColumn() { + skipUnsupported(); + } + + @TestTemplate + @Override + public void testEqualityDeleteWithDeletedColumn() { + skipUnsupported(); + } + + @TestTemplate + @Override + public void testMixedPosAndEqDeletesWithDeletedColumn() { + skipUnsupported(); + } + + @TestTemplate + @Override + public void testFilterOnDeletedMetadataColumn() { + skipUnsupported(); + } +} diff --git a/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestVortexMergeOnReadDelete.java b/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestVortexMergeOnReadDelete.java new file mode 100644 index 000000000000..95d21908bf7b --- /dev/null +++ b/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestVortexMergeOnReadDelete.java @@ -0,0 +1,91 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.spark.extensions; + +import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT; +import static org.apache.iceberg.TableProperties.DELETE_MODE; +import static org.apache.iceberg.TableProperties.FORMAT_VERSION; + +import org.apache.iceberg.Parameters; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.spark.SparkCatalogConfig; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.TestTemplate; + +/** + * End-to-end merge-on-read DELETE coverage for Vortex tables. A DELETE reads the data with the + * synthetic {@code _pos} column (wired through Vortex's {@code row_idx} expression) to compute the + * positions to delete, writes a position-delete file (a Vortex delete file for v2, a deletion vector + * for v3) through the format-model registry, and the subsequent read excludes the deleted rows via + * native scan pushdown. 
+ */ +public class TestVortexMergeOnReadDelete extends ExtensionsTestBase { + + @Parameters(name = "catalogName = {0}, implementation = {1}, config = {2}") + public static Object[][] parameters() { + return new Object[][] { + { + SparkCatalogConfig.HADOOP.catalogName(), + SparkCatalogConfig.HADOOP.implementation(), + SparkCatalogConfig.HADOOP.properties() + } + }; + } + + @AfterEach + public void removeTable() { + sql("DROP TABLE IF EXISTS %s", tableName); + } + + private void runMergeOnReadDelete(int formatVersion) { + sql( + "CREATE TABLE %s (id INT, dep STRING) USING iceberg " + + "TBLPROPERTIES ('%s'='vortex', '%s'='%d', '%s'='merge-on-read')", + tableName, DEFAULT_FILE_FORMAT, FORMAT_VERSION, formatVersion, DELETE_MODE); + + sql( + "INSERT INTO %s VALUES (1, 'a'), (2, 'b'), (3, 'c'), (4, 'd'), (5, 'e')", + tableName); + + sql("DELETE FROM %s WHERE id IN (2, 4)", tableName); + + assertEquals( + "Merge-on-read DELETE should exclude exactly the deleted rows", + ImmutableList.of(row(1, "a"), row(3, "c"), row(5, "e")), + sql("SELECT * FROM %s ORDER BY id", tableName)); + + // A second DELETE produces another set of positions against the same data file. 
+ sql("DELETE FROM %s WHERE id = 5", tableName); + + assertEquals( + "Subsequent merge-on-read DELETE should remove additional rows", + ImmutableList.of(row(1, "a"), row(3, "c")), + sql("SELECT * FROM %s ORDER BY id", tableName)); + } + + @TestTemplate + public void testMergeOnReadDeleteFormatV2() { + runMergeOnReadDelete(2); + } + + @TestTemplate + public void testMergeOnReadDeleteFormatV3() { + runMergeOnReadDelete(3); + } +} diff --git a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java index fe0086caa232..8ff2e280fef8 100644 --- a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java +++ b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java @@ -34,6 +34,7 @@ import org.apache.iceberg.io.FileIO; import org.apache.iceberg.io.InputFile; import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.spark.OrcBatchReadConf; import org.apache.iceberg.spark.ParquetBatchReadConf; import org.apache.iceberg.spark.VortexBatchReadConf; @@ -85,6 +86,14 @@ protected CloseableIterable newBatchIterable( readBuilder = readBuilder.recordsPerBatch(vortexConf.batchSize()); } + if (readBuilder.supportsPositionDeletes()) { + // Vortex applies position deletes and residual filters natively inside the scan, so the + // post-scan BatchDeleteFilter (which derives row positions from a contiguous _pos column and + // is unsound once rows are filtered out during the scan) is bypassed entirely. 
+ return newPushdownBatchIterable( + readBuilder, start, length, residual, idToConstant, deleteFilter); + } + CloseableIterable iterable = readBuilder .project(deleteFilter.requiredSchema()) @@ -103,6 +112,39 @@ protected CloseableIterable newBatchIterable( return CloseableIterable.transform(iterable, new BatchDeleteFilter(deleteFilter)::filterBatch); } + // Reads from a format that applies position deletes (and residual filters) natively in the scan. + // Position deletes are pushed down so deleted rows are never materialized; only the expected + // output columns are projected. Equality deletes and the _deleted metadata column combined with + // delete files require post-scan processing that this path does not perform and are rejected. + private CloseableIterable newPushdownBatchIterable( + ReadBuilder readBuilder, + long start, + long length, + Expression residual, + Map idToConstant, + SparkDeleteFilter deleteFilter) { + boolean isDeletedProjected = + deleteFilter.requiredSchema().findField(MetadataColumns.IS_DELETED.fieldId()) != null; + boolean hasDeletes = deleteFilter.hasPosDeletes() || deleteFilter.hasEqDeletes(); + Preconditions.checkArgument( + !deleteFilter.hasEqDeletes(), "Equality deletes are not supported for Vortex reads"); + Preconditions.checkArgument( + !(isDeletedProjected && hasDeletes), + "The _deleted metadata column with delete files is not supported for Vortex reads"); + + deleteFilter.pushablePosDeletes().ifPresent(readBuilder::positionDeletes); + + return readBuilder + .project(deleteFilter.expectedSchema()) + .idToConstant(idToConstant) + .split(start, length) + .filter(residual) + .caseSensitive(caseSensitive()) + .reuseContainers() + .withNameMapping(nameMapping()) + .build(); + } + @VisibleForTesting static class BatchDeleteFilter { private final DeleteFilter deletes; diff --git a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkVortexReaderDeletes.java 
b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkVortexReaderDeletes.java new file mode 100644 index 000000000000..f77a9189cc25 --- /dev/null +++ b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkVortexReaderDeletes.java @@ -0,0 +1,165 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.spark.source; + +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Parameters; +import org.apache.iceberg.PlanningMode; +import org.junit.jupiter.api.Assumptions; +import org.junit.jupiter.api.TestTemplate; + +/** + * Exercises the Spark columnar read path for Vortex tables with position deletes. + * + *

Vortex applies position deletes and residual filters natively in the scan (see {@code + * BaseBatchReader.newPushdownBatchIterable}), so the position-delete cases inherited from {@link + * TestSparkReaderDeletes} run unchanged. The equality-delete and {@code _deleted}-with-delete-files + * cases are skipped: the native pushdown path intentionally does not perform the post-scan + * processing those require. + */ +public class TestSparkVortexReaderDeletes extends TestSparkReaderDeletes { + + @Parameters(name = "fileFormat = {0}, formatVersion = {1}, vectorized = {2}, planningMode = {3}") + public static Object[][] parameters() { + return new Object[][] { + new Object[] {FileFormat.VORTEX, 2, true, PlanningMode.DISTRIBUTED}, + new Object[] {FileFormat.VORTEX, 3, true, PlanningMode.LOCAL}, + }; + } + + // Position deletes are dropped inside the Vortex scan, so they never reach Spark and are not + // reflected in the NumDeletes metric. Disable delete-count assertions for this path. + @Override + protected boolean countDeletes() { + return false; + } + + private static void skipUnsupported() { + Assumptions.abort( + "Vortex columnar reads apply position deletes and filters via native scan pushdown; " + + "equality deletes and the _deleted metadata column with delete files are not " + + "supported on this path"); + } + + // --- equality-delete cases inherited from DeleteReadTests --- + + @TestTemplate + @Override + public void testEqualityDeletes() { + skipUnsupported(); + } + + @TestTemplate + @Override + public void testEqualityDateDeletes() { + skipUnsupported(); + } + + @TestTemplate + @Override + public void testEqualityDeletesWithRequiredEqColumn() { + skipUnsupported(); + } + + @TestTemplate + @Override + public void testEqualityDeletesSpanningMultipleDataFiles() { + skipUnsupported(); + } + + @TestTemplate + @Override + public void testMixedPositionAndEqualityDeletes() { + skipUnsupported(); + } + + @TestTemplate + @Override + public void 
testMultipleEqualityDeleteSchemas() { + skipUnsupported(); + } + + @TestTemplate + @Override + public void testEqualityDeleteByNull() { + skipUnsupported(); + } + + @TestTemplate + @Override + public void testEqualityDeleteBinaryColumn() { + skipUnsupported(); + } + + @TestTemplate + @Override + public void testEqualityDeleteStructColumn() { + skipUnsupported(); + } + + // --- equality-delete and _deleted cases from TestSparkReaderDeletes --- + + @TestTemplate + @Override + public void testEqualityDeleteWithFilter() { + skipUnsupported(); + } + + @TestTemplate + @Override + public void testReadEqualityDeleteRows() { + skipUnsupported(); + } + + @TestTemplate + @Override + public void testPosDeletesWithDeletedColumn() { + skipUnsupported(); + } + + @TestTemplate + @Override + public void testEqualityDeleteWithDeletedColumn() { + skipUnsupported(); + } + + @TestTemplate + @Override + public void testMixedPosAndEqDeletesWithDeletedColumn() { + skipUnsupported(); + } + + @TestTemplate + @Override + public void testFilterOnDeletedMetadataColumn() { + skipUnsupported(); + } + + @TestTemplate + @Override + public void testEqualityDeleteWithSchemaEvolution() { + skipUnsupported(); + } + + @TestTemplate + @Override + public void testEqualityDeletesAppliedWithCachedFieldReordering() { + skipUnsupported(); + } +} diff --git a/vortex/src/main/java/org/apache/iceberg/vortex/VortexArrowBridge.java b/vortex/src/main/java/org/apache/iceberg/vortex/VortexArrowBridge.java index 53bc9cb592b0..98512d4f7757 100644 --- a/vortex/src/main/java/org/apache/iceberg/vortex/VortexArrowBridge.java +++ b/vortex/src/main/java/org/apache/iceberg/vortex/VortexArrowBridge.java @@ -19,11 +19,16 @@ package org.apache.iceberg.vortex; import dev.vortex.arrow.ArrowAllocation; +import java.util.ArrayList; +import java.util.List; import org.apache.arrow.c.ArrowArray; import org.apache.arrow.c.ArrowSchema; import org.apache.arrow.c.Data; import org.apache.arrow.memory.BufferAllocator; import 
org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.BigIntVector; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.UInt8Vector; import org.apache.arrow.vector.VectorSchemaRoot; final class VortexArrowBridge { @@ -52,8 +57,57 @@ static VectorSchemaRoot importVortexRoot( try (ArrowArray arrowArray = ArrowArray.wrap(vortexArray.memoryAddress())) { ArrowSchema arrowSchema = ArrowSchema.wrap(vortexSchema.memoryAddress()); - return Data.importVectorSchemaRoot(arrowAllocator, arrowArray, arrowSchema, null); + VectorSchemaRoot imported = + Data.importVectorSchemaRoot(arrowAllocator, arrowArray, arrowSchema, null); + return normalizeUnsignedLongs(imported, arrowAllocator); } } } + + /** + * Rewrites unsigned 64-bit integer columns as signed ones. Vortex's {@code row_idx} expression + * (used to materialize the {@code _pos} metadata column) produces an unsigned int64 column, which + * Spark's {@code ArrowColumnVector} cannot read ({@code Int(64, false)} is unsupported). Row + * positions always fit in a signed long, so the values are copied verbatim into a signed vector. + * The original imported root is left untouched unless an unsigned column is present. + */ + private static VectorSchemaRoot normalizeUnsignedLongs( + VectorSchemaRoot imported, BufferAllocator allocator) { + boolean hasUnsigned = + imported.getFieldVectors().stream().anyMatch(vector -> vector instanceof UInt8Vector); + if (!hasUnsigned) { + return imported; + } + + List vectors = new ArrayList<>(imported.getFieldVectors().size()); + for (FieldVector vector : imported.getFieldVectors()) { + if (vector instanceof UInt8Vector) { + vectors.add(copyAsSigned((UInt8Vector) vector, allocator)); + // The unsigned source is replaced by the signed copy and is not part of the returned root, + // so release its buffers now. Remaining vectors are moved into the new root unchanged. 
+ vector.close(); + } else { + vectors.add(vector); + } + } + + VectorSchemaRoot result = new VectorSchemaRoot(vectors); + result.setRowCount(imported.getRowCount()); + return result; + } + + private static BigIntVector copyAsSigned(UInt8Vector source, BufferAllocator allocator) { + int count = source.getValueCount(); + BigIntVector signed = new BigIntVector(source.getField().getName(), allocator); + signed.allocateNew(count); + for (int i = 0; i < count; i++) { + if (source.isNull(i)) { + signed.setNull(i); + } else { + signed.set(i, source.getValueAsLong(i)); + } + } + signed.setValueCount(count); + return signed; + } } diff --git a/vortex/src/main/java/org/apache/iceberg/vortex/VortexFormatModel.java b/vortex/src/main/java/org/apache/iceberg/vortex/VortexFormatModel.java index f4f18af6ff31..23288531f6d5 100644 --- a/vortex/src/main/java/org/apache/iceberg/vortex/VortexFormatModel.java +++ b/vortex/src/main/java/org/apache/iceberg/vortex/VortexFormatModel.java @@ -425,8 +425,8 @@ public CloseableIterable build() { // Compute the columns to scan from the data file. Constants (identity partition values and // metadata columns such as _file, _spec_id and _partition) come from idToConstant, and // _is_deleted is synthesized by the reader, so none of those are projected from the file. - // _pos is also excluded and currently resolves to null: Vortex exposes row positions through - // a `row_idx` scan expression that the Java bindings (<= 0.73.0) do not yet surface. + // _pos is excluded here too, but when it is requested it is materialized separately from + // Vortex's `row_idx` scan expression (see VortexIterable) rather than read from the file. Map constants = idToConstant == null ? 
Collections.emptyMap() : idToConstant; List projection = schema.columns().stream() @@ -438,6 +438,10 @@ public CloseableIterable build() { .map(Types.NestedField::name) .toList(); + boolean includeRowPosition = + schema.findField(MetadataColumns.ROW_POSITION.fieldId()) != null + && !constants.containsKey(MetadataColumns.ROW_POSITION.fieldId()); + byte[] posDeleteBitmap = posDeletes == null ? null : toRoaringBitmap(posDeletes); return new VortexIterable<>( @@ -446,6 +450,7 @@ public CloseableIterable build() { filterPredicate, rowRange, posDeleteBitmap, + includeRowPosition, readerFunc, batchReaderFunc, caseSensitive, diff --git a/vortex/src/main/java/org/apache/iceberg/vortex/VortexIterable.java b/vortex/src/main/java/org/apache/iceberg/vortex/VortexIterable.java index a33de2f77fab..4b881c9b2673 100644 --- a/vortex/src/main/java/org/apache/iceberg/vortex/VortexIterable.java +++ b/vortex/src/main/java/org/apache/iceberg/vortex/VortexIterable.java @@ -26,6 +26,8 @@ import dev.vortex.api.Session; import dev.vortex.jni.NativeRuntime; import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import java.util.Map; import java.util.NoSuchElementException; @@ -36,7 +38,9 @@ import java.util.stream.Collectors; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; import org.apache.iceberg.MetadataColumns; import org.apache.iceberg.Schema; import org.apache.iceberg.expressions.Expression; @@ -126,6 +130,7 @@ public CloseableIterator iterator() { ImmutableList.Builder fieldNames = ImmutableList.builder(); ImmutableList.Builder expressions = ImmutableList.builder(); + org.apache.arrow.vector.types.pojo.Schema readerArrowSchema = fileArrowSchema; for (String name : projection) { if (fileColumns.contains(name)) { @@ -134,6 +139,7 @@ 
public CloseableIterator iterator() { } else if (Objects.equals(name, MetadataColumns.ROW_POSITION.name())) { fieldNames.add(name); expressions.add(dev.vortex.api.Expression.rowIdx()); + readerArrowSchema = appendRowPosition(fileArrowSchema); } } @@ -165,14 +171,29 @@ public CloseableIterator iterator() { new PartitionBatchIterator(scan, vortexAllocator, allocator); if (rowReaderFunc != null) { - VortexRowReader rowFunction = rowReaderFunc.apply(fileArrowSchema); + VortexRowReader rowFunction = rowReaderFunc.apply(readerArrowSchema); return new VortexRowIterator<>(batchIterator, rowFunction); } else { - VortexBatchReader batchTransform = batchReaderFunction.apply(fileArrowSchema); + VortexBatchReader batchTransform = batchReaderFunction.apply(readerArrowSchema); return new VortexBatchIterator<>(batchIterator, batchTransform); } } + /** + * Appends a required {@code _pos} (int64) field to an Arrow schema so readers can bind the + * synthetic row-position column produced by the {@code row_idx} scan projection. + */ + private static org.apache.arrow.vector.types.pojo.Schema appendRowPosition( + org.apache.arrow.vector.types.pojo.Schema base) { + List fields = new ArrayList<>(base.getFields()); + fields.add( + new Field( + MetadataColumns.ROW_POSITION.name(), + new FieldType(false, new ArrowType.Int(Long.SIZE, true), null), + null)); + return new org.apache.arrow.vector.types.pojo.Schema(fields, base.getCustomMetadata()); + } + /** Iterator that pulls Arrow {@link VectorSchemaRoot} batches across Vortex partitions. 
*/ static class PartitionBatchIterator implements CloseableIterator { private final Scan scan; diff --git a/vortex/src/test/java/org/apache/iceberg/vortex/TestVortexPositionDeletes.java b/vortex/src/test/java/org/apache/iceberg/vortex/TestVortexPositionDeletes.java index 5000eb62a0ae..31dfa082c84d 100644 --- a/vortex/src/test/java/org/apache/iceberg/vortex/TestVortexPositionDeletes.java +++ b/vortex/src/test/java/org/apache/iceberg/vortex/TestVortexPositionDeletes.java @@ -25,6 +25,7 @@ import java.util.List; import org.apache.iceberg.FileContent; import org.apache.iceberg.Files; +import org.apache.iceberg.MetadataColumns; import org.apache.iceberg.Schema; import org.apache.iceberg.data.GenericRecord; import org.apache.iceberg.data.Record; @@ -111,6 +112,51 @@ public void testPositionDeletesAppliedAlongsideRowRange() throws IOException { assertThat(ids).containsExactlyInAnyOrder(0, 1, 3, 4, 6, 8, 9); } + @Test + public void testRowPositionProjection() throws IOException { + InputFile file = writeRows(5); + + Schema projection = new Schema(SCHEMA.columns().get(0), MetadataColumns.ROW_POSITION); + + List ids = Lists.newArrayList(); + List positions = Lists.newArrayList(); + try (CloseableIterable reader = + formatModel().readBuilder(file).project(projection).build()) { + for (Record record : reader) { + ids.add((Integer) record.getField("id")); + positions.add((Long) record.getField(MetadataColumns.ROW_POSITION.name())); + } + } + + assertThat(ids).containsExactly(0, 1, 2, 3, 4); + assertThat(positions) + .as("_pos should resolve to file-relative row positions via row_idx") + .containsExactly(0L, 1L, 2L, 3L, 4L); + } + + @Test + public void testRowPositionWithPositionDeletes() throws IOException { + InputFile file = writeRows(5); + + // Delete positions 1 and 3; the surviving rows keep their original file positions. 
+ PositionDeleteIndex deletes = + Deletes.toPositionIndex(CloseableIterable.withNoopClose(Lists.newArrayList(1L, 3L))); + + Schema projection = new Schema(SCHEMA.columns().get(0), MetadataColumns.ROW_POSITION); + + List positions = Lists.newArrayList(); + try (CloseableIterable reader = + formatModel().readBuilder(file).project(projection).positionDeletes(deletes).build()) { + for (Record record : reader) { + positions.add((Long) record.getField(MetadataColumns.ROW_POSITION.name())); + } + } + + assertThat(positions) + .as("row_idx reports absolute file positions even after deletes are excluded") + .containsExactly(0L, 2L, 4L); + } + private InputFile writeRows(int count) throws IOException { OutputFile outputFile = Files.localOutput(temp.resolve("data-" + System.nanoTime() + ".vortex").toFile()); From 9bd86c14b4f1f9a89908a73ab9b9a718243f3f31 Mon Sep 17 00:00:00 2001 From: Robert Kruszewski Date: Mon, 15 Jun 2026 19:29:01 +0100 Subject: [PATCH 4/9] equality deletes --- .../TestVortexMergeOnReadDelete.java | 10 +- .../iceberg/spark/data/SparkVortexReader.java | 2 +- .../spark/data/SparkVortexValueReaders.java | 17 +++ .../iceberg/spark/source/BaseBatchReader.java | 93 +++++++++++++-- .../source/TestSparkVortexReaderDeletes.java | 100 +++------------- .../TestVortexMergeOnReadDelete.java | 10 +- .../iceberg/spark/data/SparkVortexReader.java | 2 +- .../spark/data/SparkVortexValueReaders.java | 17 +++ .../iceberg/spark/source/BaseBatchReader.java | 93 +++++++++++++-- .../source/TestSparkVortexReaderDeletes.java | 100 +++------------- .../TestVortexMergeOnReadDelete.java | 10 +- .../iceberg/spark/data/SparkVortexReader.java | 2 +- .../spark/data/SparkVortexValueReaders.java | 17 +++ .../iceberg/spark/source/BaseBatchReader.java | 93 +++++++++++++-- .../source/TestSparkVortexReaderDeletes.java | 112 +++--------------- 15 files changed, 381 insertions(+), 297 deletions(-) diff --git 
a/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestVortexMergeOnReadDelete.java b/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestVortexMergeOnReadDelete.java index 95d21908bf7b..5224198f681f 100644 --- a/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestVortexMergeOnReadDelete.java +++ b/spark/v3.5/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestVortexMergeOnReadDelete.java @@ -31,9 +31,9 @@ /** * End-to-end merge-on-read DELETE coverage for Vortex tables. A DELETE reads the data with the * synthetic {@code _pos} column (wired through Vortex's {@code row_idx} expression) to compute the - * positions to delete, writes a position-delete file (a Vortex delete file for v2, a deletion vector - * for v3) through the format-model registry, and the subsequent read excludes the deleted rows via - * native scan pushdown. + * positions to delete, writes a position-delete file (a Vortex delete file for v2, a deletion + * vector for v3) through the format-model registry, and the subsequent read excludes the deleted + * rows via native scan pushdown. 
*/ public class TestVortexMergeOnReadDelete extends ExtensionsTestBase { @@ -59,9 +59,7 @@ private void runMergeOnReadDelete(int formatVersion) { + "TBLPROPERTIES ('%s'='vortex', '%s'='%d', '%s'='merge-on-read')", tableName, DEFAULT_FILE_FORMAT, FORMAT_VERSION, formatVersion, DELETE_MODE); - sql( - "INSERT INTO %s VALUES (1, 'a'), (2, 'b'), (3, 'c'), (4, 'd'), (5, 'e')", - tableName); + sql("INSERT INTO %s VALUES (1, 'a'), (2, 'b'), (3, 'c'), (4, 'd'), (5, 'e')", tableName); sql("DELETE FROM %s WHERE id IN (2, 4)", tableName); diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/SparkVortexReader.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/SparkVortexReader.java index bf5c97964716..736f7199f9dd 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/SparkVortexReader.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/SparkVortexReader.java @@ -156,7 +156,7 @@ public VortexValueReader primitive(Type.PrimitiveType icebergType, Field prim case FLOAT -> GenericVortexReaders.floats(); case DOUBLE -> GenericVortexReaders.doubles(); case STRING -> SparkVortexValueReaders.utf8String(); - case BINARY -> GenericVortexReaders.bytes(); + case BINARY -> SparkVortexValueReaders.bytes(); case DECIMAL -> GenericVortexReaders.decimals(); case TIMESTAMP, TIMESTAMP_NANO -> { ArrowType.Timestamp ts = (ArrowType.Timestamp) primField.getType(); diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/SparkVortexValueReaders.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/SparkVortexValueReaders.java index 8ce5ce6d20c2..1a8df3c9b053 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/SparkVortexValueReaders.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/data/SparkVortexValueReaders.java @@ -28,6 +28,7 @@ import org.apache.arrow.vector.TimeMicroVector; import org.apache.arrow.vector.TimeNanoVector; import 
org.apache.arrow.vector.TimeStampVector; +import org.apache.arrow.vector.VarBinaryVector; import org.apache.arrow.vector.VarCharVector; import org.apache.arrow.vector.types.TimeUnit; import org.apache.arrow.vector.types.pojo.ArrowType; @@ -43,6 +44,11 @@ public static VortexValueReader utf8String() { return UTF8Reader.INSTANCE; } + public static VortexValueReader bytes() { + // Spark represents BinaryType as byte[], unlike the generic reader which yields a ByteBuffer. + return BytesReader.INSTANCE; + } + public static VortexValueReader date() { return DateReader.INSTANCE; } @@ -74,6 +80,17 @@ public UTF8String readNonNull(FieldVector vector, int row) { } } + static class BytesReader implements VortexValueReader { + static final BytesReader INSTANCE = new BytesReader(); + + private BytesReader() {} + + @Override + public byte[] readNonNull(FieldVector vector, int row) { + return ((VarBinaryVector) vector).get(row); + } + } + static class UuidReader implements VortexValueReader { static final UuidReader INSTANCE = new UuidReader(); diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java index 79d136a48b22..020c6ab37dda 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java @@ -18,6 +18,9 @@ */ package org.apache.iceberg.spark.source; +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.List; import java.util.Map; import javax.annotation.Nonnull; import org.apache.iceberg.FileFormat; @@ -27,6 +30,8 @@ import org.apache.iceberg.Schema; import org.apache.iceberg.Table; import org.apache.iceberg.data.DeleteFilter; +import org.apache.iceberg.deletes.Deletes; +import org.apache.iceberg.deletes.PositionDeleteIndex; import org.apache.iceberg.expressions.Expression; import 
org.apache.iceberg.formats.FormatModelRegistry; import org.apache.iceberg.formats.ReadBuilder; @@ -35,12 +40,14 @@ import org.apache.iceberg.io.InputFile; import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.spark.OrcBatchReadConf; import org.apache.iceberg.spark.ParquetBatchReadConf; import org.apache.iceberg.spark.VortexBatchReadConf; import org.apache.iceberg.spark.data.vectorized.ColumnVectorWithFilter; import org.apache.iceberg.spark.data.vectorized.ColumnarBatchUtil; import org.apache.iceberg.spark.data.vectorized.UpdatableDeletedColumnVector; +import org.apache.iceberg.types.Types; import org.apache.iceberg.util.Pair; import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.vectorized.ColumnVector; @@ -99,7 +106,7 @@ protected CloseableIterable newBatchIterable( // post-scan BatchDeleteFilter (which derives row positions from a contiguous _pos column and // is unsound once rows are filtered out during the scan) is bypassed entirely. return newPushdownBatchIterable( - readBuilder, start, length, residual, idToConstant, deleteFilter); + inputFile, readBuilder, start, length, residual, idToConstant, deleteFilter); } CloseableIterable iterable = @@ -121,10 +128,13 @@ protected CloseableIterable newBatchIterable( } // Reads from a format that applies position deletes (and residual filters) natively in the scan. - // Position deletes are pushed down so deleted rows are never materialized; only the expected - // output columns are projected. Equality deletes and the _deleted metadata column combined with - // delete files require post-scan processing that this path does not perform and are rejected. 
+ // Every delete is turned into file-relative positions and pushed down so deleted rows are never + // materialized: position deletes directly, and equality deletes by a pre-scan that resolves the + // matching rows to positions. Only the expected output columns are projected. The _deleted + // metadata column combined with delete files requires retaining and marking rows, which this + // drop-only path does not do, so that combination is rejected. private CloseableIterable newPushdownBatchIterable( + InputFile inputFile, ReadBuilder readBuilder, long start, long length, @@ -134,13 +144,15 @@ private CloseableIterable newPushdownBatchIterable( boolean isDeletedProjected = deleteFilter.requiredSchema().findField(MetadataColumns.IS_DELETED.fieldId()) != null; boolean hasDeletes = deleteFilter.hasPosDeletes() || deleteFilter.hasEqDeletes(); - Preconditions.checkArgument( - !deleteFilter.hasEqDeletes(), "Equality deletes are not supported for Vortex reads"); Preconditions.checkArgument( !(isDeletedProjected && hasDeletes), "The _deleted metadata column with delete files is not supported for Vortex reads"); - deleteFilter.pushablePosDeletes().ifPresent(readBuilder::positionDeletes); + PositionDeleteIndex deletePositions = + pushableDeletePositions(inputFile, start, length, residual, idToConstant, deleteFilter); + if (deletePositions.isNotEmpty()) { + readBuilder.positionDeletes(deletePositions); + } return readBuilder .project(deleteFilter.expectedSchema()) @@ -153,6 +165,73 @@ private CloseableIterable newPushdownBatchIterable( .build(); } + // Collects all rows to drop as file-relative positions: the position deletes for the file, plus + // the positions of rows matching equality deletes (resolved by a row-oriented pre-scan). 
+ private PositionDeleteIndex pushableDeletePositions( + InputFile inputFile, + long start, + long length, + Expression residual, + Map idToConstant, + SparkDeleteFilter deleteFilter) { + PositionDeleteIndex positions = Deletes.toPositionIndex(CloseableIterable.empty()); + + PositionDeleteIndex posDeletes = deleteFilter.deletedRowPositions(); + if (posDeletes != null) { + posDeletes.forEach(positions::delete); + } + + if (deleteFilter.hasEqDeletes()) { + addEqualityDeletePositions( + positions, inputFile, start, length, residual, idToConstant, deleteFilter); + } + + return positions; + } + + // Pre-scans the data file as rows projecting the equality-delete columns and _pos, evaluates the + // equality-delete predicate, and records the file position of every matching row. This lets + // equality deletes ride the same native position pushdown as position deletes. + private void addEqualityDeletePositions( + PositionDeleteIndex positions, + InputFile inputFile, + long start, + long length, + Expression residual, + Map idToConstant, + SparkDeleteFilter deleteFilter) { + Schema requiredSchema = deleteFilter.requiredSchema(); + Schema scanSchema = requiredSchema; + if (requiredSchema.findField(MetadataColumns.ROW_POSITION.fieldId()) == null) { + // The equality-delete predicate binds against requiredSchema; appending _pos at the end keeps + // those columns aligned while exposing the position for each matching row. 
+ List columns = Lists.newArrayList(requiredSchema.columns()); + columns.add(MetadataColumns.ROW_POSITION); + scanSchema = new Schema(columns); + } + int rowPositionIndex = scanSchema.columns().indexOf(MetadataColumns.ROW_POSITION); + + ReadBuilder rowReadBuilder = + FormatModelRegistry.readBuilder(FileFormat.VORTEX, InternalRow.class, inputFile); + try (CloseableIterable rows = + rowReadBuilder + .project(scanSchema) + .idToConstant(idToConstant) + .split(start, length) + .filter(residual) + .caseSensitive(caseSensitive()) + .withNameMapping(nameMapping()) + .build(); + CloseableIterable equalityDeleted = + deleteFilter.findEqualityDeleteRows(rows)) { + for (InternalRow row : equalityDeleted) { + positions.delete(row.getLong(rowPositionIndex)); + } + } catch (IOException e) { + throw new UncheckedIOException("Failed to resolve equality-delete positions for Vortex", e); + } + } + @VisibleForTesting static class BatchDeleteFilter { private final DeleteFilter deletes; diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkVortexReaderDeletes.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkVortexReaderDeletes.java index c1b87cc652fd..52a21150d79b 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkVortexReaderDeletes.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkVortexReaderDeletes.java @@ -25,13 +25,14 @@ import org.junit.jupiter.api.TestTemplate; /** - * Exercises the Spark columnar read path for Vortex tables with position deletes. + * Exercises the Spark columnar read path for Vortex tables with deletes. * - *
<p>
Vortex applies position deletes and residual filters natively in the scan (see {@code - * BaseBatchReader.newPushdownBatchIterable}), so the position-delete cases inherited from {@link - * TestSparkReaderDeletes} run unchanged. The equality-delete and {@code _deleted}-with-delete-files - * cases are skipped: the native pushdown path intentionally does not perform the post-scan - * processing those require. + *
<p>
Vortex applies all deletes (and residual filters) natively in the scan (see {@code + * BaseBatchReader.newPushdownBatchIterable}): position deletes are pushed directly, and equality + * deletes are resolved to positions by a pre-scan and pushed as well. So the position- and + * equality-delete cases inherited from {@link TestSparkReaderDeletes} run unchanged. Only the + * {@code _deleted}-metadata-column-with-delete-files cases are skipped: that requires retaining and + * marking deleted rows, which the drop-only pushdown path does not do. */ public class TestSparkVortexReaderDeletes extends TestSparkReaderDeletes { @@ -43,111 +44,48 @@ public static Object[][] parameters() { }; } - // Position deletes are dropped inside the Vortex scan, so they never reach Spark and are not - // reflected in the NumDeletes metric. Disable delete-count assertions for this path. + // Deletes are dropped inside the Vortex scan, so they never reach Spark and are not reflected in + // the NumDeletes metric. Disable delete-count assertions for this path. 
@Override protected boolean countDeletes() { return false; } - private static void skipUnsupported() { + private static void skipDeletedColumn() { Assumptions.abort( - "Vortex columnar reads apply position deletes and filters via native scan pushdown; " - + "equality deletes and the _deleted metadata column with delete files are not " - + "supported on this path"); - } - - // --- equality-delete cases inherited from DeleteReadTests --- - - @TestTemplate - @Override - public void testEqualityDeletes() { - skipUnsupported(); - } - - @TestTemplate - @Override - public void testEqualityDateDeletes() { - skipUnsupported(); - } - - @TestTemplate - @Override - public void testEqualityDeletesWithRequiredEqColumn() { - skipUnsupported(); - } - - @TestTemplate - @Override - public void testEqualityDeletesSpanningMultipleDataFiles() { - skipUnsupported(); - } - - @TestTemplate - @Override - public void testMixedPositionAndEqualityDeletes() { - skipUnsupported(); - } - - @TestTemplate - @Override - public void testMultipleEqualityDeleteSchemas() { - skipUnsupported(); - } - - @TestTemplate - @Override - public void testEqualityDeleteByNull() { - skipUnsupported(); - } - - @TestTemplate - @Override - public void testEqualityDeleteBinaryColumn() { - skipUnsupported(); - } - - @TestTemplate - @Override - public void testEqualityDeleteStructColumn() { - skipUnsupported(); - } - - // --- equality-delete and _deleted cases from TestSparkReaderDeletes --- - - @TestTemplate - @Override - public void testEqualityDeleteWithFilter() { - skipUnsupported(); + "Vortex applies deletes via native scan pushdown (drop-only); the _deleted metadata column " + + "with delete files requires retaining and marking rows, which is not supported"); } @TestTemplate @Override public void testReadEqualityDeleteRows() { - skipUnsupported(); + // Uses EqualityDeleteRowReader with byte-range task planning; Vortex interprets split ranges as + // row positions, which is a separate limitation from 
equality-delete read support. + Assumptions.abort("EqualityDeleteRowReader uses byte-range splits, unsupported by Vortex"); } @TestTemplate @Override public void testPosDeletesWithDeletedColumn() { - skipUnsupported(); + skipDeletedColumn(); } @TestTemplate @Override public void testEqualityDeleteWithDeletedColumn() { - skipUnsupported(); + skipDeletedColumn(); } @TestTemplate @Override public void testMixedPosAndEqDeletesWithDeletedColumn() { - skipUnsupported(); + skipDeletedColumn(); } @TestTemplate @Override public void testFilterOnDeletedMetadataColumn() { - skipUnsupported(); + skipDeletedColumn(); } } diff --git a/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestVortexMergeOnReadDelete.java b/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestVortexMergeOnReadDelete.java index 95d21908bf7b..5224198f681f 100644 --- a/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestVortexMergeOnReadDelete.java +++ b/spark/v4.0/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestVortexMergeOnReadDelete.java @@ -31,9 +31,9 @@ /** * End-to-end merge-on-read DELETE coverage for Vortex tables. A DELETE reads the data with the * synthetic {@code _pos} column (wired through Vortex's {@code row_idx} expression) to compute the - * positions to delete, writes a position-delete file (a Vortex delete file for v2, a deletion vector - * for v3) through the format-model registry, and the subsequent read excludes the deleted rows via - * native scan pushdown. + * positions to delete, writes a position-delete file (a Vortex delete file for v2, a deletion + * vector for v3) through the format-model registry, and the subsequent read excludes the deleted + * rows via native scan pushdown. 
*/ public class TestVortexMergeOnReadDelete extends ExtensionsTestBase { @@ -59,9 +59,7 @@ private void runMergeOnReadDelete(int formatVersion) { + "TBLPROPERTIES ('%s'='vortex', '%s'='%d', '%s'='merge-on-read')", tableName, DEFAULT_FILE_FORMAT, FORMAT_VERSION, formatVersion, DELETE_MODE); - sql( - "INSERT INTO %s VALUES (1, 'a'), (2, 'b'), (3, 'c'), (4, 'd'), (5, 'e')", - tableName); + sql("INSERT INTO %s VALUES (1, 'a'), (2, 'b'), (3, 'c'), (4, 'd'), (5, 'e')", tableName); sql("DELETE FROM %s WHERE id IN (2, 4)", tableName); diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkVortexReader.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkVortexReader.java index bf5c97964716..736f7199f9dd 100644 --- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkVortexReader.java +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkVortexReader.java @@ -156,7 +156,7 @@ public VortexValueReader primitive(Type.PrimitiveType icebergType, Field prim case FLOAT -> GenericVortexReaders.floats(); case DOUBLE -> GenericVortexReaders.doubles(); case STRING -> SparkVortexValueReaders.utf8String(); - case BINARY -> GenericVortexReaders.bytes(); + case BINARY -> SparkVortexValueReaders.bytes(); case DECIMAL -> GenericVortexReaders.decimals(); case TIMESTAMP, TIMESTAMP_NANO -> { ArrowType.Timestamp ts = (ArrowType.Timestamp) primField.getType(); diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkVortexValueReaders.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkVortexValueReaders.java index 8ce5ce6d20c2..1a8df3c9b053 100644 --- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkVortexValueReaders.java +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/data/SparkVortexValueReaders.java @@ -28,6 +28,7 @@ import org.apache.arrow.vector.TimeMicroVector; import org.apache.arrow.vector.TimeNanoVector; import 
org.apache.arrow.vector.TimeStampVector; +import org.apache.arrow.vector.VarBinaryVector; import org.apache.arrow.vector.VarCharVector; import org.apache.arrow.vector.types.TimeUnit; import org.apache.arrow.vector.types.pojo.ArrowType; @@ -43,6 +44,11 @@ public static VortexValueReader utf8String() { return UTF8Reader.INSTANCE; } + public static VortexValueReader bytes() { + // Spark represents BinaryType as byte[], unlike the generic reader which yields a ByteBuffer. + return BytesReader.INSTANCE; + } + public static VortexValueReader date() { return DateReader.INSTANCE; } @@ -74,6 +80,17 @@ public UTF8String readNonNull(FieldVector vector, int row) { } } + static class BytesReader implements VortexValueReader { + static final BytesReader INSTANCE = new BytesReader(); + + private BytesReader() {} + + @Override + public byte[] readNonNull(FieldVector vector, int row) { + return ((VarBinaryVector) vector).get(row); + } + } + static class UuidReader implements VortexValueReader { static final UuidReader INSTANCE = new UuidReader(); diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java index 79d136a48b22..020c6ab37dda 100644 --- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java @@ -18,6 +18,9 @@ */ package org.apache.iceberg.spark.source; +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.List; import java.util.Map; import javax.annotation.Nonnull; import org.apache.iceberg.FileFormat; @@ -27,6 +30,8 @@ import org.apache.iceberg.Schema; import org.apache.iceberg.Table; import org.apache.iceberg.data.DeleteFilter; +import org.apache.iceberg.deletes.Deletes; +import org.apache.iceberg.deletes.PositionDeleteIndex; import org.apache.iceberg.expressions.Expression; import 
org.apache.iceberg.formats.FormatModelRegistry; import org.apache.iceberg.formats.ReadBuilder; @@ -35,12 +40,14 @@ import org.apache.iceberg.io.InputFile; import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.spark.OrcBatchReadConf; import org.apache.iceberg.spark.ParquetBatchReadConf; import org.apache.iceberg.spark.VortexBatchReadConf; import org.apache.iceberg.spark.data.vectorized.ColumnVectorWithFilter; import org.apache.iceberg.spark.data.vectorized.ColumnarBatchUtil; import org.apache.iceberg.spark.data.vectorized.UpdatableDeletedColumnVector; +import org.apache.iceberg.types.Types; import org.apache.iceberg.util.Pair; import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.vectorized.ColumnVector; @@ -99,7 +106,7 @@ protected CloseableIterable newBatchIterable( // post-scan BatchDeleteFilter (which derives row positions from a contiguous _pos column and // is unsound once rows are filtered out during the scan) is bypassed entirely. return newPushdownBatchIterable( - readBuilder, start, length, residual, idToConstant, deleteFilter); + inputFile, readBuilder, start, length, residual, idToConstant, deleteFilter); } CloseableIterable iterable = @@ -121,10 +128,13 @@ protected CloseableIterable newBatchIterable( } // Reads from a format that applies position deletes (and residual filters) natively in the scan. - // Position deletes are pushed down so deleted rows are never materialized; only the expected - // output columns are projected. Equality deletes and the _deleted metadata column combined with - // delete files require post-scan processing that this path does not perform and are rejected. 
+ // Every delete is turned into file-relative positions and pushed down so deleted rows are never + // materialized: position deletes directly, and equality deletes by a pre-scan that resolves the + // matching rows to positions. Only the expected output columns are projected. The _deleted + // metadata column combined with delete files requires retaining and marking rows, which this + // drop-only path does not do, so that combination is rejected. private CloseableIterable newPushdownBatchIterable( + InputFile inputFile, ReadBuilder readBuilder, long start, long length, @@ -134,13 +144,15 @@ private CloseableIterable newPushdownBatchIterable( boolean isDeletedProjected = deleteFilter.requiredSchema().findField(MetadataColumns.IS_DELETED.fieldId()) != null; boolean hasDeletes = deleteFilter.hasPosDeletes() || deleteFilter.hasEqDeletes(); - Preconditions.checkArgument( - !deleteFilter.hasEqDeletes(), "Equality deletes are not supported for Vortex reads"); Preconditions.checkArgument( !(isDeletedProjected && hasDeletes), "The _deleted metadata column with delete files is not supported for Vortex reads"); - deleteFilter.pushablePosDeletes().ifPresent(readBuilder::positionDeletes); + PositionDeleteIndex deletePositions = + pushableDeletePositions(inputFile, start, length, residual, idToConstant, deleteFilter); + if (deletePositions.isNotEmpty()) { + readBuilder.positionDeletes(deletePositions); + } return readBuilder .project(deleteFilter.expectedSchema()) @@ -153,6 +165,73 @@ private CloseableIterable newPushdownBatchIterable( .build(); } + // Collects all rows to drop as file-relative positions: the position deletes for the file, plus + // the positions of rows matching equality deletes (resolved by a row-oriented pre-scan). 
+ private PositionDeleteIndex pushableDeletePositions( + InputFile inputFile, + long start, + long length, + Expression residual, + Map idToConstant, + SparkDeleteFilter deleteFilter) { + PositionDeleteIndex positions = Deletes.toPositionIndex(CloseableIterable.empty()); + + PositionDeleteIndex posDeletes = deleteFilter.deletedRowPositions(); + if (posDeletes != null) { + posDeletes.forEach(positions::delete); + } + + if (deleteFilter.hasEqDeletes()) { + addEqualityDeletePositions( + positions, inputFile, start, length, residual, idToConstant, deleteFilter); + } + + return positions; + } + + // Pre-scans the data file as rows projecting the equality-delete columns and _pos, evaluates the + // equality-delete predicate, and records the file position of every matching row. This lets + // equality deletes ride the same native position pushdown as position deletes. + private void addEqualityDeletePositions( + PositionDeleteIndex positions, + InputFile inputFile, + long start, + long length, + Expression residual, + Map idToConstant, + SparkDeleteFilter deleteFilter) { + Schema requiredSchema = deleteFilter.requiredSchema(); + Schema scanSchema = requiredSchema; + if (requiredSchema.findField(MetadataColumns.ROW_POSITION.fieldId()) == null) { + // The equality-delete predicate binds against requiredSchema; appending _pos at the end keeps + // those columns aligned while exposing the position for each matching row. 
+ List columns = Lists.newArrayList(requiredSchema.columns()); + columns.add(MetadataColumns.ROW_POSITION); + scanSchema = new Schema(columns); + } + int rowPositionIndex = scanSchema.columns().indexOf(MetadataColumns.ROW_POSITION); + + ReadBuilder rowReadBuilder = + FormatModelRegistry.readBuilder(FileFormat.VORTEX, InternalRow.class, inputFile); + try (CloseableIterable rows = + rowReadBuilder + .project(scanSchema) + .idToConstant(idToConstant) + .split(start, length) + .filter(residual) + .caseSensitive(caseSensitive()) + .withNameMapping(nameMapping()) + .build(); + CloseableIterable equalityDeleted = + deleteFilter.findEqualityDeleteRows(rows)) { + for (InternalRow row : equalityDeleted) { + positions.delete(row.getLong(rowPositionIndex)); + } + } catch (IOException e) { + throw new UncheckedIOException("Failed to resolve equality-delete positions for Vortex", e); + } + } + @VisibleForTesting static class BatchDeleteFilter { private final DeleteFilter deletes; diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkVortexReaderDeletes.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkVortexReaderDeletes.java index c1b87cc652fd..52a21150d79b 100644 --- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkVortexReaderDeletes.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkVortexReaderDeletes.java @@ -25,13 +25,14 @@ import org.junit.jupiter.api.TestTemplate; /** - * Exercises the Spark columnar read path for Vortex tables with position deletes. + * Exercises the Spark columnar read path for Vortex tables with deletes. * - *

Vortex applies position deletes and residual filters natively in the scan (see {@code - * BaseBatchReader.newPushdownBatchIterable}), so the position-delete cases inherited from {@link - * TestSparkReaderDeletes} run unchanged. The equality-delete and {@code _deleted}-with-delete-files - * cases are skipped: the native pushdown path intentionally does not perform the post-scan - * processing those require. + *

Vortex applies all deletes (and residual filters) natively in the scan (see {@code + * BaseBatchReader.newPushdownBatchIterable}): position deletes are pushed directly, and equality + * deletes are resolved to positions by a pre-scan and pushed as well. So the position- and + * equality-delete cases inherited from {@link TestSparkReaderDeletes} run unchanged. Only the + * {@code _deleted}-metadata-column-with-delete-files cases are skipped: that requires retaining and + * marking deleted rows, which the drop-only pushdown path does not do. */ public class TestSparkVortexReaderDeletes extends TestSparkReaderDeletes { @@ -43,111 +44,48 @@ public static Object[][] parameters() { }; } - // Position deletes are dropped inside the Vortex scan, so they never reach Spark and are not - // reflected in the NumDeletes metric. Disable delete-count assertions for this path. + // Deletes are dropped inside the Vortex scan, so they never reach Spark and are not reflected in + // the NumDeletes metric. Disable delete-count assertions for this path. 
@Override protected boolean countDeletes() { return false; } - private static void skipUnsupported() { + private static void skipDeletedColumn() { Assumptions.abort( - "Vortex columnar reads apply position deletes and filters via native scan pushdown; " - + "equality deletes and the _deleted metadata column with delete files are not " - + "supported on this path"); - } - - // --- equality-delete cases inherited from DeleteReadTests --- - - @TestTemplate - @Override - public void testEqualityDeletes() { - skipUnsupported(); - } - - @TestTemplate - @Override - public void testEqualityDateDeletes() { - skipUnsupported(); - } - - @TestTemplate - @Override - public void testEqualityDeletesWithRequiredEqColumn() { - skipUnsupported(); - } - - @TestTemplate - @Override - public void testEqualityDeletesSpanningMultipleDataFiles() { - skipUnsupported(); - } - - @TestTemplate - @Override - public void testMixedPositionAndEqualityDeletes() { - skipUnsupported(); - } - - @TestTemplate - @Override - public void testMultipleEqualityDeleteSchemas() { - skipUnsupported(); - } - - @TestTemplate - @Override - public void testEqualityDeleteByNull() { - skipUnsupported(); - } - - @TestTemplate - @Override - public void testEqualityDeleteBinaryColumn() { - skipUnsupported(); - } - - @TestTemplate - @Override - public void testEqualityDeleteStructColumn() { - skipUnsupported(); - } - - // --- equality-delete and _deleted cases from TestSparkReaderDeletes --- - - @TestTemplate - @Override - public void testEqualityDeleteWithFilter() { - skipUnsupported(); + "Vortex applies deletes via native scan pushdown (drop-only); the _deleted metadata column " + + "with delete files requires retaining and marking rows, which is not supported"); } @TestTemplate @Override public void testReadEqualityDeleteRows() { - skipUnsupported(); + // Uses EqualityDeleteRowReader with byte-range task planning; Vortex interprets split ranges as + // row positions, which is a separate limitation from 
equality-delete read support. + Assumptions.abort("EqualityDeleteRowReader uses byte-range splits, unsupported by Vortex"); } @TestTemplate @Override public void testPosDeletesWithDeletedColumn() { - skipUnsupported(); + skipDeletedColumn(); } @TestTemplate @Override public void testEqualityDeleteWithDeletedColumn() { - skipUnsupported(); + skipDeletedColumn(); } @TestTemplate @Override public void testMixedPosAndEqDeletesWithDeletedColumn() { - skipUnsupported(); + skipDeletedColumn(); } @TestTemplate @Override public void testFilterOnDeletedMetadataColumn() { - skipUnsupported(); + skipDeletedColumn(); } } diff --git a/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestVortexMergeOnReadDelete.java b/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestVortexMergeOnReadDelete.java index 95d21908bf7b..5224198f681f 100644 --- a/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestVortexMergeOnReadDelete.java +++ b/spark/v4.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestVortexMergeOnReadDelete.java @@ -31,9 +31,9 @@ /** * End-to-end merge-on-read DELETE coverage for Vortex tables. A DELETE reads the data with the * synthetic {@code _pos} column (wired through Vortex's {@code row_idx} expression) to compute the - * positions to delete, writes a position-delete file (a Vortex delete file for v2, a deletion vector - * for v3) through the format-model registry, and the subsequent read excludes the deleted rows via - * native scan pushdown. + * positions to delete, writes a position-delete file (a Vortex delete file for v2, a deletion + * vector for v3) through the format-model registry, and the subsequent read excludes the deleted + * rows via native scan pushdown. 
*/ public class TestVortexMergeOnReadDelete extends ExtensionsTestBase { @@ -59,9 +59,7 @@ private void runMergeOnReadDelete(int formatVersion) { + "TBLPROPERTIES ('%s'='vortex', '%s'='%d', '%s'='merge-on-read')", tableName, DEFAULT_FILE_FORMAT, FORMAT_VERSION, formatVersion, DELETE_MODE); - sql( - "INSERT INTO %s VALUES (1, 'a'), (2, 'b'), (3, 'c'), (4, 'd'), (5, 'e')", - tableName); + sql("INSERT INTO %s VALUES (1, 'a'), (2, 'b'), (3, 'c'), (4, 'd'), (5, 'e')", tableName); sql("DELETE FROM %s WHERE id IN (2, 4)", tableName); diff --git a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkVortexReader.java b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkVortexReader.java index bf5c97964716..736f7199f9dd 100644 --- a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkVortexReader.java +++ b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkVortexReader.java @@ -156,7 +156,7 @@ public VortexValueReader primitive(Type.PrimitiveType icebergType, Field prim case FLOAT -> GenericVortexReaders.floats(); case DOUBLE -> GenericVortexReaders.doubles(); case STRING -> SparkVortexValueReaders.utf8String(); - case BINARY -> GenericVortexReaders.bytes(); + case BINARY -> SparkVortexValueReaders.bytes(); case DECIMAL -> GenericVortexReaders.decimals(); case TIMESTAMP, TIMESTAMP_NANO -> { ArrowType.Timestamp ts = (ArrowType.Timestamp) primField.getType(); diff --git a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkVortexValueReaders.java b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkVortexValueReaders.java index 8ce5ce6d20c2..1a8df3c9b053 100644 --- a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkVortexValueReaders.java +++ b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/data/SparkVortexValueReaders.java @@ -28,6 +28,7 @@ import org.apache.arrow.vector.TimeMicroVector; import org.apache.arrow.vector.TimeNanoVector; import 
org.apache.arrow.vector.TimeStampVector; +import org.apache.arrow.vector.VarBinaryVector; import org.apache.arrow.vector.VarCharVector; import org.apache.arrow.vector.types.TimeUnit; import org.apache.arrow.vector.types.pojo.ArrowType; @@ -43,6 +44,11 @@ public static VortexValueReader utf8String() { return UTF8Reader.INSTANCE; } + public static VortexValueReader bytes() { + // Spark represents BinaryType as byte[], unlike the generic reader which yields a ByteBuffer. + return BytesReader.INSTANCE; + } + public static VortexValueReader date() { return DateReader.INSTANCE; } @@ -74,6 +80,17 @@ public UTF8String readNonNull(FieldVector vector, int row) { } } + static class BytesReader implements VortexValueReader { + static final BytesReader INSTANCE = new BytesReader(); + + private BytesReader() {} + + @Override + public byte[] readNonNull(FieldVector vector, int row) { + return ((VarBinaryVector) vector).get(row); + } + } + static class UuidReader implements VortexValueReader { static final UuidReader INSTANCE = new UuidReader(); diff --git a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java index 8ff2e280fef8..b86f8f984163 100644 --- a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java +++ b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java @@ -18,6 +18,9 @@ */ package org.apache.iceberg.spark.source; +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.List; import java.util.Map; import javax.annotation.Nonnull; import org.apache.iceberg.FileFormat; @@ -27,6 +30,8 @@ import org.apache.iceberg.Schema; import org.apache.iceberg.Table; import org.apache.iceberg.data.DeleteFilter; +import org.apache.iceberg.deletes.Deletes; +import org.apache.iceberg.deletes.PositionDeleteIndex; import org.apache.iceberg.expressions.Expression; import 
org.apache.iceberg.formats.FormatModelRegistry; import org.apache.iceberg.formats.ReadBuilder; @@ -35,12 +40,14 @@ import org.apache.iceberg.io.InputFile; import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.spark.OrcBatchReadConf; import org.apache.iceberg.spark.ParquetBatchReadConf; import org.apache.iceberg.spark.VortexBatchReadConf; import org.apache.iceberg.spark.data.vectorized.ColumnVectorWithFilter; import org.apache.iceberg.spark.data.vectorized.ColumnarBatchUtil; import org.apache.iceberg.spark.data.vectorized.UpdatableDeletedColumnVector; +import org.apache.iceberg.types.Types; import org.apache.iceberg.util.Pair; import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.vectorized.ColumnVector; @@ -91,7 +98,7 @@ protected CloseableIterable newBatchIterable( // post-scan BatchDeleteFilter (which derives row positions from a contiguous _pos column and // is unsound once rows are filtered out during the scan) is bypassed entirely. return newPushdownBatchIterable( - readBuilder, start, length, residual, idToConstant, deleteFilter); + inputFile, readBuilder, start, length, residual, idToConstant, deleteFilter); } CloseableIterable iterable = @@ -113,10 +120,13 @@ protected CloseableIterable newBatchIterable( } // Reads from a format that applies position deletes (and residual filters) natively in the scan. - // Position deletes are pushed down so deleted rows are never materialized; only the expected - // output columns are projected. Equality deletes and the _deleted metadata column combined with - // delete files require post-scan processing that this path does not perform and are rejected. 
+ // Every delete is turned into file-relative positions and pushed down so deleted rows are never + // materialized: position deletes directly, and equality deletes by a pre-scan that resolves the + // matching rows to positions. Only the expected output columns are projected. The _deleted + // metadata column combined with delete files requires retaining and marking rows, which this + // drop-only path does not do, so that combination is rejected. private CloseableIterable newPushdownBatchIterable( + InputFile inputFile, ReadBuilder readBuilder, long start, long length, @@ -126,13 +136,15 @@ private CloseableIterable newPushdownBatchIterable( boolean isDeletedProjected = deleteFilter.requiredSchema().findField(MetadataColumns.IS_DELETED.fieldId()) != null; boolean hasDeletes = deleteFilter.hasPosDeletes() || deleteFilter.hasEqDeletes(); - Preconditions.checkArgument( - !deleteFilter.hasEqDeletes(), "Equality deletes are not supported for Vortex reads"); Preconditions.checkArgument( !(isDeletedProjected && hasDeletes), "The _deleted metadata column with delete files is not supported for Vortex reads"); - deleteFilter.pushablePosDeletes().ifPresent(readBuilder::positionDeletes); + PositionDeleteIndex deletePositions = + pushableDeletePositions(inputFile, start, length, residual, idToConstant, deleteFilter); + if (deletePositions.isNotEmpty()) { + readBuilder.positionDeletes(deletePositions); + } return readBuilder .project(deleteFilter.expectedSchema()) @@ -145,6 +157,73 @@ private CloseableIterable newPushdownBatchIterable( .build(); } + // Collects all rows to drop as file-relative positions: the position deletes for the file, plus + // the positions of rows matching equality deletes (resolved by a row-oriented pre-scan). 
+ private PositionDeleteIndex pushableDeletePositions( + InputFile inputFile, + long start, + long length, + Expression residual, + Map idToConstant, + SparkDeleteFilter deleteFilter) { + PositionDeleteIndex positions = Deletes.toPositionIndex(CloseableIterable.empty()); + + PositionDeleteIndex posDeletes = deleteFilter.deletedRowPositions(); + if (posDeletes != null) { + posDeletes.forEach(positions::delete); + } + + if (deleteFilter.hasEqDeletes()) { + addEqualityDeletePositions( + positions, inputFile, start, length, residual, idToConstant, deleteFilter); + } + + return positions; + } + + // Pre-scans the data file as rows projecting the equality-delete columns and _pos, evaluates the + // equality-delete predicate, and records the file position of every matching row. This lets + // equality deletes ride the same native position pushdown as position deletes. + private void addEqualityDeletePositions( + PositionDeleteIndex positions, + InputFile inputFile, + long start, + long length, + Expression residual, + Map idToConstant, + SparkDeleteFilter deleteFilter) { + Schema requiredSchema = deleteFilter.requiredSchema(); + Schema scanSchema = requiredSchema; + if (requiredSchema.findField(MetadataColumns.ROW_POSITION.fieldId()) == null) { + // The equality-delete predicate binds against requiredSchema; appending _pos at the end keeps + // those columns aligned while exposing the position for each matching row. 
+ List columns = Lists.newArrayList(requiredSchema.columns()); + columns.add(MetadataColumns.ROW_POSITION); + scanSchema = new Schema(columns); + } + int rowPositionIndex = scanSchema.columns().indexOf(MetadataColumns.ROW_POSITION); + + ReadBuilder rowReadBuilder = + FormatModelRegistry.readBuilder(FileFormat.VORTEX, InternalRow.class, inputFile); + try (CloseableIterable rows = + rowReadBuilder + .project(scanSchema) + .idToConstant(idToConstant) + .split(start, length) + .filter(residual) + .caseSensitive(caseSensitive()) + .withNameMapping(nameMapping()) + .build(); + CloseableIterable equalityDeleted = + deleteFilter.findEqualityDeleteRows(rows)) { + for (InternalRow row : equalityDeleted) { + positions.delete(row.getLong(rowPositionIndex)); + } + } catch (IOException e) { + throw new UncheckedIOException("Failed to resolve equality-delete positions for Vortex", e); + } + } + @VisibleForTesting static class BatchDeleteFilter { private final DeleteFilter deletes; diff --git a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkVortexReaderDeletes.java b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkVortexReaderDeletes.java index f77a9189cc25..52a21150d79b 100644 --- a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkVortexReaderDeletes.java +++ b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkVortexReaderDeletes.java @@ -25,13 +25,14 @@ import org.junit.jupiter.api.TestTemplate; /** - * Exercises the Spark columnar read path for Vortex tables with position deletes. + * Exercises the Spark columnar read path for Vortex tables with deletes. * - *

Vortex applies position deletes and residual filters natively in the scan (see {@code - * BaseBatchReader.newPushdownBatchIterable}), so the position-delete cases inherited from {@link - * TestSparkReaderDeletes} run unchanged. The equality-delete and {@code _deleted}-with-delete-files - * cases are skipped: the native pushdown path intentionally does not perform the post-scan - * processing those require. + *

Vortex applies all deletes (and residual filters) natively in the scan (see {@code + * BaseBatchReader.newPushdownBatchIterable}): position deletes are pushed directly, and equality + * deletes are resolved to positions by a pre-scan and pushed as well. So the position- and + * equality-delete cases inherited from {@link TestSparkReaderDeletes} run unchanged. Only the + * {@code _deleted}-metadata-column-with-delete-files cases are skipped: that requires retaining and + * marking deleted rows, which the drop-only pushdown path does not do. */ public class TestSparkVortexReaderDeletes extends TestSparkReaderDeletes { @@ -43,123 +44,48 @@ public static Object[][] parameters() { }; } - // Position deletes are dropped inside the Vortex scan, so they never reach Spark and are not - // reflected in the NumDeletes metric. Disable delete-count assertions for this path. + // Deletes are dropped inside the Vortex scan, so they never reach Spark and are not reflected in + // the NumDeletes metric. Disable delete-count assertions for this path. 
@Override protected boolean countDeletes() { return false; } - private static void skipUnsupported() { + private static void skipDeletedColumn() { Assumptions.abort( - "Vortex columnar reads apply position deletes and filters via native scan pushdown; " - + "equality deletes and the _deleted metadata column with delete files are not " - + "supported on this path"); - } - - // --- equality-delete cases inherited from DeleteReadTests --- - - @TestTemplate - @Override - public void testEqualityDeletes() { - skipUnsupported(); - } - - @TestTemplate - @Override - public void testEqualityDateDeletes() { - skipUnsupported(); - } - - @TestTemplate - @Override - public void testEqualityDeletesWithRequiredEqColumn() { - skipUnsupported(); - } - - @TestTemplate - @Override - public void testEqualityDeletesSpanningMultipleDataFiles() { - skipUnsupported(); - } - - @TestTemplate - @Override - public void testMixedPositionAndEqualityDeletes() { - skipUnsupported(); - } - - @TestTemplate - @Override - public void testMultipleEqualityDeleteSchemas() { - skipUnsupported(); - } - - @TestTemplate - @Override - public void testEqualityDeleteByNull() { - skipUnsupported(); - } - - @TestTemplate - @Override - public void testEqualityDeleteBinaryColumn() { - skipUnsupported(); - } - - @TestTemplate - @Override - public void testEqualityDeleteStructColumn() { - skipUnsupported(); - } - - // --- equality-delete and _deleted cases from TestSparkReaderDeletes --- - - @TestTemplate - @Override - public void testEqualityDeleteWithFilter() { - skipUnsupported(); + "Vortex applies deletes via native scan pushdown (drop-only); the _deleted metadata column " + + "with delete files requires retaining and marking rows, which is not supported"); } @TestTemplate @Override public void testReadEqualityDeleteRows() { - skipUnsupported(); + // Uses EqualityDeleteRowReader with byte-range task planning; Vortex interprets split ranges as + // row positions, which is a separate limitation from 
equality-delete read support. + Assumptions.abort("EqualityDeleteRowReader uses byte-range splits, unsupported by Vortex"); } @TestTemplate @Override public void testPosDeletesWithDeletedColumn() { - skipUnsupported(); + skipDeletedColumn(); } @TestTemplate @Override public void testEqualityDeleteWithDeletedColumn() { - skipUnsupported(); + skipDeletedColumn(); } @TestTemplate @Override public void testMixedPosAndEqDeletesWithDeletedColumn() { - skipUnsupported(); + skipDeletedColumn(); } @TestTemplate @Override public void testFilterOnDeletedMetadataColumn() { - skipUnsupported(); - } - - @TestTemplate - @Override - public void testEqualityDeleteWithSchemaEvolution() { - skipUnsupported(); - } - - @TestTemplate - @Override - public void testEqualityDeletesAppliedWithCachedFieldReordering() { - skipUnsupported(); + skipDeletedColumn(); } } From e351fc99af0e0ac96a168f3749704bed6c10e460 Mon Sep 17 00:00:00 2001 From: Robert Kruszewski Date: Mon, 15 Jun 2026 21:40:42 +0100 Subject: [PATCH 5/9] more --- .../iceberg/spark/source/BaseBatchReader.java | 110 +++++++++++++++--- .../source/TestSparkVortexReaderDeletes.java | 46 ++------ .../iceberg/spark/source/BaseBatchReader.java | 110 +++++++++++++++--- .../source/TestSparkVortexReaderDeletes.java | 46 ++------ .../iceberg/spark/source/BaseBatchReader.java | 110 +++++++++++++++--- .../source/TestSparkVortexReaderDeletes.java | 46 ++------ 6 files changed, 303 insertions(+), 165 deletions(-) diff --git a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java index 020c6ab37dda..57226e94448d 100644 --- a/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java +++ b/spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java @@ -39,13 +39,13 @@ import org.apache.iceberg.io.FileIO; import org.apache.iceberg.io.InputFile; import 
org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.spark.OrcBatchReadConf; import org.apache.iceberg.spark.ParquetBatchReadConf; import org.apache.iceberg.spark.VortexBatchReadConf; import org.apache.iceberg.spark.data.vectorized.ColumnVectorWithFilter; import org.apache.iceberg.spark.data.vectorized.ColumnarBatchUtil; +import org.apache.iceberg.spark.data.vectorized.DeletedColumnVector; import org.apache.iceberg.spark.data.vectorized.UpdatableDeletedColumnVector; import org.apache.iceberg.types.Types; import org.apache.iceberg.util.Pair; @@ -102,9 +102,20 @@ protected CloseableIterable newBatchIterable( } if (readBuilder.supportsPositionDeletes()) { - // Vortex applies position deletes and residual filters natively inside the scan, so the - // post-scan BatchDeleteFilter (which derives row positions from a contiguous _pos column and - // is unsound once rows are filtered out during the scan) is bypassed entirely. + // Vortex applies deletes (and residual filters) natively in the scan, so the post-scan + // BatchDeleteFilter (which derives positions from a contiguous _pos column and is unsound + // once + // rows are filtered out during the scan) is bypassed. Deleted rows are normally dropped via a + // pushed position bitmap; but when the _deleted metadata column is projected they must + // instead + // be retained and flagged, so nothing is pushed and rows are marked from their _pos. 
+ boolean markDeletes = + deleteFilter.requiredSchema().findField(MetadataColumns.IS_DELETED.fieldId()) != null + && (deleteFilter.hasPosDeletes() || deleteFilter.hasEqDeletes()); + if (markDeletes) { + return newDeleteMarkingBatchIterable( + inputFile, readBuilder, start, length, residual, idToConstant, deleteFilter); + } return newPushdownBatchIterable( inputFile, readBuilder, start, length, residual, idToConstant, deleteFilter); } @@ -127,12 +138,9 @@ protected CloseableIterable newBatchIterable( return CloseableIterable.transform(iterable, new BatchDeleteFilter(deleteFilter)::filterBatch); } - // Reads from a format that applies position deletes (and residual filters) natively in the scan. - // Every delete is turned into file-relative positions and pushed down so deleted rows are never - // materialized: position deletes directly, and equality deletes by a pre-scan that resolves the - // matching rows to positions. Only the expected output columns are projected. The _deleted - // metadata column combined with delete files requires retaining and marking rows, which this - // drop-only path does not do, so that combination is rejected. + // Drops deleted rows by turning every delete into file-relative positions and pushing them into + // the scan so they are never materialized: position deletes directly, and equality deletes via a + // pre-scan that resolves the matching rows to positions. Only the expected columns are projected. 
private CloseableIterable newPushdownBatchIterable( InputFile inputFile, ReadBuilder readBuilder, @@ -141,13 +149,6 @@ private CloseableIterable newPushdownBatchIterable( Expression residual, Map idToConstant, SparkDeleteFilter deleteFilter) { - boolean isDeletedProjected = - deleteFilter.requiredSchema().findField(MetadataColumns.IS_DELETED.fieldId()) != null; - boolean hasDeletes = deleteFilter.hasPosDeletes() || deleteFilter.hasEqDeletes(); - Preconditions.checkArgument( - !(isDeletedProjected && hasDeletes), - "The _deleted metadata column with delete files is not supported for Vortex reads"); - PositionDeleteIndex deletePositions = pushableDeletePositions(inputFile, start, length, residual, idToConstant, deleteFilter); if (deletePositions.isNotEmpty()) { @@ -165,6 +166,81 @@ private CloseableIterable newPushdownBatchIterable( .build(); } + // Retains all rows and flags deleted ones in the _deleted column instead of dropping them (for + // CDC-style reads that project _deleted). Nothing is pushed into the scan; rows are marked using + // their actual _pos, which stays correct even when a residual filter makes positions + // non-contiguous within a batch. + private CloseableIterable newDeleteMarkingBatchIterable( + InputFile inputFile, + ReadBuilder readBuilder, + long start, + long length, + Expression residual, + Map idToConstant, + SparkDeleteFilter deleteFilter) { + PositionDeleteIndex deletePositions = + pushableDeletePositions(inputFile, start, length, residual, idToConstant, deleteFilter); + + Schema expectedSchema = deleteFilter.expectedSchema(); + Schema scanSchema = expectedSchema; + if (expectedSchema.findField(MetadataColumns.ROW_POSITION.fieldId()) == null) { + // _pos is needed to look up the delete state of each row; append it when not already + // projected + // and trim it back off the emitted batch. 
+ List columns = Lists.newArrayList(expectedSchema.columns()); + columns.add(MetadataColumns.ROW_POSITION); + scanSchema = new Schema(columns); + } + int rowPositionIndex = scanSchema.columns().indexOf(MetadataColumns.ROW_POSITION); + int isDeletedIndex = scanSchema.columns().indexOf(MetadataColumns.IS_DELETED); + int outputColumnCount = expectedSchema.columns().size(); + + CloseableIterable iterable = + readBuilder + .project(scanSchema) + .idToConstant(idToConstant) + .split(start, length) + .filter(residual) + .caseSensitive(caseSensitive()) + .reuseContainers() + .withNameMapping(nameMapping()) + .build(); + + return CloseableIterable.transform( + iterable, + batch -> + markDeletedRows( + batch, deletePositions, rowPositionIndex, isDeletedIndex, outputColumnCount)); + } + + private static ColumnarBatch markDeletedRows( + ColumnarBatch batch, + PositionDeleteIndex deletePositions, + int rowPositionIndex, + int isDeletedIndex, + int outputColumnCount) { + int numRows = batch.numRows(); + ColumnVector rowPositions = batch.column(rowPositionIndex); + boolean[] isDeleted = new boolean[numRows]; + for (int row = 0; row < numRows; row++) { + isDeleted[row] = deletePositions.isDeleted(rowPositions.getLong(row)); + } + + DeletedColumnVector deletedColumn = new DeletedColumnVector(Types.BooleanType.get()); + deletedColumn.setValue(isDeleted); + + // Emit only the expected columns (_pos is dropped when it was appended just for marking) with + // the constant _deleted column replaced by the computed flags. + ColumnVector[] vectors = new ColumnVector[outputColumnCount]; + for (int i = 0; i < outputColumnCount; i++) { + vectors[i] = i == isDeletedIndex ? 
deletedColumn : batch.column(i); + } + + ColumnarBatch output = new ColumnarBatch(vectors); + output.setNumRows(numRows); + return output; + } + // Collects all rows to drop as file-relative positions: the position deletes for the file, plus // the positions of rows matching equality deletes (resolved by a row-oriented pre-scan). private PositionDeleteIndex pushableDeletePositions( diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkVortexReaderDeletes.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkVortexReaderDeletes.java index 52a21150d79b..f45e6a3d6953 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkVortexReaderDeletes.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkVortexReaderDeletes.java @@ -27,12 +27,12 @@ /** * Exercises the Spark columnar read path for Vortex tables with deletes. * - *

Vortex applies all deletes (and residual filters) natively in the scan (see {@code - * BaseBatchReader.newPushdownBatchIterable}): position deletes are pushed directly, and equality - * deletes are resolved to positions by a pre-scan and pushed as well. So the position- and - * equality-delete cases inherited from {@link TestSparkReaderDeletes} run unchanged. Only the - * {@code _deleted}-metadata-column-with-delete-files cases are skipped: that requires retaining and - * marking deleted rows, which the drop-only pushdown path does not do. + *

Vortex applies deletes (and residual filters) natively in the scan (see {@code + * BaseBatchReader}): when deleted rows are dropped, position deletes are pushed directly and + * equality deletes are resolved to positions by a pre-scan; when the {@code _deleted} column is + * projected, rows are retained and flagged from their {@code _pos} instead. So the position-, + * equality-, and {@code _deleted}-delete cases inherited from {@link TestSparkReaderDeletes} all + * run, except {@code testReadEqualityDeleteRows}, which relies on byte-range splits. */ public class TestSparkVortexReaderDeletes extends TestSparkReaderDeletes { @@ -44,19 +44,13 @@ public static Object[][] parameters() { }; } - // Deletes are dropped inside the Vortex scan, so they never reach Spark and are not reflected in - // the NumDeletes metric. Disable delete-count assertions for this path. + // Deletes are applied inside the Vortex scan (or marked from _pos), so they never reach Spark and + // are not reflected in the NumDeletes metric. Disable delete-count assertions for this path. @Override protected boolean countDeletes() { return false; } - private static void skipDeletedColumn() { - Assumptions.abort( - "Vortex applies deletes via native scan pushdown (drop-only); the _deleted metadata column " - + "with delete files requires retaining and marking rows, which is not supported"); - } - @TestTemplate @Override public void testReadEqualityDeleteRows() { @@ -64,28 +58,4 @@ public void testReadEqualityDeleteRows() { // row positions, which is a separate limitation from equality-delete read support.
Assumptions.abort("EqualityDeleteRowReader uses byte-range splits, unsupported by Vortex"); } - - @TestTemplate - @Override - public void testPosDeletesWithDeletedColumn() { - skipDeletedColumn(); - } - - @TestTemplate - @Override - public void testEqualityDeleteWithDeletedColumn() { - skipDeletedColumn(); - } - - @TestTemplate - @Override - public void testMixedPosAndEqDeletesWithDeletedColumn() { - skipDeletedColumn(); - } - - @TestTemplate - @Override - public void testFilterOnDeletedMetadataColumn() { - skipDeletedColumn(); - } } diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java index 020c6ab37dda..57226e94448d 100644 --- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java @@ -39,13 +39,13 @@ import org.apache.iceberg.io.FileIO; import org.apache.iceberg.io.InputFile; import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.spark.OrcBatchReadConf; import org.apache.iceberg.spark.ParquetBatchReadConf; import org.apache.iceberg.spark.VortexBatchReadConf; import org.apache.iceberg.spark.data.vectorized.ColumnVectorWithFilter; import org.apache.iceberg.spark.data.vectorized.ColumnarBatchUtil; +import org.apache.iceberg.spark.data.vectorized.DeletedColumnVector; import org.apache.iceberg.spark.data.vectorized.UpdatableDeletedColumnVector; import org.apache.iceberg.types.Types; import org.apache.iceberg.util.Pair; @@ -102,9 +102,20 @@ protected CloseableIterable newBatchIterable( } if (readBuilder.supportsPositionDeletes()) { - // Vortex applies position deletes and residual filters natively inside the scan, so the - // 
post-scan BatchDeleteFilter (which derives row positions from a contiguous _pos column and - // is unsound once rows are filtered out during the scan) is bypassed entirely. + // Vortex applies deletes (and residual filters) natively in the scan, so the post-scan + // BatchDeleteFilter (which derives positions from a contiguous _pos column and is unsound + // once + // rows are filtered out during the scan) is bypassed. Deleted rows are normally dropped via a + // pushed position bitmap; but when the _deleted metadata column is projected they must + // instead + // be retained and flagged, so nothing is pushed and rows are marked from their _pos. + boolean markDeletes = + deleteFilter.requiredSchema().findField(MetadataColumns.IS_DELETED.fieldId()) != null + && (deleteFilter.hasPosDeletes() || deleteFilter.hasEqDeletes()); + if (markDeletes) { + return newDeleteMarkingBatchIterable( + inputFile, readBuilder, start, length, residual, idToConstant, deleteFilter); + } return newPushdownBatchIterable( inputFile, readBuilder, start, length, residual, idToConstant, deleteFilter); } @@ -127,12 +138,9 @@ protected CloseableIterable newBatchIterable( return CloseableIterable.transform(iterable, new BatchDeleteFilter(deleteFilter)::filterBatch); } - // Reads from a format that applies position deletes (and residual filters) natively in the scan. - // Every delete is turned into file-relative positions and pushed down so deleted rows are never - // materialized: position deletes directly, and equality deletes by a pre-scan that resolves the - // matching rows to positions. Only the expected output columns are projected. The _deleted - // metadata column combined with delete files requires retaining and marking rows, which this - // drop-only path does not do, so that combination is rejected. 
+ // Drops deleted rows by turning every delete into file-relative positions and pushing them into + // the scan so they are never materialized: position deletes directly, and equality deletes via a + // pre-scan that resolves the matching rows to positions. Only the expected columns are projected. private CloseableIterable newPushdownBatchIterable( InputFile inputFile, ReadBuilder readBuilder, @@ -141,13 +149,6 @@ private CloseableIterable newPushdownBatchIterable( Expression residual, Map idToConstant, SparkDeleteFilter deleteFilter) { - boolean isDeletedProjected = - deleteFilter.requiredSchema().findField(MetadataColumns.IS_DELETED.fieldId()) != null; - boolean hasDeletes = deleteFilter.hasPosDeletes() || deleteFilter.hasEqDeletes(); - Preconditions.checkArgument( - !(isDeletedProjected && hasDeletes), - "The _deleted metadata column with delete files is not supported for Vortex reads"); - PositionDeleteIndex deletePositions = pushableDeletePositions(inputFile, start, length, residual, idToConstant, deleteFilter); if (deletePositions.isNotEmpty()) { @@ -165,6 +166,81 @@ private CloseableIterable newPushdownBatchIterable( .build(); } + // Retains all rows and flags deleted ones in the _deleted column instead of dropping them (for + // CDC-style reads that project _deleted). Nothing is pushed into the scan; rows are marked using + // their actual _pos, which stays correct even when a residual filter makes positions + // non-contiguous within a batch. 
+ private CloseableIterable newDeleteMarkingBatchIterable( + InputFile inputFile, + ReadBuilder readBuilder, + long start, + long length, + Expression residual, + Map idToConstant, + SparkDeleteFilter deleteFilter) { + PositionDeleteIndex deletePositions = + pushableDeletePositions(inputFile, start, length, residual, idToConstant, deleteFilter); + + Schema expectedSchema = deleteFilter.expectedSchema(); + Schema scanSchema = expectedSchema; + if (expectedSchema.findField(MetadataColumns.ROW_POSITION.fieldId()) == null) { + // _pos is needed to look up the delete state of each row; append it when not already + // projected + // and trim it back off the emitted batch. + List columns = Lists.newArrayList(expectedSchema.columns()); + columns.add(MetadataColumns.ROW_POSITION); + scanSchema = new Schema(columns); + } + int rowPositionIndex = scanSchema.columns().indexOf(MetadataColumns.ROW_POSITION); + int isDeletedIndex = scanSchema.columns().indexOf(MetadataColumns.IS_DELETED); + int outputColumnCount = expectedSchema.columns().size(); + + CloseableIterable iterable = + readBuilder + .project(scanSchema) + .idToConstant(idToConstant) + .split(start, length) + .filter(residual) + .caseSensitive(caseSensitive()) + .reuseContainers() + .withNameMapping(nameMapping()) + .build(); + + return CloseableIterable.transform( + iterable, + batch -> + markDeletedRows( + batch, deletePositions, rowPositionIndex, isDeletedIndex, outputColumnCount)); + } + + private static ColumnarBatch markDeletedRows( + ColumnarBatch batch, + PositionDeleteIndex deletePositions, + int rowPositionIndex, + int isDeletedIndex, + int outputColumnCount) { + int numRows = batch.numRows(); + ColumnVector rowPositions = batch.column(rowPositionIndex); + boolean[] isDeleted = new boolean[numRows]; + for (int row = 0; row < numRows; row++) { + isDeleted[row] = deletePositions.isDeleted(rowPositions.getLong(row)); + } + + DeletedColumnVector deletedColumn = new DeletedColumnVector(Types.BooleanType.get()); + 
deletedColumn.setValue(isDeleted); + + // Emit only the expected columns (_pos is dropped when it was appended just for marking) with + // the constant _deleted column replaced by the computed flags. + ColumnVector[] vectors = new ColumnVector[outputColumnCount]; + for (int i = 0; i < outputColumnCount; i++) { + vectors[i] = i == isDeletedIndex ? deletedColumn : batch.column(i); + } + + ColumnarBatch output = new ColumnarBatch(vectors); + output.setNumRows(numRows); + return output; + } + // Collects all rows to drop as file-relative positions: the position deletes for the file, plus // the positions of rows matching equality deletes (resolved by a row-oriented pre-scan). private PositionDeleteIndex pushableDeletePositions( diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkVortexReaderDeletes.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkVortexReaderDeletes.java index 52a21150d79b..f45e6a3d6953 100644 --- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkVortexReaderDeletes.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkVortexReaderDeletes.java @@ -27,12 +27,12 @@ /** * Exercises the Spark columnar read path for Vortex tables with deletes. * - *

Vortex applies all deletes (and residual filters) natively in the scan (see {@code - * BaseBatchReader.newPushdownBatchIterable}): position deletes are pushed directly, and equality - * deletes are resolved to positions by a pre-scan and pushed as well. So the position- and - * equality-delete cases inherited from {@link TestSparkReaderDeletes} run unchanged. Only the - * {@code _deleted}-metadata-column-with-delete-files cases are skipped: that requires retaining and - * marking deleted rows, which the drop-only pushdown path does not do. + *

Vortex applies deletes (and residual filters) natively in the scan (see {@code + * BaseBatchReader}): when deleted rows are dropped, position deletes are pushed directly and + * equality deletes are resolved to positions by a pre-scan; when the {@code _deleted} column is + * projected, rows are retained and flagged from their {@code _pos} instead. So the position-, + * equality-, and {@code _deleted}-delete cases inherited from {@link TestSparkReaderDeletes} all + * run, except {@code testReadEqualityDeleteRows}, which relies on byte-range splits. */ public class TestSparkVortexReaderDeletes extends TestSparkReaderDeletes { @@ -44,19 +44,13 @@ public static Object[][] parameters() { }; } - // Deletes are dropped inside the Vortex scan, so they never reach Spark and are not reflected in - // the NumDeletes metric. Disable delete-count assertions for this path. + // Deletes are applied inside the Vortex scan (or marked from _pos), so they never reach Spark and + // are not reflected in the NumDeletes metric. Disable delete-count assertions for this path. @Override protected boolean countDeletes() { return false; } - private static void skipDeletedColumn() { - Assumptions.abort( - "Vortex applies deletes via native scan pushdown (drop-only); the _deleted metadata column " - + "with delete files requires retaining and marking rows, which is not supported"); - } - @TestTemplate @Override public void testReadEqualityDeleteRows() { @@ -64,28 +58,4 @@ public void testReadEqualityDeleteRows() { // row positions, which is a separate limitation from equality-delete read support.
Assumptions.abort("EqualityDeleteRowReader uses byte-range splits, unsupported by Vortex"); } - - @TestTemplate - @Override - public void testPosDeletesWithDeletedColumn() { - skipDeletedColumn(); - } - - @TestTemplate - @Override - public void testEqualityDeleteWithDeletedColumn() { - skipDeletedColumn(); - } - - @TestTemplate - @Override - public void testMixedPosAndEqDeletesWithDeletedColumn() { - skipDeletedColumn(); - } - - @TestTemplate - @Override - public void testFilterOnDeletedMetadataColumn() { - skipDeletedColumn(); - } } diff --git a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java index b86f8f984163..045c256f0ffb 100644 --- a/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java +++ b/spark/v4.1/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java @@ -39,13 +39,13 @@ import org.apache.iceberg.io.FileIO; import org.apache.iceberg.io.InputFile; import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.spark.OrcBatchReadConf; import org.apache.iceberg.spark.ParquetBatchReadConf; import org.apache.iceberg.spark.VortexBatchReadConf; import org.apache.iceberg.spark.data.vectorized.ColumnVectorWithFilter; import org.apache.iceberg.spark.data.vectorized.ColumnarBatchUtil; +import org.apache.iceberg.spark.data.vectorized.DeletedColumnVector; import org.apache.iceberg.spark.data.vectorized.UpdatableDeletedColumnVector; import org.apache.iceberg.types.Types; import org.apache.iceberg.util.Pair; @@ -94,9 +94,20 @@ protected CloseableIterable newBatchIterable( } if (readBuilder.supportsPositionDeletes()) { - // Vortex applies position deletes and residual filters natively inside the scan, so the - // 
post-scan BatchDeleteFilter (which derives row positions from a contiguous _pos column and - // is unsound once rows are filtered out during the scan) is bypassed entirely. + // Vortex applies deletes (and residual filters) natively in the scan, so the post-scan + // BatchDeleteFilter (which derives positions from a contiguous _pos column and is unsound + // once + // rows are filtered out during the scan) is bypassed. Deleted rows are normally dropped via a + // pushed position bitmap; but when the _deleted metadata column is projected they must + // instead + // be retained and flagged, so nothing is pushed and rows are marked from their _pos. + boolean markDeletes = + deleteFilter.requiredSchema().findField(MetadataColumns.IS_DELETED.fieldId()) != null + && (deleteFilter.hasPosDeletes() || deleteFilter.hasEqDeletes()); + if (markDeletes) { + return newDeleteMarkingBatchIterable( + inputFile, readBuilder, start, length, residual, idToConstant, deleteFilter); + } return newPushdownBatchIterable( inputFile, readBuilder, start, length, residual, idToConstant, deleteFilter); } @@ -119,12 +130,9 @@ protected CloseableIterable newBatchIterable( return CloseableIterable.transform(iterable, new BatchDeleteFilter(deleteFilter)::filterBatch); } - // Reads from a format that applies position deletes (and residual filters) natively in the scan. - // Every delete is turned into file-relative positions and pushed down so deleted rows are never - // materialized: position deletes directly, and equality deletes by a pre-scan that resolves the - // matching rows to positions. Only the expected output columns are projected. The _deleted - // metadata column combined with delete files requires retaining and marking rows, which this - // drop-only path does not do, so that combination is rejected. 
+ // Drops deleted rows by turning every delete into file-relative positions and pushing them into + // the scan so they are never materialized: position deletes directly, and equality deletes via a + // pre-scan that resolves the matching rows to positions. Only the expected columns are projected. private CloseableIterable newPushdownBatchIterable( InputFile inputFile, ReadBuilder readBuilder, @@ -133,13 +141,6 @@ private CloseableIterable newPushdownBatchIterable( Expression residual, Map idToConstant, SparkDeleteFilter deleteFilter) { - boolean isDeletedProjected = - deleteFilter.requiredSchema().findField(MetadataColumns.IS_DELETED.fieldId()) != null; - boolean hasDeletes = deleteFilter.hasPosDeletes() || deleteFilter.hasEqDeletes(); - Preconditions.checkArgument( - !(isDeletedProjected && hasDeletes), - "The _deleted metadata column with delete files is not supported for Vortex reads"); - PositionDeleteIndex deletePositions = pushableDeletePositions(inputFile, start, length, residual, idToConstant, deleteFilter); if (deletePositions.isNotEmpty()) { @@ -157,6 +158,81 @@ private CloseableIterable newPushdownBatchIterable( .build(); } + // Retains all rows and flags deleted ones in the _deleted column instead of dropping them (for + // CDC-style reads that project _deleted). Nothing is pushed into the scan; rows are marked using + // their actual _pos, which stays correct even when a residual filter makes positions + // non-contiguous within a batch. 
+ private CloseableIterable newDeleteMarkingBatchIterable( + InputFile inputFile, + ReadBuilder readBuilder, + long start, + long length, + Expression residual, + Map idToConstant, + SparkDeleteFilter deleteFilter) { + PositionDeleteIndex deletePositions = + pushableDeletePositions(inputFile, start, length, residual, idToConstant, deleteFilter); + + Schema expectedSchema = deleteFilter.expectedSchema(); + Schema scanSchema = expectedSchema; + if (expectedSchema.findField(MetadataColumns.ROW_POSITION.fieldId()) == null) { + // _pos is needed to look up the delete state of each row; append it when not already + // projected + // and trim it back off the emitted batch. + List columns = Lists.newArrayList(expectedSchema.columns()); + columns.add(MetadataColumns.ROW_POSITION); + scanSchema = new Schema(columns); + } + int rowPositionIndex = scanSchema.columns().indexOf(MetadataColumns.ROW_POSITION); + int isDeletedIndex = scanSchema.columns().indexOf(MetadataColumns.IS_DELETED); + int outputColumnCount = expectedSchema.columns().size(); + + CloseableIterable iterable = + readBuilder + .project(scanSchema) + .idToConstant(idToConstant) + .split(start, length) + .filter(residual) + .caseSensitive(caseSensitive()) + .reuseContainers() + .withNameMapping(nameMapping()) + .build(); + + return CloseableIterable.transform( + iterable, + batch -> + markDeletedRows( + batch, deletePositions, rowPositionIndex, isDeletedIndex, outputColumnCount)); + } + + private static ColumnarBatch markDeletedRows( + ColumnarBatch batch, + PositionDeleteIndex deletePositions, + int rowPositionIndex, + int isDeletedIndex, + int outputColumnCount) { + int numRows = batch.numRows(); + ColumnVector rowPositions = batch.column(rowPositionIndex); + boolean[] isDeleted = new boolean[numRows]; + for (int row = 0; row < numRows; row++) { + isDeleted[row] = deletePositions.isDeleted(rowPositions.getLong(row)); + } + + DeletedColumnVector deletedColumn = new DeletedColumnVector(Types.BooleanType.get()); + 
deletedColumn.setValue(isDeleted); + + // Emit only the expected columns (_pos is dropped when it was appended just for marking) with + // the constant _deleted column replaced by the computed flags. + ColumnVector[] vectors = new ColumnVector[outputColumnCount]; + for (int i = 0; i < outputColumnCount; i++) { + vectors[i] = i == isDeletedIndex ? deletedColumn : batch.column(i); + } + + ColumnarBatch output = new ColumnarBatch(vectors); + output.setNumRows(numRows); + return output; + } + // Collects all rows to drop as file-relative positions: the position deletes for the file, plus // the positions of rows matching equality deletes (resolved by a row-oriented pre-scan). private PositionDeleteIndex pushableDeletePositions( diff --git a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkVortexReaderDeletes.java b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkVortexReaderDeletes.java index 52a21150d79b..f45e6a3d6953 100644 --- a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkVortexReaderDeletes.java +++ b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkVortexReaderDeletes.java @@ -27,12 +27,12 @@ /** * Exercises the Spark columnar read path for Vortex tables with deletes. * - *

<p>Vortex applies all deletes (and residual filters) natively in the scan (see {@code - * BaseBatchReader.newPushdownBatchIterable}): position deletes are pushed directly, and equality - * deletes are resolved to positions by a pre-scan and pushed as well. So the position- and - * equality-delete cases inherited from {@link TestSparkReaderDeletes} run unchanged. Only the - * {@code _deleted}-metadata-column-with-delete-files cases are skipped: that requires retaining and - * marking deleted rows, which the drop-only pushdown path does not do. + *

<p>Vortex applies deletes (and residual filters) natively in the scan (see {@code + * BaseBatchReader}): when deleted rows are dropped, position deletes are pushed directly and + * equality deletes are resolved to positions by a pre-scan; when the {@code _deleted} column is + * projected, rows are retained and flagged from their {@code _pos} instead. So the position-, + * equality-, and {@code _deleted}-delete cases inherited from {@link TestSparkReaderDeletes} all + * run. */ public class TestSparkVortexReaderDeletes extends TestSparkReaderDeletes { @@ -44,19 +44,13 @@ public static Object[][] parameters() { }; } - // Deletes are dropped inside the Vortex scan, so they never reach Spark and are not reflected in - // the NumDeletes metric. Disable delete-count assertions for this path. + // Deletes are applied inside the Vortex scan (or marked from _pos), so they never reach Spark and + // are not reflected in the NumDeletes metric. Disable delete-count assertions for this path. @Override protected boolean countDeletes() { return false; } - private static void skipDeletedColumn() { - Assumptions.abort( - "Vortex applies deletes via native scan pushdown (drop-only); the _deleted metadata column " - + "with delete files requires retaining and marking rows, which is not supported"); - } - @TestTemplate @Override public void testReadEqualityDeleteRows() { @@ -64,28 +58,4 @@ public void testReadEqualityDeleteRows() { // row positions, which is a separate limitation from equality-delete read support. 
Assumptions.abort("EqualityDeleteRowReader uses byte-range splits, unsupported by Vortex"); } - - @TestTemplate - @Override - public void testPosDeletesWithDeletedColumn() { - skipDeletedColumn(); - } - - @TestTemplate - @Override - public void testEqualityDeleteWithDeletedColumn() { - skipDeletedColumn(); - } - - @TestTemplate - @Override - public void testMixedPosAndEqDeletesWithDeletedColumn() { - skipDeletedColumn(); - } - - @TestTemplate - @Override - public void testFilterOnDeletedMetadataColumn() { - skipDeletedColumn(); - } } From 697beebdbb2b79dcd207ceab2a673cc3a9d4deec Mon Sep 17 00:00:00 2001 From: Robert Kruszewski Date: Tue, 16 Jun 2026 14:11:36 +0100 Subject: [PATCH 6/9] fixes Signed-off-by: Robert Kruszewski --- .../iceberg/vortex/VortexFormatModel.java | 5 ++--- .../apache/iceberg/vortex/VortexIterable.java | 21 ++++++++++++------- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/vortex/src/main/java/org/apache/iceberg/vortex/VortexFormatModel.java b/vortex/src/main/java/org/apache/iceberg/vortex/VortexFormatModel.java index 23288531f6d5..4e9eb542b9a7 100644 --- a/vortex/src/main/java/org/apache/iceberg/vortex/VortexFormatModel.java +++ b/vortex/src/main/java/org/apache/iceberg/vortex/VortexFormatModel.java @@ -432,9 +432,8 @@ public CloseableIterable build() { schema.columns().stream() .filter( field -> - (field.fieldId() == MetadataColumns.ROW_POSITION.fieldId()) - || !constants.containsKey(field.fieldId()) - && !MetadataColumns.isMetadataColumn(field.name())) + !constants.containsKey(field.fieldId()) + && !MetadataColumns.isMetadataColumn(field.name())) .map(Types.NestedField::name) .toList(); diff --git a/vortex/src/main/java/org/apache/iceberg/vortex/VortexIterable.java b/vortex/src/main/java/org/apache/iceberg/vortex/VortexIterable.java index 4b881c9b2673..d685fc1a4845 100644 --- a/vortex/src/main/java/org/apache/iceberg/vortex/VortexIterable.java +++ b/vortex/src/main/java/org/apache/iceberg/vortex/VortexIterable.java @@ -27,11 
+27,9 @@ import dev.vortex.jni.NativeRuntime; import java.io.IOException; import java.util.ArrayList; -import java.util.Arrays; import java.util.List; import java.util.Map; import java.util.NoSuchElementException; -import java.util.Objects; import java.util.Optional; import java.util.Set; import java.util.function.Function; @@ -60,6 +58,7 @@ public class VortexIterable extends CloseableGroup implements CloseableIterab private final Optional filterPredicate; private final long[] rowRange; private final byte[] posDeleteBitmap; + private final boolean includeRowPosition; private final Function> rowReaderFunc; private final Function> @@ -74,6 +73,7 @@ public class VortexIterable extends CloseableGroup implements CloseableIterab Optional filterPredicate, long[] rowRange, byte[] posDeleteBitmap, + boolean includeRowPosition, Function> readerFunction, Function> batchReaderFunction, boolean caseSensitive, @@ -83,6 +83,7 @@ public class VortexIterable extends CloseableGroup implements CloseableIterab this.filterPredicate = filterPredicate; this.rowRange = rowRange; this.posDeleteBitmap = posDeleteBitmap; + this.includeRowPosition = includeRowPosition; this.rowReaderFunc = readerFunction; this.batchReaderFunction = batchReaderFunction; this.caseSensitive = caseSensitive; @@ -130,19 +131,25 @@ public CloseableIterator iterator() { ImmutableList.Builder fieldNames = ImmutableList.builder(); ImmutableList.Builder expressions = ImmutableList.builder(); - org.apache.arrow.vector.types.pojo.Schema readerArrowSchema = fileArrowSchema; for (String name : projection) { if (fileColumns.contains(name)) { fieldNames.add(name); expressions.add(dev.vortex.api.Expression.column(name)); - } else if (Objects.equals(name, MetadataColumns.ROW_POSITION.name())) { - fieldNames.add(name); - expressions.add(dev.vortex.api.Expression.rowIdx()); - readerArrowSchema = appendRowPosition(fileArrowSchema); } } + // Row position is not a stored column. 
When requested, materialize it from Vortex's `row_idx` + // scan expression packed under the _pos metadata-column name, and append a matching _pos field + // to the schema handed to the reader. Both bind by name, so _pos resolves regardless of where + // it lands in the projected column order. + org.apache.arrow.vector.types.pojo.Schema readerArrowSchema = fileArrowSchema; + if (includeRowPosition) { + fieldNames.add(MetadataColumns.ROW_POSITION.name()); + expressions.add(dev.vortex.api.Expression.rowIdx()); + readerArrowSchema = appendRowPosition(fileArrowSchema); + } + dev.vortex.api.Expression scanProjection = dev.vortex.api.Expression.pack( fieldNames.build().toArray(String[]::new), From 9decaef62cd449db1b3893801388476f74db3f8e Mon Sep 17 00:00:00 2001 From: Robert Kruszewski Date: Tue, 16 Jun 2026 14:22:49 +0100 Subject: [PATCH 7/9] checkstyle --- .../java/org/apache/iceberg/vortex/VortexArrowBridge.java | 4 ++-- .../main/java/org/apache/iceberg/vortex/VortexIterable.java | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/vortex/src/main/java/org/apache/iceberg/vortex/VortexArrowBridge.java b/vortex/src/main/java/org/apache/iceberg/vortex/VortexArrowBridge.java index 98512d4f7757..185124efefcc 100644 --- a/vortex/src/main/java/org/apache/iceberg/vortex/VortexArrowBridge.java +++ b/vortex/src/main/java/org/apache/iceberg/vortex/VortexArrowBridge.java @@ -19,7 +19,6 @@ package org.apache.iceberg.vortex; import dev.vortex.arrow.ArrowAllocation; -import java.util.ArrayList; import java.util.List; import org.apache.arrow.c.ArrowArray; import org.apache.arrow.c.ArrowSchema; @@ -30,6 +29,7 @@ import org.apache.arrow.vector.FieldVector; import org.apache.arrow.vector.UInt8Vector; import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; final class VortexArrowBridge { private static final RootAllocator ARROW_ALLOCATOR = new RootAllocator(Long.MAX_VALUE); @@ -79,7 +79,7 @@ private static 
VectorSchemaRoot normalizeUnsignedLongs( return imported; } - List vectors = new ArrayList<>(imported.getFieldVectors().size()); + List vectors = Lists.newArrayListWithCapacity(imported.getFieldVectors().size()); for (FieldVector vector : imported.getFieldVectors()) { if (vector instanceof UInt8Vector) { vectors.add(copyAsSigned((UInt8Vector) vector, allocator)); diff --git a/vortex/src/main/java/org/apache/iceberg/vortex/VortexIterable.java b/vortex/src/main/java/org/apache/iceberg/vortex/VortexIterable.java index d685fc1a4845..78c0ca59594e 100644 --- a/vortex/src/main/java/org/apache/iceberg/vortex/VortexIterable.java +++ b/vortex/src/main/java/org/apache/iceberg/vortex/VortexIterable.java @@ -48,6 +48,7 @@ import org.apache.iceberg.io.InputFile; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -192,7 +193,7 @@ public CloseableIterator iterator() { */ private static org.apache.arrow.vector.types.pojo.Schema appendRowPosition( org.apache.arrow.vector.types.pojo.Schema base) { - List fields = new ArrayList<>(base.getFields()); + List fields = Lists.newArrayList(base.getFields()); fields.add( new Field( MetadataColumns.ROW_POSITION.name(), From e2c65a836dd7e5b38593108ab7958f3f5faf381a Mon Sep 17 00:00:00 2001 From: Robert Kruszewski Date: Wed, 17 Jun 2026 01:43:55 +0100 Subject: [PATCH 8/9] format --- .../src/main/java/org/apache/iceberg/vortex/VortexIterable.java | 1 - 1 file changed, 1 deletion(-) diff --git a/vortex/src/main/java/org/apache/iceberg/vortex/VortexIterable.java b/vortex/src/main/java/org/apache/iceberg/vortex/VortexIterable.java index 78c0ca59594e..4e85136ade1d 100644 --- a/vortex/src/main/java/org/apache/iceberg/vortex/VortexIterable.java +++ b/vortex/src/main/java/org/apache/iceberg/vortex/VortexIterable.java @@ -26,7 
+26,6 @@ import dev.vortex.api.Session; import dev.vortex.jni.NativeRuntime; import java.io.IOException; -import java.util.ArrayList; import java.util.List; import java.util.Map; import java.util.NoSuchElementException; From 249f14d2534ba5ffe8031cea78f9e766572eb237 Mon Sep 17 00:00:00 2001 From: Robert Kruszewski Date: Tue, 23 Jun 2026 13:23:35 +0100 Subject: [PATCH 9/9] fix --- .../iceberg/spark/source/TestSparkVortexReaderDeletes.java | 5 +++-- .../iceberg/spark/source/TestSparkVortexReaderDeletes.java | 5 +++-- .../iceberg/spark/source/TestSparkVortexReaderDeletes.java | 5 +++-- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkVortexReaderDeletes.java b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkVortexReaderDeletes.java index f45e6a3d6953..451685cdb21b 100644 --- a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkVortexReaderDeletes.java +++ b/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkVortexReaderDeletes.java @@ -18,10 +18,11 @@ */ package org.apache.iceberg.spark.source; +import static org.assertj.core.api.Assumptions.assumeThat; + import org.apache.iceberg.FileFormat; import org.apache.iceberg.Parameters; import org.apache.iceberg.PlanningMode; -import org.junit.jupiter.api.Assumptions; import org.junit.jupiter.api.TestTemplate; /** @@ -56,6 +57,6 @@ protected boolean countDeletes() { public void testReadEqualityDeleteRows() { // Uses EqualityDeleteRowReader with byte-range task planning; Vortex interprets split ranges as // row positions, which is a separate limitation from equality-delete read support. 
- Assumptions.abort("EqualityDeleteRowReader uses byte-range splits, unsupported by Vortex"); + assumeThat(false).as("Vortex cannot read EqualityDeleteRowReader byte-range splits").isTrue(); } } diff --git a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkVortexReaderDeletes.java b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkVortexReaderDeletes.java index f45e6a3d6953..451685cdb21b 100644 --- a/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkVortexReaderDeletes.java +++ b/spark/v4.0/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkVortexReaderDeletes.java @@ -18,10 +18,11 @@ */ package org.apache.iceberg.spark.source; +import static org.assertj.core.api.Assumptions.assumeThat; + import org.apache.iceberg.FileFormat; import org.apache.iceberg.Parameters; import org.apache.iceberg.PlanningMode; -import org.junit.jupiter.api.Assumptions; import org.junit.jupiter.api.TestTemplate; /** @@ -56,6 +57,6 @@ protected boolean countDeletes() { public void testReadEqualityDeleteRows() { // Uses EqualityDeleteRowReader with byte-range task planning; Vortex interprets split ranges as // row positions, which is a separate limitation from equality-delete read support. 
- Assumptions.abort("EqualityDeleteRowReader uses byte-range splits, unsupported by Vortex"); + assumeThat(false).as("Vortex cannot read EqualityDeleteRowReader byte-range splits").isTrue(); } } diff --git a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkVortexReaderDeletes.java b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkVortexReaderDeletes.java index f45e6a3d6953..451685cdb21b 100644 --- a/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkVortexReaderDeletes.java +++ b/spark/v4.1/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkVortexReaderDeletes.java @@ -18,10 +18,11 @@ */ package org.apache.iceberg.spark.source; +import static org.assertj.core.api.Assumptions.assumeThat; + import org.apache.iceberg.FileFormat; import org.apache.iceberg.Parameters; import org.apache.iceberg.PlanningMode; -import org.junit.jupiter.api.Assumptions; import org.junit.jupiter.api.TestTemplate; /** @@ -56,6 +57,6 @@ protected boolean countDeletes() { public void testReadEqualityDeleteRows() { // Uses EqualityDeleteRowReader with byte-range task planning; Vortex interprets split ranges as // row positions, which is a separate limitation from equality-delete read support. - Assumptions.abort("EqualityDeleteRowReader uses byte-range splits, unsupported by Vortex"); + assumeThat(false).as("Vortex cannot read EqualityDeleteRowReader byte-range splits").isTrue(); } }