-
Notifications
You must be signed in to change notification settings - Fork 2k
[Kernel-Spark] Fix startingVersion support on non re-creatable version by skipping start snapshot validation #5574
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
f6a99d7
da328ad
6536f15
f38a949
29fd301
bfa3e40
c5d9d23
36ebe0d
853370b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,6 +17,7 @@ | |
|
|
||
| import static io.delta.kernel.spark.utils.ExpressionUtils.dsv2PredicateToCatalystExpression; | ||
|
|
||
| import io.delta.kernel.Snapshot; | ||
| import io.delta.kernel.data.FilteredColumnarBatch; | ||
| import io.delta.kernel.data.MapValue; | ||
| import io.delta.kernel.data.Row; | ||
|
|
@@ -55,6 +56,7 @@ | |
| public class SparkScan implements Scan, SupportsReportStatistics, SupportsRuntimeV2Filtering { | ||
|
|
||
| private final DeltaSnapshotManager snapshotManager; | ||
| private final Snapshot initialSnapshot; | ||
| private final StructType readDataSchema; | ||
| private final StructType dataSchema; | ||
| private final StructType partitionSchema; | ||
|
|
@@ -74,6 +76,7 @@ public class SparkScan implements Scan, SupportsReportStatistics, SupportsRuntim | |
|
|
||
| public SparkScan( | ||
| DeltaSnapshotManager snapshotManager, | ||
| io.delta.kernel.Snapshot initialSnapshot, | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. do we need to fully quality here?
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. +1, let's avoid this |
||
| StructType dataSchema, | ||
| StructType partitionSchema, | ||
| StructType readDataSchema, | ||
|
|
@@ -83,6 +86,7 @@ public SparkScan( | |
| CaseInsensitiveStringMap options) { | ||
|
|
||
| this.snapshotManager = Objects.requireNonNull(snapshotManager, "snapshotManager is null"); | ||
| this.initialSnapshot = Objects.requireNonNull(initialSnapshot, "initialSnapshot is null"); | ||
| this.dataSchema = Objects.requireNonNull(dataSchema, "dataSchema is null"); | ||
| this.partitionSchema = Objects.requireNonNull(partitionSchema, "partitionSchema is null"); | ||
| this.readDataSchema = Objects.requireNonNull(readDataSchema, "readDataSchema is null"); | ||
|
|
@@ -130,7 +134,7 @@ public Batch toBatch() { | |
| public MicroBatchStream toMicroBatchStream(String checkpointLocation) { | ||
| DeltaOptions deltaOptions = new DeltaOptions(scalaOptions, sqlConf); | ||
| return new SparkMicroBatchStream( | ||
| snapshotManager, hadoopConf, SparkSession.active(), deltaOptions); | ||
| snapshotManager, initialSnapshot, hadoopConf, SparkSession.active(), deltaOptions); | ||
| } | ||
|
|
||
| @Override | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,10 +21,16 @@ | |
| import io.delta.kernel.data.ColumnVector; | ||
| import io.delta.kernel.data.ColumnarBatch; | ||
| import io.delta.kernel.data.Row; | ||
| import io.delta.kernel.engine.Engine; | ||
| import io.delta.kernel.internal.DeltaLogActionUtils; | ||
| import io.delta.kernel.internal.TableChangesUtils; | ||
| import io.delta.kernel.internal.actions.AddFile; | ||
| import io.delta.kernel.internal.actions.RemoveFile; | ||
| import io.delta.kernel.internal.commitrange.CommitRangeImpl; | ||
| import io.delta.kernel.internal.data.StructRow; | ||
| import io.delta.kernel.utils.CloseableIterator; | ||
| import java.util.Optional; | ||
| import java.util.Set; | ||
| import org.apache.spark.annotation.Experimental; | ||
|
|
||
| /** | ||
|
|
@@ -91,6 +97,32 @@ public static Optional<RemoveFile> getDataChangeRemove(ColumnarBatch batch, int | |
| return removeFile.getDataChange() ? Optional.of(removeFile) : Optional.empty(); | ||
| } | ||
|
|
||
| /** | ||
| * Gets actions from a commit range without requiring a snapshot at the exact start version. | ||
| * | ||
| * <p>This method is "unsafe" because it bypasses the standard {@code CommitRange.getActions()} | ||
| * API which requires a snapshot at the exact start version for protocol validation. | ||
| * | ||
| * <p>This is necessary for streaming scenarios where the start version might not have a | ||
| * recreatable snapshot (e.g., after log cleanup) or where {@code startingVersion} is used. | ||
| * | ||
| * @param engine the Delta engine | ||
| * @param commitRange the commit range to read actions from | ||
| * @param tablePath the path to the Delta table | ||
| * @param actionSet the set of actions to read (e.g., ADD, REMOVE) | ||
| * @return an iterator over columnar batches containing the requested actions | ||
| */ | ||
| public static CloseableIterator<ColumnarBatch> getActionsFromRangeUnsafe( | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could you add a comment explain in what way this is unsafe?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done |
||
| Engine engine, | ||
| CommitRangeImpl commitRange, | ||
| String tablePath, | ||
| Set<DeltaLogActionUtils.DeltaAction> actionSet) { | ||
| return TableChangesUtils.flattenCommitsAndAddMetadata( | ||
| engine, | ||
| DeltaLogActionUtils.getActionsFromCommitFilesWithProtocolValidation( | ||
| engine, tablePath, commitRange.getDeltaFiles(), actionSet)); | ||
| } | ||
|
|
||
| /** Private constructor to prevent instantiation of this utility class. */ | ||
| private StreamingHelper() {} | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could you add a comment here documenting the exact use case for why we are calling getActionsFromRangeUnsafe?