
Commit ec7f2c1

[3.3] Fix timestamp data-skipping stats for truncated timezone offsets
Signed-off-by: AnudeepKonaboina <[email protected]>
1 parent a7f222c commit ec7f2c1

4 files changed, +130 -5 lines changed


PROTOCOL.md

Lines changed: 1 addition & 1 deletion
@@ -1874,7 +1874,7 @@ nullCount | The number of `null` values for this column | <p>If the `nullCount`
 minValues | A value that is equal to the smallest valid value[^1] present in the file for this column. If all valid rows are null, this carries no information. | A value that is less than or equal to all valid values[^1] present in this file for this column. If all valid rows are null, this carries no information.
 maxValues | A value that is equal to the largest valid value[^1] present in the file for this column. If all valid rows are null, this carries no information. | A value that is greater than or equal to all valid values[^1] present in this file for this column. If all valid rows are null, this carries no information.

-[^1]: String columns are cut off at a fixed prefix length. Timestamp columns are truncated down to milliseconds.
+[^1]: String columns are cut off at a fixed prefix length. Timestamp columns are truncated down to milliseconds. Implementations **must not** truncate timezone offsets in timestamp statistics to minute precision in a way that changes the represented instant. Modern writers SHOULD encode timestamp statistics as instants in UTC with microsecond precision (for example, using an ISO 8601 representation adjusted to UTC, such as `1970-01-01T00:00:00.123456Z`). Readers MUST treat statistics as approximate bounds and MAY widen the effective min/max range to avoid incorrectly skipping files when older writers produced truncated timestamp statistics.

 ## Partition Value Serialization
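For context on the new footnote, here is a standalone sketch (not part of the commit) of why truncating a timezone offset to minute precision changes the represented instant when the value is re-parsed. The +05:53:28 local-mean-time offset is only an illustrative assumption, matching how pre-1906 Asia/Kolkata timestamps resolve.

```scala
import java.time.{LocalDateTime, ZoneOffset}

// Hypothetical illustration: a historical local time with offset +05:53:28. Formatting it
// with an offset truncated to minutes (+05:53) and re-parsing shifts the instant by 28 s.
object OffsetTruncationExample extends App {
  val local = LocalDateTime.of(1900, 1, 1, 12, 0, 0)
  val fullOffset = ZoneOffset.ofHoursMinutesSeconds(5, 53, 28) // +05:53:28
  val truncated = ZoneOffset.ofHoursMinutes(5, 53)             // +05:53

  val trueInstant = local.toInstant(fullOffset)
  val parsedInstant = local.toInstant(truncated) // what a reader reconstructs from "+0553"

  val skewSeconds = parsedInstant.getEpochSecond - trueInstant.getEpochSecond
  println(s"skew = $skewSeconds seconds") // 28: the represented instant moved
}
```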

spark/src/main/scala/org/apache/spark/sql/delta/stats/DataSkippingReader.scala

Lines changed: 40 additions & 3 deletions
@@ -186,6 +186,21 @@ private[delta] object DataSkippingReader {
     val oneMillisecond = new CalendarInterval(0, 0, 1000 /* micros */)
     new Literal(oneMillisecond, CalendarIntervalType)
   }
+  // SC-22824 widened the max timestamp stats range by +1 ms to account for JSON truncation down to
+  // milliseconds. For tables written by Delta 3.3.2, timestamp stats in JSON may also be off by up
+  // to 59 seconds due to historical timezone offsets being truncated to minute resolution (see
+  // Delta issue 5249). To avoid incorrect data skipping (pruning files that may still contain
+  // matching rows), we treat these stats as approximate bounds and widen the range by up to
+  // 59 seconds on both sides for timestamp stats.
+  private val fiftyNineSecondsMicros: Long = 59L * 1000000L
+  val plusFiftyNineSecondsLiteralExpr: Literal = {
+    val interval = new CalendarInterval(0, 0, fiftyNineSecondsMicros)
+    new Literal(interval, CalendarIntervalType)
+  }
+  val minusFiftyNineSecondsLiteralExpr: Literal = {
+    val interval = new CalendarInterval(0, 0, -fiftyNineSecondsMicros)
+    new Literal(interval, CalendarIntervalType)
+  }

   lazy val sizeCollectorInputEncoders: Seq[Option[ExpressionEncoder[_]]] = Seq(
     Option(ExpressionEncoder[Boolean]()),
@@ -949,16 +964,38 @@ trait DataSkippingReaderBase
       // Filter out non-leaf columns -- they lack stats so skipping predicates can't use them.
       .filterNot(_._2.isInstanceOf[StructType])
       .map {
+        case (statCol, TimestampType, _) if pathToStatType.head == MIN =>
+          // Delta Spark 3.3.2 wrote timestamp stats to JSON using timezone offsets with seconds
+          // truncated down to minute resolution (e.g. +005328 -> +0053). When these values are
+          // parsed back as timestamps, they can be off by up to 59 seconds from the true value,
+          // which may cause incorrect data skipping (files being pruned even though they might
+          // contain matching rows).
+          //
+          // To avoid data loss for such tables, we treat the JSON stats for timestamps as
+          // approximate and widen the range by up to 59 seconds on both sides. For the min
+          // bound, subtract 59 seconds.
+          Column(
+            Cast(TimeAdd(statCol.expr, minusFiftyNineSecondsLiteralExpr), TimestampType))
         case (statCol, TimestampType, _) if pathToStatType.head == MAX =>
           // SC-22824: For timestamps, JSON serialization will truncate to milliseconds. This means
           // that we must adjust 1 millisecond upwards for max stats, or we will incorrectly skip
           // records that differ only in microsecond precision. (For example, a file containing only
           // 01:02:03.456789 will be written with min == max == 01:02:03.456, so we must consider it
           // to contain the range from 01:02:03.456 to 01:02:03.457.)
           //
-          // There is a longer term task SC-22825 to fix the serialization problem that caused this.
-          // But we need the adjustment in any case to correctly read stats written by old versions.
-          Column(Cast(TimeAdd(statCol.expr, oneMillisecondLiteralExpr), TimestampType))
+          // Delta Spark 3.3.2 also truncated historical timezone offsets to minute resolution in
+          // JSON stats, which can make the recorded max timestamp up to 59 seconds earlier than
+          // the true value. To avoid incorrectly skipping files for such tables, we further widen
+          // the upper bound by 59 seconds.
+          //
+          // There is a longer term task SC-22825 to fix the serialization problem that caused the
+          // millisecond truncation, and Delta issue 5249 tracks the timezone offset truncation.
+          // We need this adjustment in any case to correctly read stats written by old versions.
+          val widened =
+            TimeAdd(
+              TimeAdd(statCol.expr, oneMillisecondLiteralExpr),
+              plusFiftyNineSecondsLiteralExpr)
+          Column(Cast(widened, TimestampType))
         case (statCol, TimestampNTZType, _) if pathToStatType.head == MAX =>
           // We also apply the same adjustment of max stats that was applied to Timestamp
          // for TimestampNTZ because these 2 types have the same precision in terms of time.
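The reader-side change above amounts to widening the effective stats range for timestamps. A minimal sketch of that arithmetic (not part of the commit; the instants are hypothetical and stand in for the real Catalyst expression tree):

```scala
import java.time.Instant
import java.time.temporal.ChronoUnit

// Stats parsed from JSON are treated as approximate, so the effective range becomes
// [min - 59s, max + 1ms + 59s]. A value that differs from the recorded stats by up to
// ~59 seconds (offset truncation) or sub-millisecond precision still falls inside it.
object WidenedBoundsSketch extends App {
  val statsMin = Instant.parse("2019-09-09T01:02:33.456Z") // JSON min, possibly too late
  val statsMax = Instant.parse("2019-09-09T01:02:33.456Z") // JSON max, possibly too early

  val effectiveMin = statsMin.minus(59, ChronoUnit.SECONDS)
  val effectiveMax = statsMax.plus(1, ChronoUnit.MILLIS).plus(59, ChronoUnit.SECONDS)

  // The true value written to the file (off by ~30s from the recorded stats) lands inside
  // the widened range, so an equality predicate cannot prune the file.
  val trueValue = Instant.parse("2019-09-09T01:02:03.456789Z")
  assert(!trueValue.isBefore(effectiveMin) && !trueValue.isAfter(effectiveMax))
  println(s"[$effectiveMin, $effectiveMax] contains $trueValue")
}
```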

spark/src/main/scala/org/apache/spark/sql/delta/stats/StatisticsCollection.scala

Lines changed: 47 additions & 1 deletion
@@ -46,7 +46,7 @@ import org.apache.spark.sql.catalyst.parser.{AbstractSqlParser, AstBuilder, Pars
 import org.apache.spark.sql.catalyst.parser.SqlBaseParser.MultipartIdentifierListContext
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.functions.lit
-import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.expressions.UserDefinedFunction
 import org.apache.spark.sql.types._

 /**
@@ -243,6 +243,15 @@ trait StatisticsCollection extends DeltaLogging {
     val stringPrefix =
       spark.sessionState.conf.getConf(DeltaSQLConf.DATA_SKIPPING_STRING_PREFIX_LENGTH)

+    // Formatter used to serialize timestamp MIN/MAX statistics to JSON. Historically, Spark 3.3.2
+    // truncated historical timezone offsets down to minute precision when rendering timestamps,
+    // which could make the serialized stats differ from the true values by up to 59 seconds (see
+    // Delta issue 5249). To avoid that, we mirror Delta 4.0 and explicitly format timestamp stats
+    // using a pattern that preserves offset seconds: `yyyy-MM-dd'T'HH:mm:ss.SSSXXXXX`.
+    val sessionTimeZoneId = spark.sessionState.conf.sessionLocalTimeZone
+    val timestampStatsFormatterUdf =
+      StatisticsCollection.timestampStatsFormatterUdf(sessionTimeZoneId)
+
     // On file initialization/stat recomputation TIGHT_BOUNDS is always set to true
     val tightBoundsColOpt =
       Option.when(deletionVectorsSupported &&
@@ -257,6 +266,10 @@
       case (c, SkippingEligibleDataType(StringType), true) =>
         substring(min(c), 0, stringPrefix)

+      // Format timestamp min stats using a pattern that preserves offset seconds.
+      case (c, SkippingEligibleDataType(TimestampType), true) =>
+        timestampStatsFormatterUdf(min(c).cast(TimestampType))
+
       // Collect all numeric min values
       case (c, SkippingEligibleDataType(_), true) =>
         min(c)
@@ -268,6 +281,10 @@
          DeltaUDF.stringFromString(StatisticsCollection.truncateMaxStringAgg(stringPrefix)_)
        udfTruncateMax(max(c))

+      // Format timestamp max stats using a pattern that preserves offset seconds.
+      case (c, SkippingEligibleDataType(TimestampType), true) =>
+        timestampStatsFormatterUdf(max(c).cast(TimestampType))
+
       // Collect all numeric max values
       case (c, SkippingEligibleDataType(_), true) =>
         max(c)
@@ -409,6 +426,35 @@ object StatisticsCollection extends DeltaCommand {

   val UTF8_MAX_CHARACTER = new String(Character.toChars(Character.MAX_CODE_POINT))

+  /**
+   * Builds a UDF for formatting timestamp statistics using a pattern that preserves offset seconds:
+   * `yyyy-MM-dd'T'HH:mm:ss.SSSXXXXX`. This mirrors Delta 4.0 behavior so that new tables write
+   * precise timestamp stats, while older tables with truncated offsets are handled by the
+   * reader-side widening logic in `DataSkippingReader`.
+   */
+  private[delta] def timestampStatsFormatterUdf(
+      sessionTimeZoneId: String): UserDefinedFunction = {
+    val timeZone =
+      org.apache.spark.sql.delta.util.DateTimeUtils.getTimeZone(sessionTimeZoneId)
+    val formatter =
+      org.apache.spark.sql.delta.util.TimestampFormatter(
+        "yyyy-MM-dd'T'HH:mm:ss.SSSXXXXX",
+        timeZone)
+    val formatTimestamp =
+      (ts: java.sql.Timestamp) => {
+        if (ts == null) {
+          null
+        } else {
+          val micros =
+            org.apache.spark.sql.delta.util.DateTimeUtils.fromJavaTimestamp(ts)
+          org.apache.spark.sql.delta.util.DateTimeUtils.timestampToString(
+            formatter,
+            micros)
+        }
+      }
+    udf(formatTimestamp)
+  }
+
   /**
    * The SQL grammar already includes a `multipartIdentifierList` rule for parsing a string into a
    * list of multi-part identifiers. We just expose it here, with a custom parser and AstBuilder.
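The key detail in the writer-side change is the offset pattern. A small sketch (not part of the commit, using plain `java.time` rather than Spark's internal `TimestampFormatter`) showing that five `X` pattern letters keep offset seconds, while the common three-letter form drops them; the historical +05:53:28 offset is assumed for illustration:

```scala
import java.time.ZonedDateTime
import java.time.format.DateTimeFormatter

// "XXXXX" renders the full offset including seconds; "XXX" renders only hours and minutes.
object OffsetSecondsFormatting extends App {
  val withSeconds = DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss.SSSXXXXX")
  val minutePrecision = DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss.SSSXXX")

  // A pre-1906 timestamp whose zone resolves to a local-mean-time offset with seconds.
  val historical = ZonedDateTime.parse("1900-01-01T12:00:00.123+05:53:28")

  println(withSeconds.format(historical))      // 1900-01-01T12:00:00.123+05:53:28
  println(minutePrecision.format(historical))  // 1900-01-01T12:00:00.123+05:53 (seconds lost)
}
```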

spark/src/test/scala/org/apache/spark/sql/delta/stats/DataSkippingDeltaTests.scala

Lines changed: 42 additions & 0 deletions
@@ -17,6 +17,7 @@
 package org.apache.spark.sql.delta.stats

 import java.io.File
+import java.sql.Timestamp

 import org.apache.spark.sql.delta._
 import org.apache.spark.sql.delta.actions.AddFile
@@ -1698,6 +1699,47 @@ trait DataSkippingDeltaTestsBase extends DeltaExcludedBySparkVersionTestMixinShi
     }
   }

+  test("data skipping with timestamp stats truncated by seconds (issue 5249)") {
+    // This test simulates a table whose timestamp stats in JSON are off by tens of seconds
+    // compared to the actual data values, similar to Delta 3.3.2 behavior described in
+    // https://github.com/delta-io/delta/issues/5249. The DataSkippingReader must treat
+    // such stats as approximate bounds and must NOT skip the file for an equality predicate.
+    withTempDir { dir =>
+      import testImplicits._
+
+      val ts = Timestamp.valueOf("2019-09-09 01:02:03.456789")
+      val df = Seq(ts).toDF("ts")
+      df.write.format("delta").save(dir.getCanonicalPath)
+
+      val log = DeltaLog.forTable(spark, dir.getCanonicalPath)
+
+      // Overwrite stats for the single AddFile to mimic an older writer that recorded
+      // timestamp stats that are off by ~30 seconds from the true value. We set both
+      // min and max to a timestamp 30 seconds *after* the actual value, which would
+      // previously cause data skipping to think that `ts = actual` cannot match.
+      val txn = log.startTransaction()
+      val addFile = txn.filterFiles(Nil).head
+
+      val fakeStatsJson =
+        """{
+          | "numRecords": 1,
+          | "minValues": {"ts": "2019-09-09 01:02:33.456789"},
+          | "maxValues": {"ts": "2019-09-09 01:02:33.456789"},
+          | "nullCount": {"ts": 0}
+          |}""".stripMargin
+
+      txn.commit(Seq(addFile.copy(stats = fakeStatsJson)), DeltaOperations.ComputeStats(Nil))
+      log.update()
+
+      val predicate = """ts = TIMESTAMP '2019-09-09 01:02:03.456789'"""
+      Given(predicate)
+      val numFiles = filesRead(log, predicate)
+      assert(numFiles == 1,
+        s"Expected timestamp file not to be skipped for equality predicate due to widened " +
+        s"timestamp stats bounds; filesRead was $numFiles")
+    }
+  }
+
  testSparkMasterOnly("data skipping by stats - variant type") {
    withTable("tbl") {
      sql("""CREATE TABLE tbl(v VARIANT,
