
Commit afe95bd

yaooqinn authored and cloud-fan committed
[SPARK-31892][SQL] Disable week-based date fields for parsing
### What changes were proposed in this pull request?

This PR disables the week-based date fields for parsing.

closes apache#28674

### Why are the changes needed?

1. The behavior change is un-fixable if we both fill the gap between SimpleDateFormat and DateTimeFormatter and keep backward compatibility across different JDKs. A lot of effort has been made to prove this at apache#28674.

2. The existing behavior in 2.4 is itself confusing, e.g.

```sql
spark-sql> select to_timestamp('1', 'w');
1969-12-28 00:00:00
spark-sql> select to_timestamp('1', 'u');
1970-01-05 00:00:00
```

The 'u' here goes neither to the Monday of the first week in week-based form nor to the first day of the year in non-week-based form, but to the Monday of the second week in week-based form. And, e.g.

```sql
spark-sql> select to_timestamp('2020 2020', 'YYYY yyyy');
2020-01-01 00:00:00
spark-sql> select to_timestamp('2020 2020', 'yyyy YYYY');
2019-12-29 00:00:00
spark-sql> select to_timestamp('2020 2020 1', 'YYYY yyyy w');
NULL
spark-sql> select to_timestamp('2020 2020 1', 'yyyy YYYY w');
2019-12-29 00:00:00
```

We don't need to import all of this weird behavior from Java.

3. The current test coverage for week-based date fields is almost 0%, which indicates that we have never imagined anyone using them.

4. It avoids JDK bugs: https://issues.apache.org/jira/browse/SPARK-31880

### Does this PR introduce _any_ user-facing change?

Yes, the 'Y'/'W'/'w'/'u'/'F'/'E' pattern letters can no longer be used in datetime parsing functions.

### How was this patch tested?

More tests were added.

Closes apache#28706 from yaooqinn/SPARK-31892.

Authored-by: Kent Yao <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
1 parent c59f51b commit afe95bd

File tree: 18 files changed, +102 −48 lines changed


docs/sql-ref-datetime-pattern.md

Lines changed: 2 additions & 0 deletions
```diff
@@ -136,6 +136,8 @@ The count of pattern letters determines the format.
   During formatting, all valid data will be output even it is in the optional section.
   During parsing, the whole section may be missing from the parsed string.
   An optional section is started by `[` and ended using `]` (or at the end of the pattern).
+
+- Symbols of 'Y', 'W', 'w', 'E', 'u', 'F', 'q' and 'Q' can only be used for datetime formatting, e.g. `date_format`. They are not allowed to be used for datetime parsing, e.g. `to_timestamp`.

 More details for the text style:
```
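To see the documented rule in action: a minimal sketch, assuming a local SparkSession on Spark 3.0+ with the default (non-legacy) `spark.sql.legacy.timeParserPolicy`. The object name is ours; the behavior (formatting allowed, parsing rejected) is what this commit enforces.

```scala
import org.apache.spark.sql.SparkSession

object WeekBasedPatternsDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").getOrCreate()

    // Formatting with a week-based letter ('Y' = week-based year) still works.
    spark.sql("SELECT date_format(TIMESTAMP '2020-06-03 00:00:00', 'YYYY')").show()

    // Parsing with the same letter is now rejected at evaluation time instead
    // of silently producing a confusing result as in 2.4.
    try {
      spark.sql("SELECT to_timestamp('2020', 'YYYY')").show()
    } catch {
      case e: Exception => println(s"rejected as expected: ${e.getMessage}")
    }

    spark.stop()
  }
}
```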

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala

Lines changed: 1 addition & 1 deletion
```diff
@@ -525,7 +525,7 @@ object CatalogColumnStat extends Logging {
     TimestampFormatter(
       format = "yyyy-MM-dd HH:mm:ss.SSSSSS",
       zoneId = ZoneOffset.UTC,
-      needVarLengthSecondFraction = isParsing)
+      isParsing = isParsing)
   }

   /**
```

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVInferSchema.scala

Lines changed: 1 addition & 1 deletion
```diff
@@ -35,7 +35,7 @@ class CSVInferSchema(val options: CSVOptions) extends Serializable {
     options.zoneId,
     options.locale,
     legacyFormat = FAST_DATE_FORMAT,
-    needVarLengthSecondFraction = true)
+    isParsing = true)

   private val decimalParser = if (options.locale == Locale.US) {
     // Special handling the default locale for backward compatibility
```

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityGenerator.scala

Lines changed: 3 additions & 2 deletions
```diff
@@ -47,12 +47,13 @@ class UnivocityGenerator(
     options.zoneId,
     options.locale,
     legacyFormat = FAST_DATE_FORMAT,
-    needVarLengthSecondFraction = false)
+    isParsing = false)
   private val dateFormatter = DateFormatter(
     options.dateFormat,
     options.zoneId,
     options.locale,
-    legacyFormat = FAST_DATE_FORMAT)
+    legacyFormat = FAST_DATE_FORMAT,
+    isParsing = false)

   private def makeConverter(dataType: DataType): ValueConverter = dataType match {
     case DateType =>
```

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala

Lines changed: 3 additions & 2 deletions
```diff
@@ -90,12 +90,13 @@ class UnivocityParser(
     options.zoneId,
     options.locale,
     legacyFormat = FAST_DATE_FORMAT,
-    needVarLengthSecondFraction = true)
+    isParsing = true)
   private lazy val dateFormatter = DateFormatter(
     options.dateFormat,
     options.zoneId,
     options.locale,
-    legacyFormat = FAST_DATE_FORMAT)
+    legacyFormat = FAST_DATE_FORMAT,
+    isParsing = true)

   private val csvFilters = new CSVFilters(filters, requiredSchema)
```

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala

Lines changed: 6 additions & 6 deletions
```diff
@@ -734,7 +734,7 @@ case class DateFormatClass(left: Expression, right: Expression, timeZoneId: Opti
         format.toString,
         zoneId,
         legacyFormat = SIMPLE_DATE_FORMAT,
-        needVarLengthSecondFraction = false)
+        isParsing = false)
     }
   } else None
 }
@@ -745,7 +745,7 @@ case class DateFormatClass(left: Expression, right: Expression, timeZoneId: Opti
       format.toString,
       zoneId,
       legacyFormat = SIMPLE_DATE_FORMAT,
-      needVarLengthSecondFraction = false)
+      isParsing = false)
   } else {
     formatter.get
   }
@@ -890,7 +890,7 @@ abstract class ToTimestamp
         constFormat.toString,
         zoneId,
         legacyFormat = SIMPLE_DATE_FORMAT,
-        needVarLengthSecondFraction = true)
+        isParsing = true)
     } catch {
       case e: SparkUpgradeException => throw e
       case NonFatal(_) => null
@@ -929,7 +929,7 @@ abstract class ToTimestamp
           formatString,
           zoneId,
           legacyFormat = SIMPLE_DATE_FORMAT,
-          needVarLengthSecondFraction = true)
+          isParsing = true)
           .parse(t.asInstanceOf[UTF8String].toString) / downScaleFactor
       } catch {
         case e: SparkUpgradeException => throw e
@@ -1072,7 +1072,7 @@ case class FromUnixTime(sec: Expression, format: Expression, timeZoneId: Option[
        constFormat.toString,
        zoneId,
        legacyFormat = SIMPLE_DATE_FORMAT,
-       needVarLengthSecondFraction = false)
+       isParsing = false)
    } catch {
      case e: SparkUpgradeException => throw e
      case NonFatal(_) => null
@@ -1105,7 +1105,7 @@ case class FromUnixTime(sec: Expression, format: Expression, timeZoneId: Option[
          f.toString,
          zoneId,
          legacyFormat = SIMPLE_DATE_FORMAT,
-         needVarLengthSecondFraction = false)
+         isParsing = false)
          .format(time.asInstanceOf[Long] * MICROS_PER_SECOND))
    } catch {
      case e: SparkUpgradeException => throw e
```

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonGenerator.scala

Lines changed: 3 additions & 2 deletions
```diff
@@ -83,12 +83,13 @@ private[sql] class JacksonGenerator(
     options.zoneId,
     options.locale,
     legacyFormat = FAST_DATE_FORMAT,
-    needVarLengthSecondFraction = false)
+    isParsing = false)
   private val dateFormatter = DateFormatter(
     options.dateFormat,
     options.zoneId,
     options.locale,
-    legacyFormat = FAST_DATE_FORMAT)
+    legacyFormat = FAST_DATE_FORMAT,
+    isParsing = false)

   private def makeWriter(dataType: DataType): ValueWriter = dataType match {
     case NullType =>
```

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala

Lines changed: 3 additions & 2 deletions
```diff
@@ -61,12 +61,13 @@ class JacksonParser(
     options.zoneId,
     options.locale,
     legacyFormat = FAST_DATE_FORMAT,
-    needVarLengthSecondFraction = true)
+    isParsing = true)
   private lazy val dateFormatter = DateFormatter(
     options.dateFormat,
     options.zoneId,
     options.locale,
-    legacyFormat = FAST_DATE_FORMAT)
+    legacyFormat = FAST_DATE_FORMAT,
+    isParsing = true)

   /**
    * Create a converter which converts the JSON documents held by the `JsonParser`
```

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JsonInferSchema.scala

Lines changed: 1 addition & 1 deletion
```diff
@@ -43,7 +43,7 @@ private[sql] class JsonInferSchema(options: JSONOptions) extends Serializable {
     options.zoneId,
     options.locale,
     legacyFormat = FAST_DATE_FORMAT,
-    needVarLengthSecondFraction = true)
+    isParsing = true)

   /**
    * Infer the type of a collection of json records in three stages:
```

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateFormatter.scala

Lines changed: 8 additions & 6 deletions
```diff
@@ -19,7 +19,6 @@ package org.apache.spark.sql.catalyst.util

 import java.text.SimpleDateFormat
 import java.time.{LocalDate, ZoneId}
-import java.time.format.DateTimeFormatter
 import java.util.{Date, Locale}

 import org.apache.commons.lang3.time.FastDateFormat
@@ -42,7 +41,8 @@ class Iso8601DateFormatter(
     pattern: String,
     zoneId: ZoneId,
     locale: Locale,
-    legacyFormat: LegacyDateFormats.LegacyDateFormat)
+    legacyFormat: LegacyDateFormats.LegacyDateFormat,
+    isParsing: Boolean)
   extends DateFormatter with DateTimeFormatterHelper {

   @transient
@@ -131,12 +131,13 @@ object DateFormatter {
       format: Option[String],
       zoneId: ZoneId,
       locale: Locale = defaultLocale,
-      legacyFormat: LegacyDateFormat = LENIENT_SIMPLE_DATE_FORMAT): DateFormatter = {
+      legacyFormat: LegacyDateFormat = LENIENT_SIMPLE_DATE_FORMAT,
+      isParsing: Boolean = true): DateFormatter = {
     val pattern = format.getOrElse(defaultPattern)
     if (SQLConf.get.legacyTimeParserPolicy == LEGACY) {
       getLegacyFormatter(pattern, zoneId, locale, legacyFormat)
     } else {
-      val df = new Iso8601DateFormatter(pattern, zoneId, locale, legacyFormat)
+      val df = new Iso8601DateFormatter(pattern, zoneId, locale, legacyFormat, isParsing)
       df.validatePatternString()
       df
     }
@@ -159,8 +160,9 @@ object DateFormatter {
       format: String,
       zoneId: ZoneId,
       locale: Locale,
-      legacyFormat: LegacyDateFormat): DateFormatter = {
-    getFormatter(Some(format), zoneId, locale, legacyFormat)
+      legacyFormat: LegacyDateFormat,
+      isParsing: Boolean): DateFormatter = {
+    getFormatter(Some(format), zoneId, locale, legacyFormat, isParsing)
   }

   def apply(format: String, zoneId: ZoneId): DateFormatter = {
```

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeFormatterHelper.scala

Lines changed: 12 additions & 5 deletions
```diff
@@ -89,9 +89,9 @@ trait DateTimeFormatterHelper {
   protected def getOrCreateFormatter(
       pattern: String,
       locale: Locale,
-      needVarLengthSecondFraction: Boolean = false): DateTimeFormatter = {
-    val newPattern = convertIncompatiblePattern(pattern)
-    val useVarLen = needVarLengthSecondFraction && newPattern.contains('S')
+      isParsing: Boolean = false): DateTimeFormatter = {
+    val newPattern = convertIncompatiblePattern(pattern, isParsing)
+    val useVarLen = isParsing && newPattern.contains('S')
     val key = (newPattern, locale, useVarLen)
     var formatter = cache.getIfPresent(key)
     if (formatter == null) {
@@ -227,6 +227,12 @@ private object DateTimeFormatterHelper {
     formatter.format(LocalDate.of(2000, 1, 1)) == "1 1"
   }
   final val unsupportedLetters = Set('A', 'c', 'e', 'n', 'N', 'p')
+  // SPARK-31892: The week-based date fields are rarely used and really confusing for parsing
+  // values to datetime, especially when they are mixed with other non-week-based ones.
+  // The quarter fields are also parsed strangely, e.g. when the pattern contains `yMd` and can
+  // be directly resolved, then `q` does check whether the month is valid, but if the date
+  // fields are incomplete, e.g. `yM`, the check is bypassed.
+  final val unsupportedLettersForParsing = Set('Y', 'W', 'w', 'E', 'u', 'F', 'q', 'Q')
   final val unsupportedPatternLengths = {
     // SPARK-31771: Disable Narrow-form TextStyle to avoid silent data change, as it is Full-form in
     // 2.4
@@ -246,7 +252,7 @@ private object DateTimeFormatterHelper {
    * @param pattern The input pattern.
    * @return The pattern for new parser
    */
-  def convertIncompatiblePattern(pattern: String): String = {
+  def convertIncompatiblePattern(pattern: String, isParsing: Boolean = false): String = {
     val eraDesignatorContained = pattern.split("'").zipWithIndex.exists {
       case (patternPart, index) =>
         // Text can be quoted using single quotes, we only check the non-quote parts.
@@ -255,7 +261,8 @@ private object DateTimeFormatterHelper {
     (pattern + " ").split("'").zipWithIndex.map {
       case (patternPart, index) =>
         if (index % 2 == 0) {
-          for (c <- patternPart if unsupportedLetters.contains(c)) {
+          for (c <- patternPart if unsupportedLetters.contains(c) ||
+              (isParsing && unsupportedLettersForParsing.contains(c))) {
             throw new IllegalArgumentException(s"Illegal pattern character: $c")
           }
           for (style <- unsupportedPatternLengths if patternPart.contains(style)) {
```
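To make the gating above concrete, here is a self-contained sketch of the check that `convertIncompatiblePattern` performs; the object name, `check` helper, and `main` driver are ours, while the letter sets and the quote-splitting trick are taken from the diff itself.

```scala
object PatternCheckSketch {
  // Letters the new formatter never supports, for either direction.
  val unsupportedLetters = Set('A', 'c', 'e', 'n', 'N', 'p')
  // Week-based and quarter letters, rejected only when parsing.
  val unsupportedLettersForParsing = Set('Y', 'W', 'w', 'E', 'u', 'F', 'q', 'Q')

  def check(pattern: String, isParsing: Boolean): Unit = {
    // Text between single quotes is a literal, so after splitting on "'",
    // only even-indexed segments contain real pattern letters.
    (pattern + " ").split("'").zipWithIndex.foreach { case (part, index) =>
      if (index % 2 == 0) {
        for (c <- part if unsupportedLetters.contains(c) ||
            (isParsing && unsupportedLettersForParsing.contains(c))) {
          throw new IllegalArgumentException(s"Illegal pattern character: $c")
        }
      }
    }
  }

  def main(args: Array[String]): Unit = {
    check("yyyy-MM-dd", isParsing = true)   // fine: no restricted letters
    check("'week 'w", isParsing = false)    // fine: 'w' is formatting-only
    try check("w", isParsing = true)        // throws: 'w' cannot be parsed
    catch { case e: IllegalArgumentException => println(e.getMessage) }
  }
}
```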

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TimestampFormatter.scala

Lines changed: 8 additions & 8 deletions
```diff
@@ -293,13 +293,13 @@ object TimestampFormatter {
       zoneId: ZoneId,
       locale: Locale = defaultLocale,
       legacyFormat: LegacyDateFormat = LENIENT_SIMPLE_DATE_FORMAT,
-      needVarLengthSecondFraction: Boolean = false): TimestampFormatter = {
+      isParsing: Boolean = false): TimestampFormatter = {
     val pattern = format.getOrElse(defaultPattern)
     if (SQLConf.get.legacyTimeParserPolicy == LEGACY) {
       getLegacyFormatter(pattern, zoneId, locale, legacyFormat)
     } else {
       val tf = new Iso8601TimestampFormatter(
-        pattern, zoneId, locale, legacyFormat, needVarLengthSecondFraction)
+        pattern, zoneId, locale, legacyFormat, isParsing)
       tf.validatePatternString()
       tf
     }
@@ -325,23 +325,23 @@ object TimestampFormatter {
       zoneId: ZoneId,
       locale: Locale,
       legacyFormat: LegacyDateFormat,
-      needVarLengthSecondFraction: Boolean): TimestampFormatter = {
-    getFormatter(Some(format), zoneId, locale, legacyFormat, needVarLengthSecondFraction)
+      isParsing: Boolean): TimestampFormatter = {
+    getFormatter(Some(format), zoneId, locale, legacyFormat, isParsing)
   }

   def apply(
       format: String,
       zoneId: ZoneId,
       legacyFormat: LegacyDateFormat,
-      needVarLengthSecondFraction: Boolean): TimestampFormatter = {
-    getFormatter(Some(format), zoneId, defaultLocale, legacyFormat, needVarLengthSecondFraction)
+      isParsing: Boolean): TimestampFormatter = {
+    getFormatter(Some(format), zoneId, defaultLocale, legacyFormat, isParsing)
   }

   def apply(
       format: String,
       zoneId: ZoneId,
-      needVarLengthSecondFraction: Boolean = false): TimestampFormatter = {
-    getFormatter(Some(format), zoneId, needVarLengthSecondFraction = needVarLengthSecondFraction)
+      isParsing: Boolean = false): TimestampFormatter = {
+    getFormatter(Some(format), zoneId, isParsing = isParsing)
   }

   def apply(zoneId: ZoneId): TimestampFormatter = {
```
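A sketch of what a catalyst-internal call site looks like after the rename, using the `apply` overload and the `SIMPLE_DATE_FORMAT` constant visible in this diff; the object name and pattern values are illustrative only.

```scala
import java.time.ZoneOffset

import org.apache.spark.sql.catalyst.util.TimestampFormatter
import org.apache.spark.sql.catalyst.util.LegacyDateFormats.SIMPLE_DATE_FORMAT

object FormatterCallSiteSketch {
  // isParsing = true now gates the week-based letters and, when the pattern
  // contains 'S', also enables variable-length second-fraction parsing.
  val parser: TimestampFormatter = TimestampFormatter(
    "yyyy-MM-dd HH:mm:ss.SSSSSS",
    ZoneOffset.UTC,
    legacyFormat = SIMPLE_DATE_FORMAT,
    isParsing = true)

  // An output-only formatter opts out of the parse-time restrictions.
  val printer: TimestampFormatter = TimestampFormatter(
    "yyyy-MM-dd HH:mm:ss",
    ZoneOffset.UTC,
    legacyFormat = SIMPLE_DATE_FORMAT,
    isParsing = false)
}
```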

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala

Lines changed: 29 additions & 0 deletions
```diff
@@ -1168,4 +1168,33 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     checkExceptionInExpression[ArithmeticException](
       MillisToTimestamp(Literal(-92233720368547758L)), "long overflow")
   }
+
+  test("Disable week-based date fields and quarter fields for parsing") {
+
+    def checkSparkUpgrade(c: Char): Unit = {
+      checkExceptionInExpression[SparkUpgradeException](
+        new ParseToTimestamp(Literal("1"), Literal(c.toString)).child, "3.0")
+      checkExceptionInExpression[SparkUpgradeException](
+        new ParseToDate(Literal("1"), Literal(c.toString)).child, "3.0")
+      checkExceptionInExpression[SparkUpgradeException](
+        ToUnixTimestamp(Literal("1"), Literal(c.toString)), "3.0")
+      checkExceptionInExpression[SparkUpgradeException](
+        UnixTimestamp(Literal("1"), Literal(c.toString)), "3.0")
+    }
+
+    def checkNullify(c: Char): Unit = {
+      checkEvaluation(new ParseToTimestamp(Literal("1"), Literal(c.toString)).child, null)
+      checkEvaluation(new ParseToDate(Literal("1"), Literal(c.toString)).child, null)
+      checkEvaluation(ToUnixTimestamp(Literal("1"), Literal(c.toString)), null)
+      checkEvaluation(UnixTimestamp(Literal("1"), Literal(c.toString)), null)
+    }
+
+    Seq('Y', 'W', 'w', 'E', 'u', 'F').foreach { l =>
+      checkSparkUpgrade(l)
+    }
+
+    Seq('q', 'Q').foreach { l =>
+      checkNullify(l)
+    }
+  }
 }
```

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeFormatterHelperSuite.scala

Lines changed: 8 additions & 1 deletion
```diff
@@ -17,7 +17,7 @@

 package org.apache.spark.sql.catalyst.util

-import org.apache.spark.{SparkFunSuite, SparkUpgradeException}
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.catalyst.util.DateTimeFormatterHelper._

 class DateTimeFormatterHelperSuite extends SparkFunSuite {
@@ -40,6 +40,13 @@ class DateTimeFormatterHelperSuite extends SparkFunSuite {
       val e = intercept[IllegalArgumentException](convertIncompatiblePattern(s"yyyy-MM-dd $l G"))
       assert(e.getMessage === s"Illegal pattern character: $l")
     }
+    unsupportedLettersForParsing.foreach { l =>
+      val e = intercept[IllegalArgumentException] {
+        convertIncompatiblePattern(s"$l", isParsing = true)
+      }
+      assert(e.getMessage === s"Illegal pattern character: $l")
+      assert(convertIncompatiblePattern(s"$l").nonEmpty)
+    }
     unsupportedPatternLengths.foreach { style =>
       val e1 = intercept[IllegalArgumentException] {
         convertIncompatiblePattern(s"yyyy-MM-dd $style")
```

sql/catalyst/src/test/scala/org/apache/spark/sql/util/DateFormatterSuite.scala

Lines changed: 6 additions & 3 deletions
```diff
@@ -72,7 +72,8 @@ class DateFormatterSuite extends SparkFunSuite with SQLHelper {
           DateFormatter.defaultPattern,
           getZoneId(timeZone),
           DateFormatter.defaultLocale,
-          legacyFormat)
+          legacyFormat,
+          isParsing = false)
         val days = formatter.parse(date)
         assert(date === formatter.format(days))
         assert(date === formatter.format(daysToLocalDate(days)))
@@ -106,7 +107,8 @@ class DateFormatterSuite extends SparkFunSuite with SQLHelper {
           DateFormatter.defaultPattern,
           getZoneId(timeZone),
           DateFormatter.defaultLocale,
-          legacyFormat)
+          legacyFormat,
+          isParsing = false)
         val date = formatter.format(days)
         val parsed = formatter.parse(date)
         assert(days === parsed)
@@ -173,7 +175,8 @@ class DateFormatterSuite extends SparkFunSuite with SQLHelper {
         DateFormatter.defaultPattern,
         getZoneId(timeZone),
         DateFormatter.defaultLocale,
-        legacyFormat)
+        legacyFormat,
+        isParsing = false)
       assert(LocalDate.ofEpochDay(formatter.parse("1000-01-01")) === LocalDate.of(1000, 1, 1))
      assert(formatter.format(LocalDate.of(1000, 1, 1)) === "1000-01-01")
      assert(formatter.format(localDateToDays(LocalDate.of(1000, 1, 1))) === "1000-01-01")
```
