
Commit 96e0d4b

mihailoale-db authored and cloud-fan committed
[SPARK-54352][SQL] Introduce SQLConf.canonicalize to centralize normalization of strings
### What changes were proposed in this pull request?

In this PR I propose that we introduce `SQLConf.canonicalize` to centralize normalization of strings. This will help us avoid code duplication and ease the development of the single-pass analyzer.

### Why are the changes needed?

To ease the development of the single-pass analyzer (avoid code duplication).

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Existing tests.

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes #53063 from mihailoale-db/canonicalizer.

Authored-by: mihailoale-db <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
1 parent 0a42f55 commit 96e0d4b
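
For context, here is a minimal, standalone Scala sketch of the behavior the new helper centralizes. `SketchConf` and its flag are illustrative stand-ins for `SQLConf` and `spark.sql.caseSensitive`, not Spark code; the real helper reads `caseSensitiveAnalysis` from the active `SQLConf`.

```scala
// Illustrative stand-in for SQLConf: when analysis is case-insensitive,
// identifiers are lower-cased before comparison; otherwise they are kept as-is.
final case class SketchConf(caseSensitiveAnalysis: Boolean) {
  def canonicalize(s: String): String =
    if (!caseSensitiveAnalysis) s.toLowerCase else s
}

object CanonicalizeSketch {
  def main(args: Array[String]): Unit = {
    val caseInsensitive = SketchConf(caseSensitiveAnalysis = false)
    val caseSensitive   = SketchConf(caseSensitiveAnalysis = true)
    println(caseInsensitive.canonicalize("myCol")) // prints "mycol"
    println(caseSensitive.canonicalize("myCol"))   // prints "myCol"
  }
}
```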

File tree

2 files changed: +18 -14 lines


sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/higherOrderFunctions.scala

Lines changed: 4 additions & 14 deletions

```diff
@@ -41,16 +41,6 @@ object ResolveLambdaVariables extends Rule[LogicalPlan] {
 
   type LambdaVariableMap = Map[String, NamedExpression]
 
-  private def canonicalizer = {
-    if (!conf.caseSensitiveAnalysis) {
-      // scalastyle:off caselocale
-      s: String => s.toLowerCase
-      // scalastyle:on caselocale
-    } else {
-      s: String => s
-    }
-  }
-
   override def apply(plan: LogicalPlan): LogicalPlan = {
     plan.resolveOperatorsWithPruning(
       _.containsAnyPattern(HIGH_ORDER_FUNCTION, LAMBDA_FUNCTION, LAMBDA_VARIABLE), ruleId) {
@@ -80,11 +70,11 @@ object ResolveLambdaVariables extends Rule[LogicalPlan] {
             "actualNumArgs" -> argInfo.size.toString))
       }
 
-      if (names.map(a => canonicalizer(a.name)).distinct.size < names.size) {
+      if (names.map(a => conf.canonicalize(a.name)).distinct.size < names.size) {
         e.failAnalysis(
           errorClass = "INVALID_LAMBDA_FUNCTION_CALL.DUPLICATE_ARG_NAMES",
           messageParameters = Map(
-            "args" -> names.map(a => canonicalizer(a.name)).map(toSQLId(_)).mkString(", "),
+            "args" -> names.map(a => conf.canonicalize(a.name)).map(toSQLId(_)).mkString(", "),
             "caseSensitiveConfig" -> toSQLConf(SQLConf.CASE_SENSITIVE.key)))
       }
 
@@ -124,11 +114,11 @@ object ResolveLambdaVariables extends Rule[LogicalPlan] {
       l
 
     case l: LambdaFunction if !l.hidden =>
-      val lambdaMap = l.arguments.map(v => canonicalizer(v.name) -> v).toMap
+      val lambdaMap = l.arguments.map(v => conf.canonicalize(v.name) -> v).toMap
      l.mapChildren(resolve(_, parentLambdaMap ++ lambdaMap))
 
    case u @ UnresolvedNamedLambdaVariable(name +: nestedFields) =>
-      parentLambdaMap.get(canonicalizer(name)) match {
+      parentLambdaMap.get(conf.canonicalize(name)) match {
        case Some(lambda) =>
          nestedFields.foldLeft(lambda: Expression) { (expr, fieldName) =>
            ExtractValue(expr, Literal(fieldName), conf.resolver)
```
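
As a hedged usage sketch of the behavior this rule enforces (unchanged by the patch, only refactored to use the shared helper): under the default case-insensitive analysis, `x` and `X` canonicalize to the same lambda argument name, so a query like the one below is expected to fail analysis with `INVALID_LAMBDA_FUNCTION_CALL.DUPLICATE_ARG_NAMES`. The session setup and object name are illustrative.

```scala
import org.apache.spark.sql.SparkSession

// Expected-behavior sketch: with spark.sql.caseSensitive=false (the default),
// the two lambda arguments `x` and `X` collide after canonicalization, and the
// analyzer should reject the query with DUPLICATE_ARG_NAMES.
object LambdaDuplicateArgsDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[1]").appName("demo").getOrCreate()
    try {
      // transform accepts an (element, index) lambda, so two argument names are
      // syntactically legal; the duplicate-name check happens during analysis.
      spark.sql("SELECT transform(array(1, 2), (x, X) -> x + X)").show()
    } catch {
      case e: org.apache.spark.sql.AnalysisException => println(e.getMessage)
    } finally {
      spark.stop()
    }
  }
}
```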

sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala

Lines changed: 14 additions & 0 deletions

```diff
@@ -7220,6 +7220,20 @@ class SQLConf extends Serializable with Logging with SqlApiConf {
     }
   }
 
+  /**
+   * Returns the lower case representation of a string if `caseSensitiveAnalysis` is disabled.
+   * Otherwise, returns the original string.
+   */
+  def canonicalize(s: String): String = {
+    if (!caseSensitiveAnalysis) {
+      // scalastyle:off caselocale
+      s.toLowerCase
+      // scalastyle:on caselocale
+    } else {
+      s
+    }
+  }
+
   /**
    * Returns the error handler for handling hint errors.
    */
```
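
A hedged sketch of how analyzer-side code can pick up the new helper: anything that mixes in `SQLConfHelper` (as catalyst rules do) already exposes `conf`, so identifier comparisons can go through the shared method instead of a rule-local `canonicalizer`. The `IdentifierMatcher` object below is hypothetical, for illustration only.

```scala
import org.apache.spark.sql.catalyst.SQLConfHelper

// Hypothetical example object (not part of this patch): compares two
// identifiers under the current analysis mode by canonicalizing both sides
// through the shared SQLConf.canonicalize helper.
object IdentifierMatcher extends SQLConfHelper {
  def matches(a: String, b: String): Boolean =
    conf.canonicalize(a) == conf.canonicalize(b)
}
```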
