-
Notifications
You must be signed in to change notification settings - Fork 28.4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[SPARK-49992][SQL] Default collation resolution for DDL and DML queries
### What changes were proposed in this pull request?

This PR proposes not using the session-level collation in DDL commands (create/alter view/table, add/replace columns). Also, resolution of the default collation should happen in the analyzer and not in the parser. However, due to how we check for the default string type (using reference equality with the `StringType` object), we cannot just replace this object with `StringType("UTF8_BINARY")`: the two compare as equal, so the tree node framework will just return the old plan. Because of this we have to perform the resolution twice, first changing the `StringType` object into a `TemporaryStringType` and then changing that into `StringType("UTF8_BINARY")`, which is no longer considered a default string type. In addition, the dependent rules `ResolveInlineTables` and `CollationTypeCoercion` are updated so that they don't execute while there are still unresolved string types in the plan.

### Why are the changes needed?

The default collation for DDL commands should be tied to the object being created or altered (e.g., table, view, schema) rather than the session-level setting. Since object-level collations are not yet supported, we will assume the UTF8_BINARY collation by default for now.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Added new unit tests.

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes #48962 from stefankandic/fixSessionCollationOrder.

Lead-authored-by: Stefan Kandic <[email protected]>
Co-authored-by: Wenchen Fan <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
- Loading branch information
1 parent
aaf8590
commit b45045e
Showing
26 changed files
with
855 additions
and
62 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
188 changes: 188 additions & 0 deletions
188
...yst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveDefaultStringTypes.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,188 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package org.apache.spark.sql.catalyst.analysis | ||
|
||
import org.apache.spark.sql.catalyst.expressions.{Cast, Expression, Literal} | ||
import org.apache.spark.sql.catalyst.plans.logical.{AddColumns, AlterColumn, AlterViewAs, ColumnDefinition, CreateView, LogicalPlan, QualifiedColType, ReplaceColumns, V1CreateTablePlan, V2CreateTablePlan} | ||
import org.apache.spark.sql.catalyst.rules.{Rule, RuleExecutor} | ||
import org.apache.spark.sql.types.{DataType, StringType} | ||
|
||
/**
 * Resolves default string types in queries and commands. For queries, the default string type is
 * determined by the session's default string type. For DDL, the default string type is the
 * default type of the object (table -> schema -> catalog). However, this is not implemented yet.
 * So, we will just use UTF8_BINARY for now.
 */
object ResolveDefaultStringTypes extends Rule[LogicalPlan] {

  /**
   * Rewrites every occurrence of the default string type in `plan`.
   *
   * When the first pass changes the plan, a second pass is run and the rule executor is asked
   * for an additional iteration; otherwise the result of the first pass is returned as is.
   */
  def apply(plan: LogicalPlan): LogicalPlan = {
    val newPlan = apply0(plan)
    if (plan.ne(newPlan)) {
      // Due to how tree transformations work and StringType object being equal to
      // StringType("UTF8_BINARY"), we need to transform the plan twice
      // to ensure the correct results for occurrences of default string type.
      val finalPlan = apply0(newPlan)
      RuleExecutor.forceAdditionalIteration(finalPlan)
      finalPlan
    } else {
      newPlan
    }
  }

  /**
   * Runs a single rewrite pass: DDL commands go through [[transformDDL]], everything else is
   * rewritten to the session default string type.
   */
  private def apply0(plan: LogicalPlan): LogicalPlan = {
    if (isDDLCommand(plan)) {
      transformDDL(plan)
    } else {
      transformPlan(plan, sessionDefaultStringType)
    }
  }

  /**
   * Returns whether the given `plan` still contains expressions whose default string type
   * needs to be resolved.
   *
   * A non-DDL plan evaluated while the session uses the default string type never needs
   * resolution, so its expressions are not inspected at all.
   */
  def needsResolution(plan: LogicalPlan): Boolean = {
    val mayNeedResolution = isDDLCommand(plan) || !isDefaultSessionCollationUsed
    mayNeedResolution && plan.exists(node => needsResolution(node.expressions))
  }

  /**
   * Returns whether any of the given `expressions` needs to have its
   * default string type resolved.
   */
  def needsResolution(expressions: Seq[Expression]): Boolean = {
    expressions.exists(needsResolution)
  }

  /**
   * Returns whether the given `expression`, or any expression nested inside it, needs to have
   * its default string type resolved, i.e. whether [[transformExpression]] would rewrite it.
   */
  def needsResolution(expression: Expression): Boolean = {
    val rewrite = transformExpression
    expression.exists(e => rewrite.isDefinedAt(e))
  }

  /** Returns whether the session's configured default string type equals `StringType`. */
  private def isDefaultSessionCollationUsed: Boolean = conf.defaultStringType == StringType

  /**
   * Returns the default string type that should be used in a given DDL command (for now always
   * UTF8_BINARY). The `table` argument is currently not consulted.
   */
  private def stringTypeForDDLCommand(table: LogicalPlan): StringType =
    StringType("UTF8_BINARY")

  /** Returns the session default string type */
  private def sessionDefaultStringType: StringType =
    StringType(conf.defaultStringType.collationId)

  /**
   * Returns whether `plan` is treated as a DDL command: either its root is a create/alter plan,
   * or some node in it is an add/replace/alter column command.
   */
  private def isDDLCommand(plan: LogicalPlan): Boolean = {
    // The create/alter check only concerns the root, so evaluate it once instead of
    // once per visited node.
    isCreateOrAlterPlan(plan) || plan.exists {
      case _: AddColumns | _: ReplaceColumns | _: AlterColumn => true
      case _ => false
    }
  }

  /** Returns whether `plan` itself is a create-table, create-view or alter-view-as plan. */
  private def isCreateOrAlterPlan(plan: LogicalPlan): Boolean = plan match {
    case _: V1CreateTablePlan | _: V2CreateTablePlan | _: CreateView | _: AlterViewAs => true
    case _ => false
  }

  /**
   * Rewrites the default string types of a DDL command to the string type returned by
   * [[stringTypeForDDLCommand]].
   */
  private def transformDDL(plan: LogicalPlan): LogicalPlan = {
    val newType = stringTypeForDDLCommand(plan)

    plan resolveOperators {
      case p if isCreateOrAlterPlan(p) =>
        transformPlan(p, newType)

      case addCols: AddColumns =>
        addCols.copy(columnsToAdd = replaceColumnTypes(addCols.columnsToAdd, newType))

      case replaceCols: ReplaceColumns =>
        replaceCols.copy(columnsToAdd = replaceColumnTypes(replaceCols.columnsToAdd, newType))

      // Only touch ALTER COLUMN commands that actually carry a type with a default string in it.
      case alter: AlterColumn if alter.dataType.exists(hasDefaultStringType) =>
        alter.copy(dataType = alter.dataType.map(replaceDefaultStringType(_, newType)))
    }
  }

  /**
   * Transforms the given plan, by transforming all expressions in its operators to use the given
   * new type instead of the default string type.
   */
  private def transformPlan(plan: LogicalPlan, newType: StringType): LogicalPlan = {
    val rewrite = transformExpression.andThen(_.apply(newType))
    plan resolveExpressionsUp { expression =>
      // Expressions the rewrite is not defined for are returned unchanged.
      rewrite.applyOrElse(expression, identity[Expression])
    }
  }

  /**
   * Transforms the given expression, by changing all default string types to the given new type.
   *
   * Defined only for column definitions, casts and literals whose data type contains a default
   * string type; the result is a function awaiting the replacement type.
   */
  private def transformExpression: PartialFunction[Expression, StringType => Expression] = {
    case columnDef: ColumnDefinition if hasDefaultStringType(columnDef.dataType) =>
      newType => columnDef.copy(dataType = replaceDefaultStringType(columnDef.dataType, newType))

    case cast: Cast if hasDefaultStringType(cast.dataType) =>
      newType => cast.copy(dataType = replaceDefaultStringType(cast.dataType, newType))

    case Literal(value, dt) if hasDefaultStringType(dt) =>
      newType => Literal(value, replaceDefaultStringType(dt, newType))
  }

  /** Returns whether `dataType`, or any type nested inside it, is a default string type. */
  private def hasDefaultStringType(dataType: DataType): Boolean =
    dataType.existsRecursively(isDefaultStringType)

  /**
   * Returns whether `dataType` is the `StringType` object itself (checked by reference) or a
   * [[TemporaryStringType]] placeholder.
   */
  private def isDefaultStringType(dataType: DataType): Boolean = {
    dataType match {
      case _: TemporaryStringType => true
      // should only return true for StringType object and not StringType("UTF8_BINARY")
      case st: StringType => st.eq(StringType)
      case _ => false
    }
  }

  /**
   * Replaces every default string type nested in `dataType` with `newType`.
   *
   * When the current type compares equal to `newType`, a [[TemporaryStringType]] is substituted
   * instead; see the comment in [[apply]] on why the plan is transformed twice.
   */
  private def replaceDefaultStringType(dataType: DataType, newType: StringType): DataType = {
    dataType.transformRecursively {
      case currentType: StringType if isDefaultStringType(currentType) =>
        if (currentType == newType) {
          TemporaryStringType()
        } else {
          newType
        }
    }
  }

  /**
   * Returns `colTypes` with the default string types of each column's data type replaced by
   * `newType`; columns without a default string type are returned untouched.
   */
  private def replaceColumnTypes(
      colTypes: Seq[QualifiedColType],
      newType: StringType): Seq[QualifiedColType] = {
    colTypes.map {
      case colWithDefault if hasDefaultStringType(colWithDefault.dataType) =>
        val replaced = replaceDefaultStringType(colWithDefault.dataType, newType)
        colWithDefault.copy(dataType = replaced)

      case col => col
    }
  }
}
|
||
/**
 * Placeholder string type that `ResolveDefaultStringTypes` substitutes for a default string
 * type when the replacement type would compare equal to the type being replaced.
 * It is constructed with collation id 1 on the parent `StringType`.
 * NOTE(review): presumably the id is chosen only to differ from the default one - confirm.
 */
case class TemporaryStringType() extends StringType(1) {

  /** Renders the placeholder together with its collation id. */
  override def toString: String = {
    s"TemporaryStringType($collationId)"
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.