From 413a65bbe6b87467a582dceddd300db8cd2c7e04 Mon Sep 17 00:00:00 2001
From: Stevo Mitric
Date: Fri, 25 Oct 2024 12:29:56 +0200
Subject: [PATCH] [SPARK-50061][SQL] Enable analyze table for collated columns

### What changes were proposed in this pull request?
In this PR, the `analyze table` command is enabled for collated strings. The current implementation already collects stats based on the collation-aware `Aggregate` expression, so this PR only enables the aggregation.

### Why are the changes needed?
To enable the `analyze table` command for collated strings.

### Does this PR introduce _any_ user-facing change?
Yes. Currently, running:
```sql
ANALYZE TABLE test_table COMPUTE STATISTICS FOR COLUMNS c
```
where `c` is a collated string column fails because of an unsupported data type. This PR addresses the issue and enables the command.

### How was this patch tested?
New test in this PR.

### Was this patch authored or co-authored using generative AI tooling?
No

Closes #48586 from stevomitric/stevomitric/analyze-fix.

Authored-by: Stevo Mitric
Signed-off-by: Max Gekk
---
 .../execution/command/AnalyzeColumnCommand.scala   |  2 +-
 .../sql/execution/command/CommandUtils.scala       |  2 +-
 .../spark/sql/StatisticsCollectionSuite.scala      | 15 +++++++++++++++
 3 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala
index 65a7a0ebbd916..23555c98135f6 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala
@@ -140,7 +140,7 @@ case class AnalyzeColumnCommand(
       case DoubleType | FloatType => true
       case BooleanType => true
       case _: DatetimeType => true
-      case BinaryType | StringType => true
+      case BinaryType | _: StringType => true
       case _ => false
     }
   }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala
index 91454c79df600..48d98c14c3889 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala
@@ -411,7 +411,7 @@ object CommandUtils extends Logging {
       case DoubleType | FloatType => fixedLenTypeStruct
       case BooleanType => fixedLenTypeStruct
       case _: DatetimeType => fixedLenTypeStruct
-      case BinaryType | StringType =>
+      case BinaryType | _: StringType =>
         // For string and binary type, we don't compute min, max or histogram
         val nullLit = Literal(null, col.dataType)
         struct(
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala
index 948a0e3444cd1..8d7ada15381bf 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala
@@ -678,6 +678,21 @@ class StatisticsCollectionSuite extends StatisticsCollectionTestBase with SharedSparkSession
     }
   }
 
+  test("analyze stats for collated strings") {
+    val tableName = "collated_strings"
+    Seq[String]("sr_CI").foreach { collation =>
+      withTable(tableName) {
+        sql(s"CREATE TABLE $tableName (c STRING COLLATE $collation) USING PARQUET")
+        sql(s"INSERT INTO $tableName VALUES ('a'), ('A')")
+        sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS FOR COLUMNS c")
+
+        val table = getCatalogTable(tableName)
+        assert(table.stats.get.colStats("c") ==
+          CatalogColumnStat(Some(1), None, None, Some(0), Some(1), Some(1)))
+      }
+    }
+  }
+
   test("analyzes table statistics in cached catalog view") {
     def getTableStats(tableName: String): Statistics = {
       spark.table(tableName).queryExecution.optimizedPlan.stats