From 413a65bbe6b87467a582dceddd300db8cd2c7e04 Mon Sep 17 00:00:00 2001
From: Stevo Mitric
Date: Fri, 25 Oct 2024 12:29:56 +0200
Subject: [PATCH] [SPARK-50061][SQL] Enable analyze table for collated columns

### What changes were proposed in this pull request?
In this PR, the `analyze table` command is enabled for collated strings. The current implementation already collects stats based on the collation-aware `Aggregate` expression, so this PR only enables the aggregation.

### Why are the changes needed?
To enable the `analyze table` command for collated strings.

### Does this PR introduce _any_ user-facing change?
Yes. Currently, running:
```sql
ANALYZE TABLE test_table COMPUTE STATISTICS FOR COLUMNS c
```
where `c` is a collated string column fails because of an unsupported data type. This PR addresses the issue and enables the command.

### How was this patch tested?
New test in this PR.

### Was this patch authored or co-authored using generative AI tooling?
No

Closes #48586 from stevomitric/stevomitric/analyze-fix.

Authored-by: Stevo Mitric
Signed-off-by: Max Gekk
---
 .../execution/command/AnalyzeColumnCommand.scala   |  2 +-
 .../sql/execution/command/CommandUtils.scala       |  2 +-
 .../spark/sql/StatisticsCollectionSuite.scala      | 15 +++++++++++++++
 3 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala
index 65a7a0ebbd916..23555c98135f6 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/AnalyzeColumnCommand.scala
@@ -140,7 +140,7 @@ case class AnalyzeColumnCommand(
       case DoubleType | FloatType => true
       case BooleanType => true
       case _: DatetimeType => true
-      case BinaryType | StringType => true
+      case BinaryType | _: StringType => true
       case _ => false
     }
   }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala
index 91454c79df600..48d98c14c3889 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/CommandUtils.scala
@@ -411,7 +411,7 @@ object CommandUtils extends Logging {
       case DoubleType | FloatType => fixedLenTypeStruct
       case BooleanType => fixedLenTypeStruct
       case _: DatetimeType => fixedLenTypeStruct
-      case BinaryType | StringType =>
+      case BinaryType | _: StringType =>
         // For string and binary type, we don't compute min, max or histogram
         val nullLit = Literal(null, col.dataType)
         struct(
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala
index 948a0e3444cd1..8d7ada15381bf 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionSuite.scala
@@ -678,6 +678,21 @@ class StatisticsCollectionSuite extends StatisticsCollectionTestBase with SharedSparkSession
     }
   }
 
+  test("analyze stats for collated strings") {
+    val tableName = "collated_strings"
+    Seq[String]("sr_CI").foreach { collation =>
+      withTable(tableName) {
+        sql(s"CREATE TABLE $tableName (c STRING COLLATE $collation) USING PARQUET")
+        sql(s"INSERT INTO $tableName VALUES ('a'), ('A')")
+        sql(s"ANALYZE TABLE $tableName COMPUTE STATISTICS FOR COLUMNS c")
+
+        val table = getCatalogTable(tableName)
+        assert(table.stats.get.colStats("c") ==
+          CatalogColumnStat(Some(1), None, None, Some(0), Some(1), Some(1)))
+      }
+    }
+  }
+
   test("analyzes table statistics in cached catalog view") {
     def getTableStats(tableName: String): Statistics = {
       spark.table(tableName).queryExecution.optimizedPlan.stats