From e55a80651db85d95a379ea1ff3af1955cc90696d Mon Sep 17 00:00:00 2001 From: Stefan Kandic Date: Thu, 24 Oct 2024 15:09:25 +0200 Subject: [PATCH] [SPARK-50093][SQL] Collations that use ICU should have the version of the used ICU library ### What changes were proposed in this pull request? Changing the versions of the collations that use ICU to be that of the currently used ICU library in spark. ### Why are the changes needed? Changing the version of the ICU library should automatically change the versions of collations that depend on it. Also, even though the UTF8_LCASE doesn't use ICU for comparisons it still relies on ICU's case mappings between upper and lower case letters and is for that reason included as well. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Added new unit tests. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #48626 from stefankandic/changeCollationVersion. Authored-by: Stefan Kandic Signed-off-by: Max Gekk --- .../sql/catalyst/util/CollationFactory.java | 29 ++++++++++++------- .../unsafe/types/CollationFactorySuite.scala | 7 +++++ 2 files changed, 26 insertions(+), 10 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java index 4a61e630fef39..1305d82bcd785 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java @@ -113,7 +113,8 @@ public static class Collation { /** * Version of the collation. This is the version of the ICU library Collator. - * For non-ICU collations (e.g. UTF8 Binary) the version is set to "1.0". + * For ICU collations, UTF8_BINARY and UTF8_LCASE (which relies on ICU case mappings) + * the version is set to the version of the ICU library. 
* When using ICU Collator this version is exposed through collator.getVersion(). * Whenever the collation is updated, the version should be updated as well or kept * for backwards compatibility. @@ -574,7 +575,7 @@ protected Collation buildCollation() { PROVIDER_SPARK, null, comparator, - "1.0", + CollationSpecICU.ICU_VERSION, hashFunction, equalsFunction, /* isUtf8BinaryType = */ true, @@ -601,7 +602,7 @@ protected Collation buildCollation() { PROVIDER_SPARK, null, comparator, - "1.0", + CollationSpecICU.ICU_VERSION, hashFunction, (s1, s2) -> comparator.compare(s1, s2) == 0, /* isUtf8BinaryType = */ false, @@ -661,10 +662,16 @@ protected String normalizedCollationName() { } static List listCollations() { - CollationIdentifier UTF8_BINARY_COLLATION_IDENT = - new CollationIdentifier(PROVIDER_SPARK, UTF8_BINARY_COLLATION_NAME, "1.0"); - CollationIdentifier UTF8_LCASE_COLLATION_IDENT = - new CollationIdentifier(PROVIDER_SPARK, UTF8_LCASE_COLLATION_NAME, "1.0"); + CollationIdentifier UTF8_BINARY_COLLATION_IDENT = new CollationIdentifier( + PROVIDER_SPARK, + UTF8_BINARY_COLLATION_NAME, + CollationSpecICU.ICU_VERSION + ); + CollationIdentifier UTF8_LCASE_COLLATION_IDENT = new CollationIdentifier( + PROVIDER_SPARK, + UTF8_LCASE_COLLATION_NAME, + CollationSpecICU.ICU_VERSION + ); return Arrays.asList(UTF8_BINARY_COLLATION_IDENT, UTF8_LCASE_COLLATION_IDENT); } @@ -739,9 +746,11 @@ private enum AccentSensitivity { private static final Map ICULocaleToId = new HashMap<>(); /** - * ICU library Collator version passed to `Collation` instance. + * ICU library version. 
*/ - private static final String ICU_COLLATOR_VERSION = "153.120.0.0"; + private static final String ICU_VERSION = String.format("%d.%d", + VersionInfo.ICU_VERSION.getMajor(), + VersionInfo.ICU_VERSION.getMinor()); static { ICULocaleMap.put("UNICODE", ULocale.ROOT); @@ -987,7 +996,7 @@ protected Collation buildCollation() { PROVIDER_ICU, collator, comparator, - ICU_COLLATOR_VERSION, + ICU_VERSION, hashFunction, (s1, s2) -> comparator.compare(s1, s2) == 0, /* isUtf8BinaryType = */ false, diff --git a/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/CollationFactorySuite.scala b/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/CollationFactorySuite.scala index df9af1579d4f1..6daaf2a4c6759 100644 --- a/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/CollationFactorySuite.scala +++ b/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/CollationFactorySuite.scala @@ -32,6 +32,9 @@ import org.apache.spark.sql.catalyst.util.CollationFactory._ import org.apache.spark.unsafe.types.UTF8String.{fromString => toUTF8} class CollationFactorySuite extends AnyFunSuite with Matchers { // scalastyle:ignore funsuite + + val currentIcuVersion: String = "75.1" + test("collationId stability") { assert(INDETERMINATE_COLLATION_ID == -1) @@ -39,21 +42,25 @@ class CollationFactorySuite extends AnyFunSuite with Matchers { // scalastyle:ig val utf8Binary = fetchCollation(UTF8_BINARY_COLLATION_ID) assert(utf8Binary.collationName == "UTF8_BINARY") assert(utf8Binary.isUtf8BinaryType) + assert(utf8Binary.version == currentIcuVersion) assert(UTF8_LCASE_COLLATION_ID == 1) val utf8Lcase = fetchCollation(UTF8_LCASE_COLLATION_ID) assert(utf8Lcase.collationName == "UTF8_LCASE") assert(!utf8Lcase.isUtf8BinaryType) + assert(utf8Lcase.version == currentIcuVersion) assert(UNICODE_COLLATION_ID == (1 << 29)) val unicode = fetchCollation(UNICODE_COLLATION_ID) assert(unicode.collationName == "UNICODE") assert(!unicode.isUtf8BinaryType) + assert(unicode.version == 
currentIcuVersion) assert(UNICODE_CI_COLLATION_ID == ((1 << 29) | (1 << 17))) val unicodeCi = fetchCollation(UNICODE_CI_COLLATION_ID) assert(unicodeCi.collationName == "UNICODE_CI") assert(!unicodeCi.isUtf8BinaryType) + assert(unicodeCi.version == currentIcuVersion) } test("UTF8_BINARY and ICU root locale collation names") {