Skip to content

Commit

Permalink
[SPARK-50093][SQL] Collations that use ICU should have the version of…
Browse files Browse the repository at this point in the history
… the used ICU library

### What changes were proposed in this pull request?
Change the versions of the collations that use ICU to be the version of the ICU library currently used in Spark.

### Why are the changes needed?
Changing the version of the ICU library should automatically change the versions of the collations that depend on it. Also, even though UTF8_LCASE doesn't use ICU for comparisons, it still relies on ICU's case mappings between uppercase and lowercase letters, and for that reason it is included as well.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
Added new unit tests.

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes #48626 from stefankandic/changeCollationVersion.

Authored-by: Stefan Kandic <[email protected]>
Signed-off-by: Max Gekk <[email protected]>
  • Loading branch information
stefankandic authored and MaxGekk committed Oct 24, 2024
1 parent 1985b9c commit e55a806
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,8 @@ public static class Collation {

/**
* Version of the collation. This is the version of the ICU library Collator.
* For non-ICU collations (e.g. UTF8 Binary) the version is set to "1.0".
* For ICU collations, UTF8_LCASE (because it uses ICU mappings) and UTF8 Binary
* the version is set to the version of the ICU library.
* When using ICU Collator this version is exposed through collator.getVersion().
* Whenever the collation is updated, the version should be updated as well or kept
* for backwards compatibility.
Expand Down Expand Up @@ -574,7 +575,7 @@ protected Collation buildCollation() {
PROVIDER_SPARK,
null,
comparator,
"1.0",
CollationSpecICU.ICU_VERSION,
hashFunction,
equalsFunction,
/* isUtf8BinaryType = */ true,
Expand All @@ -601,7 +602,7 @@ protected Collation buildCollation() {
PROVIDER_SPARK,
null,
comparator,
"1.0",
CollationSpecICU.ICU_VERSION,
hashFunction,
(s1, s2) -> comparator.compare(s1, s2) == 0,
/* isUtf8BinaryType = */ false,
Expand Down Expand Up @@ -661,10 +662,16 @@ protected String normalizedCollationName() {
}

static List<CollationIdentifier> listCollations() {
CollationIdentifier UTF8_BINARY_COLLATION_IDENT =
new CollationIdentifier(PROVIDER_SPARK, UTF8_BINARY_COLLATION_NAME, "1.0");
CollationIdentifier UTF8_LCASE_COLLATION_IDENT =
new CollationIdentifier(PROVIDER_SPARK, UTF8_LCASE_COLLATION_NAME, "1.0");
CollationIdentifier UTF8_BINARY_COLLATION_IDENT = new CollationIdentifier(
PROVIDER_SPARK,
UTF8_BINARY_COLLATION_NAME,
CollationSpecICU.ICU_VERSION
);
CollationIdentifier UTF8_LCASE_COLLATION_IDENT = new CollationIdentifier(
PROVIDER_SPARK,
UTF8_LCASE_COLLATION_NAME,
CollationSpecICU.ICU_VERSION
);
return Arrays.asList(UTF8_BINARY_COLLATION_IDENT, UTF8_LCASE_COLLATION_IDENT);
}

Expand Down Expand Up @@ -739,9 +746,11 @@ private enum AccentSensitivity {
private static final Map<String, Integer> ICULocaleToId = new HashMap<>();

/**
* ICU library Collator version passed to `Collation` instance.
* ICU library version.
*/
private static final String ICU_COLLATOR_VERSION = "153.120.0.0";
private static final String ICU_VERSION = String.format("%d.%d",
VersionInfo.ICU_VERSION.getMajor(),
VersionInfo.ICU_VERSION.getMinor());

static {
ICULocaleMap.put("UNICODE", ULocale.ROOT);
Expand Down Expand Up @@ -987,7 +996,7 @@ protected Collation buildCollation() {
PROVIDER_ICU,
collator,
comparator,
ICU_COLLATOR_VERSION,
ICU_VERSION,
hashFunction,
(s1, s2) -> comparator.compare(s1, s2) == 0,
/* isUtf8BinaryType = */ false,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,28 +32,35 @@ import org.apache.spark.sql.catalyst.util.CollationFactory._
import org.apache.spark.unsafe.types.UTF8String.{fromString => toUTF8}

class CollationFactorySuite extends AnyFunSuite with Matchers { // scalastyle:ignore funsuite

// Expected collation version string; tests in this suite compare it against the
// `version` of the fetched collations.
// NOTE(review): presumably this must be bumped whenever the bundled ICU library
// is upgraded — confirm.
val currentIcuVersion: String = "75.1"

test("collationId stability") {
  // The indeterminate collation is identified by a fixed sentinel id.
  assert(INDETERMINATE_COLLATION_ID == -1)

  // Each row: (id constant under test, expected numeric id, expected name,
  // expected value of isUtf8BinaryType). Rows are checked in declaration order.
  val expectedCollations = Seq(
    (UTF8_BINARY_COLLATION_ID, 0, "UTF8_BINARY", true),
    (UTF8_LCASE_COLLATION_ID, 1, "UTF8_LCASE", false),
    (UNICODE_COLLATION_ID, 1 << 29, "UNICODE", false),
    (UNICODE_CI_COLLATION_ID, (1 << 29) | (1 << 17), "UNICODE_CI", false)
  )

  expectedCollations.foreach { case (actualId, expectedId, expectedName, expectedBinary) =>
    // The numeric id itself must not drift.
    assert(actualId == expectedId)

    // The collation resolved from that id must keep its name, binary flag and version.
    val resolved = fetchCollation(actualId)
    assert(resolved.collationName == expectedName)
    assert(resolved.isUtf8BinaryType == expectedBinary)
    assert(resolved.version == currentIcuVersion)
  }
}

test("UTF8_BINARY and ICU root locale collation names") {
Expand Down

0 comments on commit e55a806

Please sign in to comment.