From b57d863afda1dd78c3bdea8fcb02a3bd55cb137f Mon Sep 17 00:00:00 2001 From: panbingkun Date: Thu, 22 Aug 2024 18:20:48 +0200 Subject: [PATCH] [SPARK-49353][SQL] Update docs related to `UTF-32` encoding/decoding ### What changes were proposed in this pull request? This PR aims to update the related docs now that `encoding` and `decoding` support `UTF-32`, including: - the `doc` of the sql config `spark.sql.legacy.javaCharsets` - connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala - sql/core/src/main/scala/org/apache/spark/sql/functions.scala - python/pyspark/sql/functions/builtin.py ### Why are the changes needed? After PR https://github.com/apache/spark/pull/46469, `UTF-32` for string encoding and decoding is already supported, but some related documents have not been updated synchronously. Let's update them to avoid misunderstandings for end-users and developers. https://github.com/apache/spark/blob/e93c5fbe81d21f8bf2ce52867013d06a63c7956e/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/CharsetProvider.scala#L26 ### Does this PR introduce _any_ user-facing change? Yes, it fixes the docs. ### How was this patch tested? No tests needed; this patch only fixes some docs. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #47844 from panbingkun/SPARK-49353. 
Authored-by: panbingkun Signed-off-by: Max Gekk --- .../src/main/scala/org/apache/spark/sql/functions.scala | 8 ++++---- python/pyspark/sql/functions/builtin.py | 4 ++-- .../scala/org/apache/spark/sql/internal/SQLConf.scala | 3 ++- .../src/main/scala/org/apache/spark/sql/functions.scala | 4 ++-- 4 files changed, 10 insertions(+), 9 deletions(-) diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala index c0bf9c9d013ca..3b6675362d556 100644 --- a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala @@ -3840,8 +3840,8 @@ object functions { /** * Computes the first argument into a string from a binary using the provided character set (one - * of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16'). If either argument - * is null, the result will also be null. + * of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16', 'UTF-32'). If either + * argument is null, the result will also be null. * * @group string_funcs * @since 3.4.0 @@ -3851,8 +3851,8 @@ object functions { /** * Computes the first argument into a binary from a string using the provided character set (one - * of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16'). If either argument - * is null, the result will also be null. + * of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16', 'UTF-32'). If either + * argument is null, the result will also be null. 
* * @group string_funcs * @since 3.4.0 diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py index 24b8ae82e99ad..387a039758f1e 100644 --- a/python/pyspark/sql/functions/builtin.py +++ b/python/pyspark/sql/functions/builtin.py @@ -10989,7 +10989,7 @@ def concat_ws(sep: str, *cols: "ColumnOrName") -> Column: def decode(col: "ColumnOrName", charset: str) -> Column: """ Computes the first argument into a string from a binary using the provided character set - (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16'). + (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16', 'UTF-32'). .. versionadded:: 1.5.0 @@ -11027,7 +11027,7 @@ def decode(col: "ColumnOrName", charset: str) -> Column: def encode(col: "ColumnOrName", charset: str) -> Column: """ Computes the first argument into a binary from a string using the provided character set - (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16'). + (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16', 'UTF-32'). .. versionadded:: 1.5.0 diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index e3f3350ed636e..a2bc56a73bc4e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -5090,7 +5090,8 @@ object SQLConf { .internal() .doc("When set to true, the functions like `encode()` can use charsets from JDK while " + "encoding or decoding string values. 
If it is false, such functions support only one of " + - "the charsets: 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16'.") + "the charsets: 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16', " + + "'UTF-32'.") .version("4.0.0") .booleanConf .createWithDefault(false) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index be83444a8fd33..62315123a858c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -3752,7 +3752,7 @@ object functions { /** * Computes the first argument into a string from a binary using the provided character set - * (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16'). + * (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16', 'UTF-32'). * If either argument is null, the result will also be null. * * @group string_funcs @@ -3763,7 +3763,7 @@ object functions { /** * Computes the first argument into a binary from a string using the provided character set - * (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16'). + * (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16', 'UTF-32'). * If either argument is null, the result will also be null. * * @group string_funcs