From 311a855884ca3dfa5511de106efce262989f8f26 Mon Sep 17 00:00:00 2001 From: Sean Owen Date: Tue, 27 Sep 2022 16:17:21 +0800 Subject: [PATCH] [MINOR] Clarify that xxhash64 seed is 42 ### What changes were proposed in this pull request? State that the hash seed used for xxhash64 is 42 in docs. ### Why are the changes needed? It's somewhat non-standard not to seed with 0. Users would have to know this seed to reproduce the hash value. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? N/A Closes #38010 from srowen/Hash42. Authored-by: Sean Owen Signed-off-by: Ruifeng Zheng --- R/pkg/R/functions.R | 2 +- python/pyspark/sql/functions.py | 2 +- .../scala/org/apache/spark/sql/catalyst/expressions/hash.scala | 3 ++- sql/core/src/main/scala/org/apache/spark/sql/functions.scala | 2 +- 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 00e2bec670adc..8bf7b9c8fc98c 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -974,7 +974,7 @@ setMethod("hash", #' @details #' \code{xxhash64}: Calculates the hash code of given columns using the 64-bit #' variant of the xxHash algorithm, and returns the result as a long -#' column. +#' column. The hash computation uses an initial seed of 42. #' #' @rdname column_misc_functions #' @aliases xxhash64 xxhash64,Column-method diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index 38baf9b99135d..9b4b5f15e9e9f 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -5092,7 +5092,7 @@ def hash(*cols: "ColumnOrName") -> Column: def xxhash64(*cols: "ColumnOrName") -> Column: """Calculates the hash code of given columns using the 64-bit variant of the xxHash algorithm, - and returns the result as a long column. + and returns the result as a long column. The hash computation uses an initial seed of 42. .. 
versionadded:: 3.0.0 diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala index 3daf536993879..7ac486f05af1b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala @@ -643,7 +643,8 @@ object Murmur3HashFunction extends InterpretedHashFunction { * A xxHash64 64-bit hash expression. */ @ExpressionDescription( - usage = "_FUNC_(expr1, expr2, ...) - Returns a 64-bit hash value of the arguments.", + usage = "_FUNC_(expr1, expr2, ...) - Returns a 64-bit hash value of the arguments. " + + "Hash seed is 42.", examples = """ Examples: > SELECT _FUNC_('Spark', array(123), 2); diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index 0fdc0038a2947..f5d284f143f7f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -2569,7 +2569,7 @@ object functions { /** * Calculates the hash code of given columns using the 64-bit * variant of the xxHash algorithm, and returns the result as a long - * column. + * column. The hash computation uses an initial seed of 42. * * @group misc_funcs * @since 3.0.0