From 311a855884ca3dfa5511de106efce262989f8f26 Mon Sep 17 00:00:00 2001
From: Sean Owen <srowen@gmail.com>
Date: Tue, 27 Sep 2022 16:17:21 +0800
Subject: [PATCH] [MINOR] Clarify that xxhash64 seed is 42

### What changes were proposed in this pull request?

State that the hash seed used for xxhash64 is 42 in docs.

### Why are the changes needed?

It's somewhat non-standard not to seed with 0. Users would have to know this seed to reproduce the hash value.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

N/A

Closes #38010 from srowen/Hash42.

Authored-by: Sean Owen <srowen@gmail.com>
Signed-off-by: Ruifeng Zheng <ruifengz@apache.org>
---
 R/pkg/R/functions.R                                            | 2 +-
 python/pyspark/sql/functions.py                                | 2 +-
 .../scala/org/apache/spark/sql/catalyst/expressions/hash.scala | 3 ++-
 sql/core/src/main/scala/org/apache/spark/sql/functions.scala   | 2 +-
 4 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index 00e2bec670adc..8bf7b9c8fc98c 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -974,7 +974,7 @@ setMethod("hash",
 #' @details
 #' \code{xxhash64}: Calculates the hash code of given columns using the 64-bit
 #' variant of the xxHash algorithm, and returns the result as a long
-#' column.
+#' column. The hash computation uses an initial seed of 42.
 #'
 #' @rdname column_misc_functions
 #' @aliases xxhash64 xxhash64,Column-method
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 38baf9b99135d..9b4b5f15e9e9f 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -5092,7 +5092,7 @@ def hash(*cols: "ColumnOrName") -> Column:
 
 def xxhash64(*cols: "ColumnOrName") -> Column:
     """Calculates the hash code of given columns using the 64-bit variant of the xxHash algorithm,
-    and returns the result as a long column.
+    and returns the result as a long column. The hash computation uses an initial seed of 42.
 
     .. versionadded:: 3.0.0
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala
index 3daf536993879..7ac486f05af1b 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala
@@ -643,7 +643,8 @@ object Murmur3HashFunction extends InterpretedHashFunction {
  * A xxHash64 64-bit hash expression.
  */
 @ExpressionDescription(
-  usage = "_FUNC_(expr1, expr2, ...) - Returns a 64-bit hash value of the arguments.",
+  usage = "_FUNC_(expr1, expr2, ...) - Returns a 64-bit hash value of the arguments. " +
+    "Hash seed is 42.",
   examples = """
     Examples:
       > SELECT _FUNC_('Spark', array(123), 2);
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index 0fdc0038a2947..f5d284f143f7f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -2569,7 +2569,7 @@ object functions {
   /**
    * Calculates the hash code of given columns using the 64-bit
    * variant of the xxHash algorithm, and returns the result as a long
-   * column.
+   * column. The hash computation uses an initial seed of 42.
    *
    * @group misc_funcs
    * @since 3.0.0