[MINOR][PYTHON][DOCS] Fix the type hint of histogram_numeric

### What changes were proposed in this pull request?

Fix the type hint of `histogram_numeric`.

### Why are the changes needed?

The `nBins` argument cannot be a column name:

```
In [11]: spark.range(100).withColumn("nb", sf.lit(5)).select(sf.histogram_numeric('id', "nb")).show(truncate=False)
...
AnalysisException: [DATATYPE_MISMATCH.NON_FOLDABLE_INPUT] Cannot resolve "histogram_numeric(id, nb)" due to data type mismatch: the input `nb` should be a foldable "INT" expression; however, got "nb". SQLSTATE: 42K09;
'Aggregate [unresolvedalias(histogram_numeric(id#323L, nb#324, 0, 0))]
+- Project [id#323L, 5 AS nb#324]
   +- Range (0, 100, step=1, splits=Some(12))
```

### Does this PR introduce _any_ user-facing change?

doc-only

### How was this patch tested?

updated doctest

### Was this patch authored or co-authored using generative AI tooling?

no

Closes #48875 from zhengruifeng/fix_histogram_numeric.

Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
apache · Nov 18, 2024 · 05750de · 05750de
1 parent 8b2d032
commit 05750de
Show file tree

Hide file tree

Showing 2 changed files with 12 additions and 15 deletions.
diff --git a/python/pyspark/sql/connect/functions/builtin.py b/python/pyspark/sql/connect/functions/builtin.py
@@ -1552,7 +1552,7 @@ def count_if(col: "ColumnOrName") -> Column:
 count_if.__doc__ = pysparkfuncs.count_if.__doc__
 
 
-def histogram_numeric(col: "ColumnOrName", nBins: "ColumnOrName") -> Column:
+def histogram_numeric(col: "ColumnOrName", nBins: Column) -> Column:
     return _invoke_function_over_columns("histogram_numeric", col, nBins)
 
 

diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py
@@ -8557,7 +8557,7 @@ def count_if(col: "ColumnOrName") -> Column:
 
 
 @_try_remote_functions
-def histogram_numeric(col: "ColumnOrName", nBins: "ColumnOrName") -> Column:
+def histogram_numeric(col: "ColumnOrName", nBins: Column) -> Column:
     """Computes a histogram on numeric 'col' using nb bins.
     The return value is an array of (x,y) pairs representing the centers of the
     histogram's bins. As the value of 'nb' is increased, the histogram approximation
@@ -8573,9 +8573,9 @@ def histogram_numeric(col: "ColumnOrName", nBins: "ColumnOrName") -> Column:
 
     Parameters
     ----------
-    col : :class:`~pyspark.sql.Column` or str
+    col : :class:`~pyspark.sql.Column` or column name
         target column to work on.
-    nBins : :class:`~pyspark.sql.Column` or str
+    nBins : :class:`~pyspark.sql.Column`
         number of Histogram columns.
 
     Returns
@@ -8585,17 +8585,14 @@ def histogram_numeric(col: "ColumnOrName", nBins: "ColumnOrName") -> Column:
 
     Examples
     --------
-    >>> df = spark.createDataFrame([("a", 1),
-    ...                             ("a", 2),
-    ...                             ("a", 3),
-    ...                             ("b", 8),
-    ...                             ("b", 2)], ["c1", "c2"])
-    >>> df.select(histogram_numeric('c2', lit(5))).show()
-    +------------------------+
-    |histogram_numeric(c2, 5)|
-    +------------------------+
-    |    [{1, 1.0}, {2, 1....|
-    +------------------------+
+    >>> from pyspark.sql import functions as sf
+    >>> df = spark.range(100, numPartitions=1)
+    >>> df.select(sf.histogram_numeric('id', sf.lit(5))).show(truncate=False)
+    +-----------------------------------------------------------+
+    |histogram_numeric(id, 5)                                   |
+    +-----------------------------------------------------------+
+    |[{11, 25.0}, {36, 24.0}, {59, 23.0}, {84, 25.0}, {98, 3.0}]|
+    +-----------------------------------------------------------+
     """
     return _invoke_function_over_columns("histogram_numeric", col, nBins)