[SPARK-44508][PYTHON][DOCS] Add user guide for Python user-defined table functions #42272

Closed

Changes from 1 commit
169 changes: 169 additions & 0 deletions examples/src/main/python/sql/udtf.py
@@ -0,0 +1,169 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""
A simple example demonstrating Python UDTFs in Spark
Run with:
./bin/spark-submit examples/src/main/python/sql/udtf.py
"""

# NOTE that this file is imported in user guide in PySpark documentation.

Comment (Contributor): nit: "user guide" -> "User Guides" to follow the official documentation name? Also, maybe adding a doc link (https://spark.apache.org/docs/latest/api/python/user_guide/index.html) would be helpful.

Reply (Author): Yup, it's on the user guide page. I will add a screenshot in the PR description.

# The code is referenced via line numbers. See also the `literalinclude` directive in Sphinx.
import pandas as pd
from typing import Iterator, Any

from pyspark.sql import SparkSession
from pyspark.sql.pandas.utils import require_minimum_pandas_version, require_minimum_pyarrow_version

# Python UDTFs use Arrow by default.
require_minimum_pandas_version()
require_minimum_pyarrow_version()


def python_udtf_simple_example(spark: SparkSession):

from pyspark.sql.functions import lit, udtf

class SimpleUDTF:
def eval(self, x: int, y: int):
yield x + y, x - y

# Now, create a Python UDTF using the defined class and specify a return type
func = udtf(SimpleUDTF, returnType="c1: int, c2: int")

func(lit(1), lit(2)).show()
# +---+---+
# | c1| c2|
# +---+---+
# | 3| -1|
# +---+---+


def python_udtf_registration(spark: SparkSession):

from pyspark.sql.functions import udtf

# Use the decorator to define the UDTF.
@udtf(returnType="c1: int, c2: int")
class PlusOne:
def eval(self, x: int):
yield x, x + 1

# Register the UDTF
spark.udtf.register("plus_one", PlusOne)

# Use the UDTF in SQL
spark.sql("SELECT * FROM plus_one(1)").show()
# +---+---+
# | c1| c2|
# +---+---+
# | 1| 2|
# +---+---+

# Use the UDTF in SQL with lateral join
spark.sql("SELECT * FROM VALUES (0, 1), (1, 2) t(x, y), LATERAL plus_one(x)").show()
# +---+---+---+---+
# | x| y| c1| c2|
# +---+---+---+---+
# | 0| 1| 0| 1|
# | 1| 2| 1| 2|
# +---+---+---+---+


def python_udtf_terminate_example(spark: SparkSession):

from pyspark.sql.functions import udtf

@udtf(returnType="cnt: int")
class CountUDTF:
def __init__(self):
self.count = 0

def eval(self, x):
self.count += 1

def terminate(self):
yield self.count,

Comment (Contributor): qq: should we always yield the data as a tuple for UDTF?

Reply (Author): Yes, each element corresponds to one column in the output schema.
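
For illustration, a minimal sketch of that tuple-per-row contract (a hypothetical example, not part of this diff):

    from pyspark.sql.functions import lit, udtf

    @udtf(returnType="word: string, length: int")
    class WordInfo:
        def eval(self, s: str):
            # One yielded tuple per output row; one element per schema column.
            yield s, len(s)
            # A single-column schema would still need a tuple: yield (value,)

    WordInfo(lit("spark")).show()
    # +-----+------+
    # | word|length|
    # +-----+------+
    # |spark|     5|
    # +-----+------+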



spark.udtf.register("count_udtf", CountUDTF)
    spark.sql("SELECT * FROM range(0, 10, 1, 1), LATERAL count_udtf(id)").show()
# +---+---+
# | id|cnt|
# +---+---+
# | 9| 10|
# +---+---+


def python_udtf_calendar_example(spark: SparkSession):

import datetime
    from pyspark.sql.functions import lit, udtf

@udtf(returnType="date: string, year: int, month: int, day: int, day_of_week: string")
class Calendar:
def eval(self, start_date: str, end_date: str):
start_date = datetime.datetime.strptime(start_date, "%Y-%m-%d").date()
end_date = datetime.datetime.strptime(end_date, "%Y-%m-%d").date()
delta = end_date - start_date
dates = [start_date + datetime.timedelta(days=i) for i in range(delta.days + 1)]
for date in dates:
yield (
date.strftime("%Y-%m-%d"),
date.year,
date.month,
date.day,
date.strftime("%A"),
)

Calendar(lit("2023-01-01"), lit("2023-01-10")).show()
# +----------+----+-----+---+-----------+
# | date|year|month|day|day_of_week|
# +----------+----+-----+---+-----------+
# |2023-01-01|2023| 1| 1| Sunday|
# |2023-01-02|2023| 1| 2| Monday|
# |2023-01-03|2023| 1| 3| Tuesday|
# |2023-01-04|2023| 1| 4| Wednesday|
# |2023-01-05|2023| 1| 5| Thursday|
# |2023-01-06|2023| 1| 6| Friday|
# |2023-01-07|2023| 1| 7| Saturday|
# |2023-01-08|2023| 1| 8| Sunday|
# |2023-01-09|2023| 1| 9| Monday|
# |2023-01-10|2023| 1| 10| Tuesday|
# +----------+----+-----+---+-----------+


if __name__ == "__main__":
spark = SparkSession \
.builder \
.appName("Python UDTF example") \
.getOrCreate()

print("Running simple Python UDTF example")
python_udtf_simple_example(spark)

print("Running Python UDTF registration example")
python_udtf_registration(spark)

print("Running Python UDTF terminate example")
python_udtf_terminate_example(spark)

print("Running Python UDTF calendar example")
python_udtf_calendar_example(spark)

spark.stop()
1 change: 1 addition & 0 deletions python/docs/source/user_guide/sql/index.rst
@@ -24,4 +24,5 @@ Spark SQL
:maxdepth: 2

arrow_pandas
python_udtf

140 changes: 140 additions & 0 deletions python/docs/source/user_guide/sql/python_udtf.rst
@@ -0,0 +1,140 @@
.. Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at

.. http://www.apache.org/licenses/LICENSE-2.0

.. Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.

===========================================
Python User-defined Table Functions (UDTFs)
===========================================

Spark 3.5 introduces a new type of user-defined fucntion: Python user-defined table functions (UDTFs),

Comment (Contributor): typo: "fucntion" -> "function"

which take zero or more arguments and return a set of rows.

Comment (Contributor): Suggested change —
Old: which take zero or more arguments and return a set of rows.
New: wherein each invocation appears in the FROM clause and returns an entire relation as output instead of a single result value. Every UDTF call accepts zero or more arguments, each comprising either a scalar constant expression or a separate input relation.


Implementing a Python UDTF
--------------------------

.. currentmodule:: pyspark.sql.functions

To implement a Python UDTF, you can implement this class:

Comment (Contributor): Suggested change —
Old: To implement a Python UDTF, you can implement this class:
New: To implement a Python UDTF, you can define a class implementing these methods:


.. code-block:: python

class PythonUDTF:

def __init__(self) -> None:
"""
Initialize the user-defined table function (UDTF).

This method is optional to implement and is called once when the UDTF is
instantiated. Use it to perform any initialization required for the UDTF.

Comment (Contributor): Can we also describe the UDTF class instance's lifetime here? For example, any class fields assigned here will be available for subsequent eval method call(s) to consume (either just one eval call for a UDTF call accepting only scalar constant arg(s), or several eval calls for a UDTF call accepting an input relation arg).

Comment (Member): Also, should we mention that it should be a default constructor which doesn't accept any extra arguments?
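
For illustration, a sketch of that lifetime (a hypothetical example, not part of this diff): fields assigned in the no-argument constructor persist across eval calls on one UDTF instance and remain visible in terminate.

    from pyspark.sql.functions import udtf

    @udtf(returnType="total: int")
    class RunningTotal:
        def __init__(self):      # must take no extra arguments
            self.total = 0       # state shared by subsequent eval calls

        def eval(self, x: int):
            self.total += x      # accumulated across input rows

        def terminate(self):
            yield self.total,    # the state is still available here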

"""
...

        def eval(self, *args: Any) -> Iterator[Any]:
            """
Evaluate the function using the given input arguments.

Reply (Author): I am thinking about this too, but I found it difficult to explain in words. The interface is the same as scalar UDFs, so I think Spark users should be able to figure it out. I can provide more examples.

Comment (Contributor, @dtenedor, Aug 3, 2023): 👍 more examples should be helpful. Maybe we could also add:

    The arguments provided to the UDTF call map to the values in this *args list,
    in order. Each provided scalar expression maps to exactly one value in this
    *args list. Each provided TABLE argument of N columns maps to exactly N
    values in this *args list, in the order of the columns as they appear in the
    table.
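
A sketch of the scalar side of that mapping (a hypothetical example, not part of this diff; assumes an active SparkSession named `spark`):

    from pyspark.sql.functions import udtf

    @udtf(returnType="s: string")
    class Repeat:
        def eval(self, word: str, times: int):
            # SELECT * FROM repeat_udtf('hi', 3) binds 'hi' -> word and 3 -> times.
            yield word * times,

    spark.udtf.register("repeat_udtf", Repeat)
    spark.sql("SELECT * FROM repeat_udtf('hi', 3)").show()
    # +------+
    # |     s|
    # +------+
    # |hihihi|
    # +------+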


This method is required to implement.

Args:

Comment (Contributor): I'm not sure if we should follow numpydoc style here, since we follow it in the overall PySpark code base. WDYT @HyukjinKwon?

Reply (Member): Yeah, we should follow numpydoc style, I think.

*args: Arbitrary positional arguments representing the input
to the UDTF.

Yields:
tuple: A tuple representing a single row in the UDTF result relation.
Yield thisas many times as needed to produce multiple rows.

Comment (Contributor): typo?: "thisas" -> "this as"


Note:
- The result must be a tuple.
- UDTFs do not accept keyword arguments on the calling side.
- Use "yield" to produce one row at a time for the UDTF result relation,
or use "return" to produce multiple rows for the UDTF result relation at once.

Example:
def eval(self, x: int, y: int):
yield x + y, x - y

Comment (Contributor): Can we also add an example with a combination of scalar constant arguments and a relation input argument, to show how the mapping from the provided SQL arguments to the Python *args works? Could we include a SQL query and its results with each example as well?


Reply (Author): Sure! I will add a simple one here and a more complex one in the example section below.

"""
...

def terminate(self) -> Iterator[Any]:
"""
Called when the UDTF has processed all rows in a partition.

Comment (Contributor): We haven't really precisely defined what comprises a partition yet. Should we define it using the definitions from #42100 and #42174? Alternatively, if these docs are targeting Spark 3.5 but those PRs are only going into master, we could simply define a partition here as either (1) just one eval call with the provided scalar argument(s), if any, or (2) several eval calls with an undefined subset of the rows from the input relation. Then we can expand it later.


This method is optional to implement and is useful for performing any
cleanup or finalization operations after the UDTF has processed all rows.
You can also yield additional rows if needed.

Yields:
tuple: A tuple representing a single row in the UDTF result relation.
Yield this if you want to return additional rows during termination.

Example:
def terminate(self):
yield "done", None
"""
...
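
The ``eval`` docstring above notes that rows can be produced with either ``yield`` or ``return``; a sketch of the two styles (hypothetical class names, not part of this diff):

.. code-block:: python

    from pyspark.sql.functions import udtf

    @udtf(returnType="a: int")
    class YieldRows:
        def eval(self, n: int):
            for i in range(n):
                yield i,                       # one row at a time

    @udtf(returnType="a: int")
    class ReturnRows:
        def eval(self, n: int):
            return [(i,) for i in range(n)]    # all rows at once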


The return type of the UDTF must be either a ``StructType`` or a DDL string, both of which
define the schema of the UDTF output.
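
For example, the following two definitions are equivalent (a sketch; the ``StructType`` form is spelled out for comparison):

.. code-block:: python

    from pyspark.sql.functions import udtf
    from pyspark.sql.types import IntegerType, StructField, StructType

    # Schema as a DDL string.
    @udtf(returnType="c1: int, c2: int")
    class PlusMinus:
        def eval(self, x: int, y: int):
            yield x + y, x - y

    # The same schema as an explicit StructType.
    schema = StructType([
        StructField("c1", IntegerType()),
        StructField("c2", IntegerType()),
    ])

    @udtf(returnType=schema)
    class PlusMinusStruct:
        def eval(self, x: int, y: int):
            yield x + y, x - y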

Here's a simple example of a UDTF implementation:

.. literalinclude:: ../../../../../examples/src/main/python/sql/udtf.py
:language: python
:lines: 39-53
:dedent: 4


For more detailed usage, please see :func:`udtf`.


Registering and Using Python UDTFs in SQL
-----------------------------------------

Python UDTFs can also be registered and used in SQL queries.

.. literalinclude:: ../../../../../examples/src/main/python/sql/udtf.py
:language: python
:lines: 58-84
:dedent: 4


Apache Arrow
------------
Apache Arrow is an in-memory columnar data format that is used in Spark to efficiently transfer
data between JVM and Python processes. Apache Arrow is enabled by default for Python UDTFs.
You can set ``spark.sql.execution.pythonUDTF.arrow.enabled`` to ``false`` to disable the Arrow optimization.
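
For instance, to disable it for the current session (assuming an active ``spark`` session):

.. code-block:: python

    # Fall back to the non-Arrow code path for Python UDTFs.
    spark.conf.set("spark.sql.execution.pythonUDTF.arrow.enabled", "false")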

For more details, please see `Apache Arrow in PySpark <../arrow_pandas.rst>`_.


More Examples
-------------

A Python UDTF with ``__init__`` and ``terminate``:

.. literalinclude:: ../../../../../examples/src/main/python/sql/udtf.py
:language: python
:lines: 89-109
:dedent: 4


A Python UDTF to generate a list of dates:

.. literalinclude:: ../../../../../examples/src/main/python/sql/udtf.py
:language: python
:lines: 114-148
:dedent: 4