[SPARK-44853][PYTHON][DOCS] Refine docstring of DataFrame.columns property

### What changes were proposed in this pull request?

This PR refines the docstring of `df.columns` and adds more examples.

### Why are the changes needed?

To make PySpark documentation better.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

doctest

Closes #42540 from allisonwang-db/spark-44853-refine-df-columns.

Authored-by: allisonwang-db <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
allisonwang-db authored and zhengruifeng committed Aug 18, 2023
1 parent bb41cd8 commit fc0be7e
Showing 1 changed file with 58 additions and 4 deletions.

python/pyspark/sql/dataframe.py (+58, -4)
@@ -2084,7 +2084,10 @@ def dtypes(self) -> List[Tuple[str, str]]:
 
     @property
     def columns(self) -> List[str]:
-        """Returns all column names as a list.
+        """
+        Retrieves the names of all columns in the :class:`DataFrame` as a list.
+        The order of the column names in the list reflects their order in the DataFrame.
 
         .. versionadded:: 1.3.0
 
@@ -2094,14 +2097,65 @@ def columns(self) -> List[str]:
         Returns
         -------
         list
-            List of column names.
+            List of column names in the DataFrame.
 
         Examples
         --------
+        Example 1: Retrieve column names of a DataFrame
+
         >>> df = spark.createDataFrame(
-        ...     [(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
+        ...     [(14, "Tom", "CA"), (23, "Alice", "NY"), (16, "Bob", "TX")],
+        ...     ["age", "name", "state"]
+        ... )
         >>> df.columns
-        ['age', 'name']
+        ['age', 'name', 'state']
+
+        Example 2: Using column names to project specific columns
+
+        >>> selected_cols = [col for col in df.columns if col != "age"]
+        >>> df.select(selected_cols).show()
+        +-----+-----+
+        | name|state|
+        +-----+-----+
+        |  Tom|   CA|
+        |Alice|   NY|
+        |  Bob|   TX|
+        +-----+-----+
+
+        Example 3: Checking if a specific column exists in a DataFrame
+
+        >>> "state" in df.columns
+        True
+        >>> "salary" in df.columns
+        False
+
+        Example 4: Iterating over columns to apply a transformation
+
+        >>> import pyspark.sql.functions as f
+        >>> for col_name in df.columns:
+        ...     df = df.withColumn(col_name, f.upper(f.col(col_name)))
+        >>> df.show()
+        +---+-----+-----+
+        |age| name|state|
+        +---+-----+-----+
+        | 14|  TOM|   CA|
+        | 23|ALICE|   NY|
+        | 16|  BOB|   TX|
+        +---+-----+-----+
+
+        Example 5: Renaming columns and checking the updated column names
+
+        >>> df = df.withColumnRenamed("name", "first_name")
+        >>> df.columns
+        ['age', 'first_name', 'state']
+
+        Example 6: Using the `columns` property to ensure two DataFrames have the
+        same columns before a union
+
+        >>> df2 = spark.createDataFrame(
+        ...     [(30, "Eve", "FL"), (40, "Sam", "WA")], ["age", "name", "location"])
+        >>> df.columns == df2.columns
+        False
         """
        return [f.name for f in self.schema.fields]
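As the unchanged last line of the diff shows, the property is a one-line comprehension over the schema's fields, so `columns` always mirrors the schema's field order. A minimal sketch of that pattern, runnable without Spark — `Field`, `Schema`, and `MiniFrame` below are hypothetical stand-ins for illustration, not PySpark classes:

```python
# Sketch of the pattern behind DataFrame.columns: the property simply
# collects the `name` of each field in the schema, preserving field order.
# Field/Schema/MiniFrame are hypothetical stand-ins, not PySpark classes.
from dataclasses import dataclass
from typing import List


@dataclass
class Field:
    name: str
    dtype: str


@dataclass
class Schema:
    fields: List[Field]


class MiniFrame:
    def __init__(self, schema: Schema):
        self.schema = schema

    @property
    def columns(self) -> List[str]:
        # Same shape as the real implementation:
        #     return [f.name for f in self.schema.fields]
        return [f.name for f in self.schema.fields]


df = MiniFrame(
    Schema([Field("age", "bigint"), Field("name", "string"), Field("state", "string")])
)
print(df.columns)  # -> ['age', 'name', 'state']
```

Because the list is derived from the schema on each access, membership checks like `"salary" in df.columns` reflect the current schema rather than a cached copy.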

