From fc0be7ebace3aaf22954f1311532db5c33f4d8fa Mon Sep 17 00:00:00 2001
From: allisonwang-db
Date: Fri, 18 Aug 2023 17:31:20 +0800
Subject: [PATCH] [SPARK-44853][PYTHON][DOCS] Refine docstring of
 DataFrame.columns property

### What changes were proposed in this pull request?

This PR refines the docstring of `df.columns` and adds more examples.

### Why are the changes needed?

To make PySpark documentation better.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

doctest

Closes #42540 from allisonwang-db/spark-44853-refine-df-columns.

Authored-by: allisonwang-db
Signed-off-by: Ruifeng Zheng
---
 python/pyspark/sql/dataframe.py | 62 ++++++++++++++++++++++++++++++---
 1 file changed, 58 insertions(+), 4 deletions(-)

diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 932c29910bb42..03aaee8f2ec05 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -2084,7 +2084,10 @@ def dtypes(self) -> List[Tuple[str, str]]:
 
     @property
     def columns(self) -> List[str]:
-        """Returns all column names as a list.
+        """
+        Retrieves the names of all columns in the :class:`DataFrame` as a list.
+
+        The order of the column names in the list reflects their order in the DataFrame.
 
         .. versionadded:: 1.3.0
 
@@ -2094,14 +2097,65 @@ def columns(self) -> List[str]:
         Returns
         -------
         list
-            List of column names.
+            List of column names in the DataFrame.
 
         Examples
         --------
+        Example 1: Retrieve column names of a DataFrame
+
         >>> df = spark.createDataFrame(
-        ...     [(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
+        ...     [(14, "Tom", "CA"), (23, "Alice", "NY"), (16, "Bob", "TX")],
+        ...     ["age", "name", "state"]
+        ... )
         >>> df.columns
-        ['age', 'name']
+        ['age', 'name', 'state']
+
+        Example 2: Using column names to project specific columns
+
+        >>> selected_cols = [col for col in df.columns if col != "age"]
+        >>> df.select(selected_cols).show()
+        +-----+-----+
+        | name|state|
+        +-----+-----+
+        |  Tom|   CA|
+        |Alice|   NY|
+        |  Bob|   TX|
+        +-----+-----+
+
+        Example 3: Checking if a specific column exists in a DataFrame
+
+        >>> "state" in df.columns
+        True
+        >>> "salary" in df.columns
+        False
+
+        Example 4: Iterating over columns to apply a transformation
+
+        >>> import pyspark.sql.functions as f
+        >>> for col_name in df.columns:
+        ...     df = df.withColumn(col_name, f.upper(f.col(col_name)))
+        >>> df.show()
+        +---+-----+-----+
+        |age| name|state|
+        +---+-----+-----+
+        | 14|  TOM|   CA|
+        | 23|ALICE|   NY|
+        | 16|  BOB|   TX|
+        +---+-----+-----+
+
+        Example 5: Renaming columns and checking the updated column names
+
+        >>> df = df.withColumnRenamed("name", "first_name")
+        >>> df.columns
+        ['age', 'first_name', 'state']
+
+        Example 6: Using the `columns` property to ensure two DataFrames have the
+        same columns before a union
+
+        >>> df2 = spark.createDataFrame(
+        ...     [(30, "Eve", "FL"), (40, "Sam", "WA")], ["age", "name", "location"])
+        >>> df.columns == df2.columns
+        False
         """
         return [f.name for f in self.schema.fields]
 