From 812a9ad3ea92b972fa8d966069ffd394b5c46ac1 Mon Sep 17 00:00:00 2001
From: Kaz
Date: Wed, 4 Dec 2024 17:41:07 +0100
Subject: [PATCH] [MINOR] Fix some typos

### What changes were proposed in this pull request?
This PR fixes typos in docstrings and comments (and a few functional bits of code).
The typos were found using the typos software: https://github.com/crate-ci/typos

I've made a typo-fix-PR before, but haven't seen the result on the website yet. Is there anything else I need to do for that?

### Why are the changes needed?
Nice to fix :)

### Does this PR introduce _any_ user-facing change?
yes, documentation was updated.

### How was this patch tested?
No tests added.

### Was this patch authored or co-authored using generative AI tooling?
No

Closes #48557 from KazMiddelhoek/master.

Lead-authored-by: Kaz
Co-authored-by: Kaz <62500382+KazMiddelhoek@users.noreply.github.com>
Signed-off-by: Max Gekk
---
 .github/workflows/update_build_status.yml                 |  2 +-
 R/pkg/R/functions.R                                       |  4 ++--
 R/pkg/R/serialize.R                                       |  2 +-
 .../spark/network/shuffle/RemoteBlockPushResolver.java    |  6 +++---
 connector/connect/docs/client-connection-string.md        |  4 ++--
 docs/_plugins/include_example.rb                          |  4 ++--
 docs/core-migration-guide.md                              |  2 +-
 docs/running-on-yarn.md                                   |  2 +-
 docs/security.md                                          |  2 +-
 docs/spark-standalone.md                                  |  2 +-
 docs/sql-ref-syntax-ddl-declare-variable.md               |  2 +-
 python/docs/source/reference/pyspark.ss/index.rst         |  2 +-
 python/pyspark/ml/connect/io_utils.py                     |  2 +-
 python/pyspark/ml/connect/tuning.py                       |  2 +-
 python/pyspark/ml/deepspeed/deepspeed_distributor.py      |  2 +-
 python/pyspark/ml/dl_util.py                              |  2 +-
 .../pyspark/ml/tests/connect/test_connect_function.py     |  2 +-
 python/pyspark/ml/tests/test_dl_util.py                   |  2 +-
 python/pyspark/ml/tests/test_functions.py                 |  4 ++--
 python/pyspark/ml/tests/test_param.py                     |  4 ++--
 python/pyspark/ml/torch/distributor.py                    |  4 ++--
 python/pyspark/pandas/accessors.py                        |  2 +-
 python/pyspark/pandas/base.py                             |  2 +-
 python/pyspark/pandas/frame.py                            |  4 ++--
 python/pyspark/sql/connect/plan.py                        |  2 +-
 python/pyspark/sql/dataframe.py                           |  4 ++--
 python/pyspark/sql/functions/builtin.py                   | 10 +++++-----
 python/pyspark/sql/readwriter.py                          |  2 +-
 .../pyspark/sql/tests/connect/test_connect_function.py    |  2 +-
 .../tests/streaming/test_streaming_foreach_batch.py       |  2 +-
 python/pyspark/sql/tests/test_udtf.py                     |  2 +-
 python/pyspark/sql/udtf.py                                |  2 +-
 python/pyspark/streaming/tests/test_dstream.py            |  2 +-
 python/pyspark/worker_util.py                             |  4 ++--
 .../connect/execution/ExecuteGrpcResponseSender.scala     |  2 +-
 35 files changed, 50 insertions(+), 50 deletions(-)

diff --git a/.github/workflows/update_build_status.yml b/.github/workflows/update_build_status.yml
index d0a50b2b4aa74..542fa567dea69 100644
--- a/.github/workflows/update_build_status.yml
+++ b/.github/workflows/update_build_status.yml
@@ -72,7 +72,7 @@ jobs:
             } catch (error) {
               console.error(error)
               // Run not found. This can happen when the PR author removes GitHub Actions runs or
-              // disalbes GitHub Actions.
+              // disables GitHub Actions.
               continue
             }
diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index 9c825a99be180..e320981783ecc 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -2922,7 +2922,7 @@ setClassUnion("characterOrstructTypeOrColumn", c("character", "structType", "Col
 #' @details
 #' \code{from_json}: Parses a column containing a JSON string into a Column of \code{structType}
 #' with the specified \code{schema} or array of \code{structType} if \code{as.json.array} is set
-#' to \code{TRUE}. If the string is unparseable, the Column will contain the value NA.
+#' to \code{TRUE}. If the string is unparsable, the Column will contain the value NA.
 #'
 #' @rdname column_collection_functions
 #' @param as.json.array indicating if input string is JSON array of objects or a single object.
@@ -3004,7 +3004,7 @@ setMethod("schema_of_json", signature(x = "characterOrColumn"),
 #' @details
 #' \code{from_csv}: Parses a column containing a CSV string into a Column of \code{structType}
 #' with the specified \code{schema}.
-#' If the string is unparseable, the Column will contain the value NA.
+#' If the string is unparsable, the Column will contain the value NA.
 #'
 #' @rdname column_collection_functions
 #' @aliases from_csv from_csv,Column,characterOrstructTypeOrColumn-method
diff --git a/R/pkg/R/serialize.R b/R/pkg/R/serialize.R
index 61e174de9ac56..4ccec991bb07b 100644
--- a/R/pkg/R/serialize.R
+++ b/R/pkg/R/serialize.R
@@ -60,7 +60,7 @@ writeObject <- function(con, object, writeType = TRUE) {
   if (type %in% c("integer", "character", "logical", "double", "numeric")) {
     if (is.na(object[[1]])) {
       # Uses the first element for now to keep the behavior same as R before
-      # 4.2.0. This is wrong because we should differenciate c(NA) from a
+      # 4.2.0. This is wrong because we should differentiate c(NA) from a
       # single NA as the former means array(null) and the latter means null
       # in Spark SQL. However, it requires non-trivial comparison to distinguish
       # both in R. We should ideally fix this.
diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/RemoteBlockPushResolver.java b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/RemoteBlockPushResolver.java
index 02a38eac5b409..6e9bd548f5327 100644
--- a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/RemoteBlockPushResolver.java
+++ b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/RemoteBlockPushResolver.java
@@ -251,17 +251,17 @@ AppShufflePartitionInfo getOrCreateAppShufflePartitionInfo(
           // Higher shuffleMergeId seen for the shuffle ID meaning new stage attempt is being
           // run for the shuffle ID. Close and clean up old shuffleMergeId files,
           // happens in the indeterminate stage retries
-          AppAttemptShuffleMergeId currrentAppAttemptShuffleMergeId =
+          AppAttemptShuffleMergeId currentAppAttemptShuffleMergeId =
             new AppAttemptShuffleMergeId(appShuffleInfo.appId, appShuffleInfo.attemptId,
               shuffleId, latestShuffleMergeId);
           logger.info("{}: creating a new shuffle merge metadata since received " +
             "shuffleMergeId {} is higher than latest shuffleMergeId {}",
             MDC.of(LogKeys.APP_ATTEMPT_SHUFFLE_MERGE_ID$.MODULE$,
-              currrentAppAttemptShuffleMergeId),
+              currentAppAttemptShuffleMergeId),
             MDC.of(LogKeys.SHUFFLE_MERGE_ID$.MODULE$, shuffleMergeId),
             MDC.of(LogKeys.LATEST_SHUFFLE_MERGE_ID$.MODULE$, latestShuffleMergeId));
           submitCleanupTask(() ->
-            closeAndDeleteOutdatedPartitions(currrentAppAttemptShuffleMergeId,
+            closeAndDeleteOutdatedPartitions(currentAppAttemptShuffleMergeId,
               mergePartitionsInfo.shuffleMergePartitions));
           return new AppShuffleMergePartitionsInfo(shuffleMergeId, false);
         } else {
diff --git a/connector/connect/docs/client-connection-string.md b/connector/connect/docs/client-connection-string.md
index 37b2956a5c44a..df371c5beaaac 100644
--- a/connector/connect/docs/client-connection-string.md
+++ b/connector/connect/docs/client-connection-string.md
@@ -2,7 +2,7 @@
 
 From the client perspective, Spark Connect mostly behaves as any other GRPC
 client and can be configured as such. However, to make it easy to use from
-different programming languages and to have a homogenous connection surface
+different programming languages and to have a homogeneous connection surface
 this document proposes what the user surface is for connecting to a Spark
 Connect endpoint.
 
@@ -136,7 +136,7 @@ server_url = "sc://myhost.com:443/;use_ssl=true;token=ABCDEFG"
 As mentioned above, Spark Connect uses a regular GRPC client and the server
 path cannot be configured to remain compatible with the GRPC standard and HTTP. For
-example the following examles are invalid.
+example the following examples are invalid.
 
 ```python
 server_url = "sc://myhost.com:443/mypathprefix/;token=AAAAAAA"
 ```
diff --git a/docs/_plugins/include_example.rb b/docs/_plugins/include_example.rb
index 7d0e78738095e..6fd14ce31a68c 100644
--- a/docs/_plugins/include_example.rb
+++ b/docs/_plugins/include_example.rb
@@ -114,8 +114,8 @@ def select_lines(code)
       range = Range.new(start + 1, endline - 1)
       trimmed = trim_codeblock(lines[range])
       # Filter out possible example tags of overlapped labels.
-      taggs_filtered = trimmed.select { |l| !l.include? '$example ' }
-      result += taggs_filtered.join
+      tags_filtered = trimmed.select { |l| !l.include? '$example ' }
+      result += tags_filtered.join
       result += "\n"
     end
     result
diff --git a/docs/core-migration-guide.md b/docs/core-migration-guide.md
index 88bad6c5d1b9f..958e442545dcd 100644
--- a/docs/core-migration-guide.md
+++ b/docs/core-migration-guide.md
@@ -62,7 +62,7 @@ license: |
 
 ## Upgrading from Core 3.3 to 3.4
 
-- Since Spark 3.4, Spark driver will own `PersistentVolumnClaim`s and try to reuse if they are not assigned to live executors. To restore the behavior before Spark 3.4, you can set `spark.kubernetes.driver.ownPersistentVolumeClaim` to `false` and `spark.kubernetes.driver.reusePersistentVolumeClaim` to `false`.
+- Since Spark 3.4, Spark driver will own `PersistentVolumeClaim`s and try to reuse if they are not assigned to live executors. To restore the behavior before Spark 3.4, you can set `spark.kubernetes.driver.ownPersistentVolumeClaim` to `false` and `spark.kubernetes.driver.reusePersistentVolumeClaim` to `false`.
 
 - Since Spark 3.4, Spark driver will track shuffle data when dynamic allocation is enabled without shuffle service. To restore the behavior before Spark 3.4, you can set `spark.dynamicAllocation.shuffleTracking.enabled` to `false`.
 
diff --git a/docs/running-on-yarn.md b/docs/running-on-yarn.md
index aefa979946a6c..b6f847ff533f5 100644
--- a/docs/running-on-yarn.md
+++ b/docs/running-on-yarn.md
@@ -673,7 +673,7 @@ To use a custom metrics.properties for the application master and executors, upd
   <td>false</td>
   <td>
     Set to true for applications that have higher security requirements and prefer that their
-    secret is not saved in the db. The shuffle data of such applications wll not be recovered after
+    secret is not saved in the db. The shuffle data of such applications will not be recovered after
     the External Shuffle Service restarts.
   </td>
   <td>3.5.0</td>
diff --git a/docs/security.md b/docs/security.md
index c7d3fd5f8c36f..81173d5f01ce7 100644
--- a/docs/security.md
+++ b/docs/security.md
@@ -72,7 +72,7 @@ secrets to be secure.
   <td>false</td>
   <td>
     Set to true for applications that have higher security requirements and prefer that their
-    secret is not saved in the db. The shuffle data of such applications wll not be recovered after
+    secret is not saved in the db. The shuffle data of such applications will not be recovered after
     the External Shuffle Service restarts.
   </td>
   <td>3.5.0</td>
diff --git a/docs/spark-standalone.md b/docs/spark-standalone.md
index d828436e77340..4f8e0dc1a3917 100644
--- a/docs/spark-standalone.md
+++ b/docs/spark-standalone.md
@@ -372,7 +372,7 @@ SPARK_MASTER_OPTS supports the following system properties:
   <td>
     The pattern for app ID generation based on Java `String.format` method.
    The default value is `app-%s-%04d` which represents the existing app id string, e.g.,
-    `app-20231031224509-0008`. Plesae be careful to generate unique IDs.
+    `app-20231031224509-0008`. Please be careful to generate unique IDs.
   </td>
   <td>4.0.0</td>
diff --git a/docs/sql-ref-syntax-ddl-declare-variable.md b/docs/sql-ref-syntax-ddl-declare-variable.md
index ba9857bf1917a..41ecba1364361 100644
--- a/docs/sql-ref-syntax-ddl-declare-variable.md
+++ b/docs/sql-ref-syntax-ddl-declare-variable.md
@@ -83,7 +83,7 @@ DECLARE OR REPLACE five = 55;
 -- Explicitly declare the default value of a variable using the keyword `DEFAULT`
 DECLARE VARIABLE size DEFAULT 6;
 
--- STRING variable initialialized to `NULL`
+-- STRING variable initialized to `NULL`
 DECLARE some_var STRING;
 ```
diff --git a/python/docs/source/reference/pyspark.ss/index.rst b/python/docs/source/reference/pyspark.ss/index.rst
index 2cb0b1216eff9..440228134fac9 100644
--- a/python/docs/source/reference/pyspark.ss/index.rst
+++ b/python/docs/source/reference/pyspark.ss/index.rst
@@ -20,7 +20,7 @@
 Structured Streaming
 ====================
 
-This page gives an overview of all public Structed Streaming API.
+This page gives an overview of all public Structured Streaming API.
 
 .. toctree::
    :maxdepth: 2
diff --git a/python/pyspark/ml/connect/io_utils.py b/python/pyspark/ml/connect/io_utils.py
index c401e3e76676a..8d93426915d42 100644
--- a/python/pyspark/ml/connect/io_utils.py
+++ b/python/pyspark/ml/connect/io_utils.py
@@ -74,7 +74,7 @@ class ParamsReadWrite(Params):
 
     def _get_extra_metadata(self) -> Any:
         """
-        Returns exta metadata of the instance
+        Returns extra metadata of the instance
         """
         return None
 
diff --git a/python/pyspark/ml/connect/tuning.py b/python/pyspark/ml/connect/tuning.py
index cdb606048a59a..190fc683acf7d 100644
--- a/python/pyspark/ml/connect/tuning.py
+++ b/python/pyspark/ml/connect/tuning.py
@@ -170,7 +170,7 @@ def _parallelFitTasks(
 
     if active_session is None:
         raise RuntimeError(
-            "An active SparkSession is required for running cross valiator fit tasks."
+            "An active SparkSession is required for running cross validator fit tasks."
        )
 
     def get_single_task(index: int, param_map: Any) -> Callable[[], Tuple[int, float]]:
diff --git a/python/pyspark/ml/deepspeed/deepspeed_distributor.py b/python/pyspark/ml/deepspeed/deepspeed_distributor.py
index 4ac5ff2fb4207..3fd1d3bb32463 100644
--- a/python/pyspark/ml/deepspeed/deepspeed_distributor.py
+++ b/python/pyspark/ml/deepspeed/deepspeed_distributor.py
@@ -49,7 +49,7 @@ def __init__(
         Parameters
         ----------
         numGpus: int
-            The number of GPUs to use per node (analagous to num_gpus in deepspeed command).
+            The number of GPUs to use per node (analogous to num_gpus in deepspeed command).
         nnodes: int
             The number of nodes that should be used for the run.
         localMode: bool
diff --git a/python/pyspark/ml/dl_util.py b/python/pyspark/ml/dl_util.py
index 8ead529d7b729..3b87049ef2777 100644
--- a/python/pyspark/ml/dl_util.py
+++ b/python/pyspark/ml/dl_util.py
@@ -27,7 +27,7 @@ class FunctionPickler:
     This class provides a way to pickle a function and its arguments.
     It also provides a way to create a script that can run a function with arguments if they
     have them pickled to a file.
-    It also provides a way of extracting the conents of a pickle file.
+    It also provides a way of extracting the contents of a pickle file.
     """
 
     @staticmethod
diff --git a/python/pyspark/ml/tests/connect/test_connect_function.py b/python/pyspark/ml/tests/connect/test_connect_function.py
index 393d38fdc426a..7d3a115ab0619 100644
--- a/python/pyspark/ml/tests/connect/test_connect_function.py
+++ b/python/pyspark/ml/tests/connect/test_connect_function.py
@@ -43,7 +43,7 @@ def setUpClass(cls):
         # Disable the shared namespace so pyspark.sql.functions, etc point the regular
         # PySpark libraries.
         os.environ["PYSPARK_NO_NAMESPACE_SHARE"] = "1"
-        cls.connect = cls.spark  # Switch Spark Connect session and regular PySpark sesion.
+        cls.connect = cls.spark  # Switch Spark Connect session and regular PySpark session.
         cls.spark = PySparkSession._instantiatedSession
         assert cls.spark is not None
 
diff --git a/python/pyspark/ml/tests/test_dl_util.py b/python/pyspark/ml/tests/test_dl_util.py
index e5e2c6bc191d8..c130cf1ff6b9d 100644
--- a/python/pyspark/ml/tests/test_dl_util.py
+++ b/python/pyspark/ml/tests/test_dl_util.py
@@ -137,7 +137,7 @@ def _are_two_files_identical(self, fpath1: str, fpath2: str) -> bool:
             "",
         ),
         (
-            "Check if it creates the correct file with only suffix + boddy",
+            "Check if it creates the correct file with only suffix + body",
             "",
             "print('goodbye')",
         ),
diff --git a/python/pyspark/ml/tests/test_functions.py b/python/pyspark/ml/tests/test_functions.py
index e67e46ded67bd..7719b2b27e0ab 100644
--- a/python/pyspark/ml/tests/test_functions.py
+++ b/python/pyspark/ml/tests/test_functions.py
@@ -265,14 +265,14 @@ def predict(a, b, c):
         with self.assertRaisesRegex(Exception, "Model expected 3 inputs, but received 4 columns"):
             preds = self.df.withColumn("preds", sum_cols(*columns)).toPandas()
 
-        # muliple scalar columns with one tensor_input_shape => single numpy array
+        # multiple scalar columns with one tensor_input_shape => single numpy array
         sum_cols = predict_batch_udf(
             array_sum_fn, return_type=DoubleType(), batch_size=5, input_tensor_shapes=[[4]]
         )
         preds = self.df.withColumn("preds", sum_cols(struct(*columns))).toPandas()
         self.assertTrue(np.array_equal(np.sum(self.data, axis=1), preds["preds"].to_numpy()))
 
-        # muliple scalar columns with wrong tensor_input_shape => ERROR
+        # multiple scalar columns with wrong tensor_input_shape => ERROR
         sum_cols = predict_batch_udf(
             array_sum_fn, return_type=DoubleType(), batch_size=5, input_tensor_shapes=[[3]]
         )
diff --git a/python/pyspark/ml/tests/test_param.py b/python/pyspark/ml/tests/test_param.py
index 8df50a5963e6b..0aa9827124954 100644
--- a/python/pyspark/ml/tests/test_param.py
+++ b/python/pyspark/ml/tests/test_param.py
@@ -368,12 +368,12 @@ def test_default_params_transferred(self):
         self.assertFalse(binarizer.isSet(binarizer.outputCol))
         self.assertEqual(result[0][0], 1.0)
 
-    def test_lr_evaluate_invaild_type(self):
+    def test_lr_evaluate_invalid_type(self):
         lr = LinearRegressionModel()
         invalid_type = ""
         self.assertRaises(TypeError, lr.evaluate, invalid_type)
 
-    def test_glr_evaluate_invaild_type(self):
+    def test_glr_evaluate_invalid_type(self):
         glr = GeneralizedLinearRegressionModel()
         invalid_type = ""
         self.assertRaises(TypeError, glr.evaluate, invalid_type)
diff --git a/python/pyspark/ml/torch/distributor.py b/python/pyspark/ml/torch/distributor.py
index 62a71c5a96af4..ef86f38b716b7 100644
--- a/python/pyspark/ml/torch/distributor.py
+++ b/python/pyspark/ml/torch/distributor.py
@@ -232,10 +232,10 @@ def _get_num_tasks(self) -> int:
 
     def _validate_input_params(self) -> None:
         if self.num_processes <= 0:
-            raise ValueError("num_proccesses has to be a positive integer")
+            raise ValueError("num_processes has to be a positive integer")
 
     def _check_encryption(self) -> None:
-        """Checks to see if the user requires encrpytion of data.
+        """Checks to see if the user requires encryption of data.
         If required, throw an exception since we don't support that.
 
         Raises
diff --git a/python/pyspark/pandas/accessors.py b/python/pyspark/pandas/accessors.py
index 4c36f7976af83..77757e4b60873 100644
--- a/python/pyspark/pandas/accessors.py
+++ b/python/pyspark/pandas/accessors.py
@@ -936,7 +936,7 @@ def _transform_batch(
 
         def pandas_concat(*series: pd.Series) -> pd.DataFrame:
             # The input can only be a DataFrame for struct from Spark 3.0.
-            # This works around makeing the input as a frame. See SPARK-27240
+            # This works around making the input as a frame. See SPARK-27240
             pdf = pd.concat(series, axis=1)
             pdf.columns = columns
             return pdf
diff --git a/python/pyspark/pandas/base.py b/python/pyspark/pandas/base.py
index bc54d8b9b17cb..01e23214d662d 100644
--- a/python/pyspark/pandas/base.py
+++ b/python/pyspark/pandas/base.py
@@ -1123,7 +1123,7 @@ def shift(
         Shift Series/Index by desired number of periods.
 
         .. note:: the current implementation of shift uses Spark's Window without
-            specifying partition specification. This leads to moveing all data into
+            specifying partition specification. This leads to moving all data into
             a single partition in a single machine and could cause serious
             performance degradation. Avoid this method with very large datasets.
 
diff --git a/python/pyspark/pandas/frame.py b/python/pyspark/pandas/frame.py
index 49aa49f65e35b..f315d59a4fe94 100644
--- a/python/pyspark/pandas/frame.py
+++ b/python/pyspark/pandas/frame.py
@@ -7686,7 +7686,7 @@ def _sort(
         if na_position not in ("first", "last"):
             raise ValueError("invalid na_position: '{}'".format(na_position))
 
-        # Mapper: Get a spark colum
+        # Mapper: Get a spark column
         # n function for (ascending, na_position) combination
         mapper = {
             (True, "first"): PySparkColumn.asc_nulls_first,
@@ -9808,7 +9808,7 @@ def describe(self, percentiles: Optional[List[float]] = None) -> "DataFrame":
 
         if is_all_string_type:
             # Handling string type columns
-            # We will retrive the `count`, `unique`, `top` and `freq`.
+            # We will retrieve the `count`, `unique`, `top` and `freq`.
             internal = self._internal.resolved_copy
             exprs_string = [
                 internal.spark_column_for(psser._column_label) for psser in psser_string
diff --git a/python/pyspark/sql/connect/plan.py b/python/pyspark/sql/connect/plan.py
index b387ca1d4e508..34f11768bcbc0 100644
--- a/python/pyspark/sql/connect/plan.py
+++ b/python/pyspark/sql/connect/plan.py
@@ -205,7 +205,7 @@ def _parameters_to_print(self, parameters: Mapping[str, Any]) -> Mapping[str, An
                 try:
                     params[name] = getattr(self, "_" + name)
                 except AttributeError:
-                    pass  # Simpy ignore
+                    pass  # Simply ignore
         return params
 
     def print(self, indent: int = 0) -> str:
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 085a1a629634a..0ea0eef50c0f3 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -2549,7 +2549,7 @@ def join(
         pyspark.errors.exceptions.captured.AnalysisException: Column name#0 are ambiguous...
 
         A better approach is to assign aliases to the dataframes, and then reference
-        the ouptut columns from the join operation using these aliases:
+        the output columns from the join operation using these aliases:
 
         >>> df.alias("a").join(
         ...     df.alias("b"), sf.col("a.name") == sf.col("b.name"), "outer"
@@ -3907,7 +3907,7 @@ def groupingSets(
         groupingSets : sequence of sequence of columns or str
             Individual set of columns to group on.
         cols : :class:`Column` or str
-            Addional grouping columns specified by users.
+            Additional grouping columns specified by users.
             Those columns are shown as the output columns after aggregation.
 
         Returns
diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py
index 391bc3db7a86f..4b4c164055eaf 100644
--- a/python/pyspark/sql/functions/builtin.py
+++ b/python/pyspark/sql/functions/builtin.py
@@ -15832,7 +15832,7 @@ def split_part(src: "ColumnOrName", delimiter: "ColumnOrName", partNum: "ColumnO
     Parameters
     ----------
     src : :class:`~pyspark.sql.Column` or column name
-        A column of string to be splited.
+        A column of string to be split.
     delimiter : :class:`~pyspark.sql.Column` or column name
         A column of string, the delimiter used for split.
     partNum : :class:`~pyspark.sql.Column` or column name
@@ -19618,7 +19618,7 @@ def from_json(
     """
     Parses a column containing a JSON string into a :class:`MapType` with
     :class:`StringType` as keys type, :class:`StructType` or :class:`ArrayType` with
-    the specified schema. Returns `null`, in the case of an unparseable string.
+    the specified schema. Returns `null`, in the case of an unparsable string.
 
     .. versionadded:: 2.1.0
 
@@ -20230,7 +20230,7 @@ def from_xml(
 ) -> Column:
     """
     Parses a column containing a XML string to a row with
-    the specified schema. Returns `null`, in the case of an unparseable string.
+    the specified schema. Returns `null`, in the case of an unparsable string.
 
     .. versionadded:: 4.0.0
 
@@ -22624,7 +22624,7 @@ def transform_keys(col: "ColumnOrName", f: Callable[[Column, Column], Column]) -
     Returns
     -------
     :class:`~pyspark.sql.Column`
-        a new map of enties where new keys were calculated by applying given function to
+        a new map of entries where new keys were calculated by applying given function to
         each key value argument.
 
     Examples
@@ -22664,7 +22664,7 @@ def transform_values(col: "ColumnOrName", f: Callable[[Column, Column], Column])
     Returns
     -------
     :class:`~pyspark.sql.Column`
-        a new map of enties where new values were calculated by applying given function to
+        a new map of entries where new values were calculated by applying given function to
         each key value argument.
 
     Examples
diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
index 4744bdf861d37..2113f0707f910 100644
--- a/python/pyspark/sql/readwriter.py
+++ b/python/pyspark/sql/readwriter.py
@@ -215,7 +215,7 @@ def options(self, **options: "OptionalPrimitiveType") -> "DataFrameReader":
         Parameters
         ----------
         **options : dict
-            The dictionary of string keys and prmitive-type values.
+            The dictionary of string keys and primitive-type values.
 
         Examples
         --------
diff --git a/python/pyspark/sql/tests/connect/test_connect_function.py b/python/pyspark/sql/tests/connect/test_connect_function.py
index e29873173cc3a..b7a02efcd5e2b 100644
--- a/python/pyspark/sql/tests/connect/test_connect_function.py
+++ b/python/pyspark/sql/tests/connect/test_connect_function.py
@@ -54,7 +54,7 @@ def setUpClass(cls):
         # Disable the shared namespace so pyspark.sql.functions, etc point the regular
         # PySpark libraries.
         os.environ["PYSPARK_NO_NAMESPACE_SHARE"] = "1"
-        cls.connect = cls.spark  # Switch Spark Connect session and regular PySpark sesion.
+        cls.connect = cls.spark  # Switch Spark Connect session and regular PySpark session.
         cls.spark = PySparkSession._instantiatedSession
         assert cls.spark is not None
 
diff --git a/python/pyspark/sql/tests/streaming/test_streaming_foreach_batch.py b/python/pyspark/sql/tests/streaming/test_streaming_foreach_batch.py
index de8f30baebca5..9db66aa252ee6 100644
--- a/python/pyspark/sql/tests/streaming/test_streaming_foreach_batch.py
+++ b/python/pyspark/sql/tests/streaming/test_streaming_foreach_batch.py
@@ -146,7 +146,7 @@ def func(df: DataFrame, batch_id: int):
     def my_test_function_2():
         return 2
 
-    def test_streaming_foreach_batch_fuction_calling(self):
+    def test_streaming_foreach_batch_function_calling(self):
         def my_test_function_3():
             return 3
 
diff --git a/python/pyspark/sql/tests/test_udtf.py b/python/pyspark/sql/tests/test_udtf.py
index 206cfd7dc4885..8447edfbbb15d 100644
--- a/python/pyspark/sql/tests/test_udtf.py
+++ b/python/pyspark/sql/tests/test_udtf.py
@@ -1345,7 +1345,7 @@ def eval(self, a, b):
         assertSchemaEqual(df.schema, expected_schema)
         assertDataFrameEqual(df, expected_results)
 
-    def test_udtf_with_analyze_arbitary_number_arguments(self):
+    def test_udtf_with_analyze_arbitrary_number_arguments(self):
         class TestUDTF:
             @staticmethod
             def analyze(*args: AnalyzeArgument) -> AnalyzeResult:
diff --git a/python/pyspark/sql/udtf.py b/python/pyspark/sql/udtf.py
index 5ce3e2dfd2a9e..cf4f976fd93b3 100644
--- a/python/pyspark/sql/udtf.py
+++ b/python/pyspark/sql/udtf.py
@@ -148,7 +148,7 @@ class AnalyzeResult:
         The schema that the Python UDTF will return.
     withSinglePartition: bool
         If true, the UDTF is specifying for Catalyst to repartition all rows of the input TABLE
-        argument to one collection for consumption by exactly one instance of the correpsonding
+        argument to one collection for consumption by exactly one instance of the corresponding
         UDTF class.
     partitionBy: sequence of :class:`PartitioningColumn`
         If non-empty, this is a sequence of expressions that the UDTF is specifying for Catalyst to
diff --git a/python/pyspark/streaming/tests/test_dstream.py b/python/pyspark/streaming/tests/test_dstream.py
index 046247763c0b3..4c9633db311a6 100644
--- a/python/pyspark/streaming/tests/test_dstream.py
+++ b/python/pyspark/streaming/tests/test_dstream.py
@@ -403,7 +403,7 @@ def failed_func(rdd1, rdd2):
 
         self.fail("a failed func should throw an error")
 
-    def test_failed_func_with_reseting_failure(self):
+    def test_failed_func_with_resetting_failure(self):
         input = [self.sc.parallelize([d], 1) for d in range(4)]
         input_stream = self.ssc.queueStream(input)
 
diff --git a/python/pyspark/worker_util.py b/python/pyspark/worker_util.py
index 81c05ce94eb65..5c758d3f83fe6 100644
--- a/python/pyspark/worker_util.py
+++ b/python/pyspark/worker_util.py
@@ -107,8 +107,8 @@ def setup_memory_limits(memory_limit_mb: int) -> None:
 
     except (resource.error, OSError, ValueError) as e:
         # not all systems support resource limits, so warn instead of failing
-        curent = currentframe()
-        lineno = getframeinfo(curent).lineno + 1 if curent is not None else 0
+        current = currentframe()
+        lineno = getframeinfo(current).lineno + 1 if current is not None else 0
         if "__file__" in globals():
             print(
                 warnings.formatwarning(
diff --git a/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/ExecuteGrpcResponseSender.scala b/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/ExecuteGrpcResponseSender.scala
index 051093fcad277..44b634af95ca9 100644
--- a/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/ExecuteGrpcResponseSender.scala
+++ b/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/ExecuteGrpcResponseSender.scala
@@ -291,7 +291,7 @@ private[connect] class ExecuteGrpcResponseSender[T <: Message](
           assert(finished == false)
         } else {
           // If it wasn't sent, time deadline must have been reached before stream became available,
-          // or it was intterupted. Will exit in the next loop iterattion.
+          // or it was interrupted. Will exit in the next loop iterattion.
           assert(deadlineLimitReached || interrupted)
         }
       } else if (streamFinished) {