diff --git a/.github/workflows/update_build_status.yml b/.github/workflows/update_build_status.yml
index d0a50b2b4aa74..542fa567dea69 100644
--- a/.github/workflows/update_build_status.yml
+++ b/.github/workflows/update_build_status.yml
@@ -72,7 +72,7 @@ jobs:
} catch (error) {
console.error(error)
// Run not found. This can happen when the PR author removes GitHub Actions runs or
- // disalbes GitHub Actions.
+ // disables GitHub Actions.
continue
}
diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index 9c825a99be180..e320981783ecc 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -2922,7 +2922,7 @@ setClassUnion("characterOrstructTypeOrColumn", c("character", "structType", "Col
#' @details
#' \code{from_json}: Parses a column containing a JSON string into a Column of \code{structType}
#' with the specified \code{schema} or array of \code{structType} if \code{as.json.array} is set
-#' to \code{TRUE}. If the string is unparseable, the Column will contain the value NA.
+#' to \code{TRUE}. If the string is unparsable, the Column will contain the value NA.
#'
#' @rdname column_collection_functions
#' @param as.json.array indicating if input string is JSON array of objects or a single object.
@@ -3004,7 +3004,7 @@ setMethod("schema_of_json", signature(x = "characterOrColumn"),
#' @details
#' \code{from_csv}: Parses a column containing a CSV string into a Column of \code{structType}
#' with the specified \code{schema}.
-#' If the string is unparseable, the Column will contain the value NA.
+#' If the string is unparsable, the Column will contain the value NA.
#'
#' @rdname column_collection_functions
#' @aliases from_csv from_csv,Column,characterOrstructTypeOrColumn-method
diff --git a/R/pkg/R/serialize.R b/R/pkg/R/serialize.R
index 61e174de9ac56..4ccec991bb07b 100644
--- a/R/pkg/R/serialize.R
+++ b/R/pkg/R/serialize.R
@@ -60,7 +60,7 @@ writeObject <- function(con, object, writeType = TRUE) {
if (type %in% c("integer", "character", "logical", "double", "numeric")) {
if (is.na(object[[1]])) {
# Uses the first element for now to keep the behavior same as R before
- # 4.2.0. This is wrong because we should differenciate c(NA) from a
+ # 4.2.0. This is wrong because we should differentiate c(NA) from a
# single NA as the former means array(null) and the latter means null
# in Spark SQL. However, it requires non-trivial comparison to distinguish
# both in R. We should ideally fix this.
diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/RemoteBlockPushResolver.java b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/RemoteBlockPushResolver.java
index 02a38eac5b409..6e9bd548f5327 100644
--- a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/RemoteBlockPushResolver.java
+++ b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/RemoteBlockPushResolver.java
@@ -251,17 +251,17 @@ AppShufflePartitionInfo getOrCreateAppShufflePartitionInfo(
// Higher shuffleMergeId seen for the shuffle ID meaning new stage attempt is being
// run for the shuffle ID. Close and clean up old shuffleMergeId files,
// happens in the indeterminate stage retries
- AppAttemptShuffleMergeId currrentAppAttemptShuffleMergeId =
+ AppAttemptShuffleMergeId currentAppAttemptShuffleMergeId =
new AppAttemptShuffleMergeId(appShuffleInfo.appId, appShuffleInfo.attemptId,
shuffleId, latestShuffleMergeId);
logger.info("{}: creating a new shuffle merge metadata since received " +
"shuffleMergeId {} is higher than latest shuffleMergeId {}",
MDC.of(LogKeys.APP_ATTEMPT_SHUFFLE_MERGE_ID$.MODULE$,
- currrentAppAttemptShuffleMergeId),
+ currentAppAttemptShuffleMergeId),
MDC.of(LogKeys.SHUFFLE_MERGE_ID$.MODULE$, shuffleMergeId),
MDC.of(LogKeys.LATEST_SHUFFLE_MERGE_ID$.MODULE$, latestShuffleMergeId));
submitCleanupTask(() ->
- closeAndDeleteOutdatedPartitions(currrentAppAttemptShuffleMergeId,
+ closeAndDeleteOutdatedPartitions(currentAppAttemptShuffleMergeId,
mergePartitionsInfo.shuffleMergePartitions));
return new AppShuffleMergePartitionsInfo(shuffleMergeId, false);
} else {
diff --git a/connector/connect/docs/client-connection-string.md b/connector/connect/docs/client-connection-string.md
index 37b2956a5c44a..df371c5beaaac 100644
--- a/connector/connect/docs/client-connection-string.md
+++ b/connector/connect/docs/client-connection-string.md
@@ -2,7 +2,7 @@
From the client perspective, Spark Connect mostly behaves as any other GRPC
client and can be configured as such. However, to make it easy to use from
-different programming languages and to have a homogenous connection surface
+different programming languages and to have a homogeneous connection surface
this document proposes what the user surface is for connecting to a
Spark Connect endpoint.
@@ -136,7 +136,7 @@ server_url = "sc://myhost.com:443/;use_ssl=true;token=ABCDEFG"
As mentioned above, Spark Connect uses a regular GRPC client and the server path
cannot be configured to remain compatible with the GRPC standard and HTTP. For
-example the following examles are invalid.
+example the following examples are invalid.
```python
server_url = "sc://myhost.com:443/mypathprefix/;token=AAAAAAA"
diff --git a/docs/_plugins/include_example.rb b/docs/_plugins/include_example.rb
index 7d0e78738095e..6fd14ce31a68c 100644
--- a/docs/_plugins/include_example.rb
+++ b/docs/_plugins/include_example.rb
@@ -114,8 +114,8 @@ def select_lines(code)
range = Range.new(start + 1, endline - 1)
trimmed = trim_codeblock(lines[range])
# Filter out possible example tags of overlapped labels.
- taggs_filtered = trimmed.select { |l| !l.include? '$example ' }
- result += taggs_filtered.join
+ tags_filtered = trimmed.select { |l| !l.include? '$example ' }
+ result += tags_filtered.join
result += "\n"
end
result
diff --git a/docs/core-migration-guide.md b/docs/core-migration-guide.md
index 88bad6c5d1b9f..958e442545dcd 100644
--- a/docs/core-migration-guide.md
+++ b/docs/core-migration-guide.md
@@ -62,7 +62,7 @@ license: |
## Upgrading from Core 3.3 to 3.4
-- Since Spark 3.4, Spark driver will own `PersistentVolumnClaim`s and try to reuse if they are not assigned to live executors. To restore the behavior before Spark 3.4, you can set `spark.kubernetes.driver.ownPersistentVolumeClaim` to `false` and `spark.kubernetes.driver.reusePersistentVolumeClaim` to `false`.
+- Since Spark 3.4, Spark driver will own `PersistentVolumeClaim`s and try to reuse them if they are not assigned to live executors. To restore the behavior before Spark 3.4, you can set `spark.kubernetes.driver.ownPersistentVolumeClaim` to `false` and `spark.kubernetes.driver.reusePersistentVolumeClaim` to `false`.
- Since Spark 3.4, Spark driver will track shuffle data when dynamic allocation is enabled without shuffle service. To restore the behavior before Spark 3.4, you can set `spark.dynamicAllocation.shuffleTracking.enabled` to `false`.
diff --git a/docs/running-on-yarn.md b/docs/running-on-yarn.md
index aefa979946a6c..b6f847ff533f5 100644
--- a/docs/running-on-yarn.md
+++ b/docs/running-on-yarn.md
@@ -673,7 +673,7 @@ To use a custom metrics.properties for the application master and executors, upd
false |
Set to true for applications that have higher security requirements and prefer that their
- secret is not saved in the db. The shuffle data of such applications wll not be recovered after
+ secret is not saved in the db. The shuffle data of such applications will not be recovered after
the External Shuffle Service restarts.
|
3.5.0 |
diff --git a/docs/security.md b/docs/security.md
index c7d3fd5f8c36f..81173d5f01ce7 100644
--- a/docs/security.md
+++ b/docs/security.md
@@ -72,7 +72,7 @@ secrets to be secure.
false |
Set to true for applications that have higher security requirements and prefer that their
- secret is not saved in the db. The shuffle data of such applications wll not be recovered after
+ secret is not saved in the db. The shuffle data of such applications will not be recovered after
the External Shuffle Service restarts.
|
3.5.0 |
diff --git a/docs/spark-standalone.md b/docs/spark-standalone.md
index d828436e77340..4f8e0dc1a3917 100644
--- a/docs/spark-standalone.md
+++ b/docs/spark-standalone.md
@@ -372,7 +372,7 @@ SPARK_MASTER_OPTS supports the following system properties:
The pattern for app ID generation based on Java `String.format` method.
The default value is `app-%s-%04d` which represents the existing app id string, e.g.,
- `app-20231031224509-0008`. Plesae be careful to generate unique IDs.
+ `app-20231031224509-0008`. Please be careful to generate unique IDs.
|
4.0.0 |
diff --git a/docs/sql-ref-syntax-ddl-declare-variable.md b/docs/sql-ref-syntax-ddl-declare-variable.md
index ba9857bf1917a..41ecba1364361 100644
--- a/docs/sql-ref-syntax-ddl-declare-variable.md
+++ b/docs/sql-ref-syntax-ddl-declare-variable.md
@@ -83,7 +83,7 @@ DECLARE OR REPLACE five = 55;
-- Explicitly declare the default value of a variable using the keyword `DEFAULT`
DECLARE VARIABLE size DEFAULT 6;
--- STRING variable initialialized to `NULL`
+-- STRING variable initialized to `NULL`
DECLARE some_var STRING;
```
diff --git a/python/docs/source/reference/pyspark.ss/index.rst b/python/docs/source/reference/pyspark.ss/index.rst
index 2cb0b1216eff9..440228134fac9 100644
--- a/python/docs/source/reference/pyspark.ss/index.rst
+++ b/python/docs/source/reference/pyspark.ss/index.rst
@@ -20,7 +20,7 @@
Structured Streaming
====================
-This page gives an overview of all public Structed Streaming API.
+This page gives an overview of all public Structured Streaming APIs.
.. toctree::
:maxdepth: 2
diff --git a/python/pyspark/ml/connect/io_utils.py b/python/pyspark/ml/connect/io_utils.py
index c401e3e76676a..8d93426915d42 100644
--- a/python/pyspark/ml/connect/io_utils.py
+++ b/python/pyspark/ml/connect/io_utils.py
@@ -74,7 +74,7 @@ class ParamsReadWrite(Params):
def _get_extra_metadata(self) -> Any:
"""
- Returns exta metadata of the instance
+ Returns extra metadata of the instance
"""
return None
diff --git a/python/pyspark/ml/connect/tuning.py b/python/pyspark/ml/connect/tuning.py
index cdb606048a59a..190fc683acf7d 100644
--- a/python/pyspark/ml/connect/tuning.py
+++ b/python/pyspark/ml/connect/tuning.py
@@ -170,7 +170,7 @@ def _parallelFitTasks(
if active_session is None:
raise RuntimeError(
- "An active SparkSession is required for running cross valiator fit tasks."
+ "An active SparkSession is required for running cross validator fit tasks."
)
def get_single_task(index: int, param_map: Any) -> Callable[[], Tuple[int, float]]:
diff --git a/python/pyspark/ml/deepspeed/deepspeed_distributor.py b/python/pyspark/ml/deepspeed/deepspeed_distributor.py
index 4ac5ff2fb4207..3fd1d3bb32463 100644
--- a/python/pyspark/ml/deepspeed/deepspeed_distributor.py
+++ b/python/pyspark/ml/deepspeed/deepspeed_distributor.py
@@ -49,7 +49,7 @@ def __init__(
Parameters
----------
numGpus: int
- The number of GPUs to use per node (analagous to num_gpus in deepspeed command).
+ The number of GPUs to use per node (analogous to num_gpus in deepspeed command).
nnodes: int
The number of nodes that should be used for the run.
localMode: bool
diff --git a/python/pyspark/ml/dl_util.py b/python/pyspark/ml/dl_util.py
index 8ead529d7b729..3b87049ef2777 100644
--- a/python/pyspark/ml/dl_util.py
+++ b/python/pyspark/ml/dl_util.py
@@ -27,7 +27,7 @@ class FunctionPickler:
This class provides a way to pickle a function and its arguments.
It also provides a way to create a script that can run a
function with arguments if they have them pickled to a file.
- It also provides a way of extracting the conents of a pickle file.
+ It also provides a way of extracting the contents of a pickle file.
"""
@staticmethod
diff --git a/python/pyspark/ml/tests/connect/test_connect_function.py b/python/pyspark/ml/tests/connect/test_connect_function.py
index 393d38fdc426a..7d3a115ab0619 100644
--- a/python/pyspark/ml/tests/connect/test_connect_function.py
+++ b/python/pyspark/ml/tests/connect/test_connect_function.py
@@ -43,7 +43,7 @@ def setUpClass(cls):
# Disable the shared namespace so pyspark.sql.functions, etc point the regular
# PySpark libraries.
os.environ["PYSPARK_NO_NAMESPACE_SHARE"] = "1"
- cls.connect = cls.spark # Switch Spark Connect session and regular PySpark sesion.
+ cls.connect = cls.spark # Switch Spark Connect session and regular PySpark session.
cls.spark = PySparkSession._instantiatedSession
assert cls.spark is not None
diff --git a/python/pyspark/ml/tests/test_dl_util.py b/python/pyspark/ml/tests/test_dl_util.py
index e5e2c6bc191d8..c130cf1ff6b9d 100644
--- a/python/pyspark/ml/tests/test_dl_util.py
+++ b/python/pyspark/ml/tests/test_dl_util.py
@@ -137,7 +137,7 @@ def _are_two_files_identical(self, fpath1: str, fpath2: str) -> bool:
"",
),
(
- "Check if it creates the correct file with only suffix + boddy",
+ "Check if it creates the correct file with only suffix + body",
"",
"print('goodbye')",
),
diff --git a/python/pyspark/ml/tests/test_functions.py b/python/pyspark/ml/tests/test_functions.py
index e67e46ded67bd..7719b2b27e0ab 100644
--- a/python/pyspark/ml/tests/test_functions.py
+++ b/python/pyspark/ml/tests/test_functions.py
@@ -265,14 +265,14 @@ def predict(a, b, c):
with self.assertRaisesRegex(Exception, "Model expected 3 inputs, but received 4 columns"):
preds = self.df.withColumn("preds", sum_cols(*columns)).toPandas()
- # muliple scalar columns with one tensor_input_shape => single numpy array
+ # multiple scalar columns with one tensor_input_shape => single numpy array
sum_cols = predict_batch_udf(
array_sum_fn, return_type=DoubleType(), batch_size=5, input_tensor_shapes=[[4]]
)
preds = self.df.withColumn("preds", sum_cols(struct(*columns))).toPandas()
self.assertTrue(np.array_equal(np.sum(self.data, axis=1), preds["preds"].to_numpy()))
- # muliple scalar columns with wrong tensor_input_shape => ERROR
+ # multiple scalar columns with wrong tensor_input_shape => ERROR
sum_cols = predict_batch_udf(
array_sum_fn, return_type=DoubleType(), batch_size=5, input_tensor_shapes=[[3]]
)
diff --git a/python/pyspark/ml/tests/test_param.py b/python/pyspark/ml/tests/test_param.py
index 8df50a5963e6b..0aa9827124954 100644
--- a/python/pyspark/ml/tests/test_param.py
+++ b/python/pyspark/ml/tests/test_param.py
@@ -368,12 +368,12 @@ def test_default_params_transferred(self):
self.assertFalse(binarizer.isSet(binarizer.outputCol))
self.assertEqual(result[0][0], 1.0)
- def test_lr_evaluate_invaild_type(self):
+ def test_lr_evaluate_invalid_type(self):
lr = LinearRegressionModel()
invalid_type = ""
self.assertRaises(TypeError, lr.evaluate, invalid_type)
- def test_glr_evaluate_invaild_type(self):
+ def test_glr_evaluate_invalid_type(self):
glr = GeneralizedLinearRegressionModel()
invalid_type = ""
self.assertRaises(TypeError, glr.evaluate, invalid_type)
diff --git a/python/pyspark/ml/torch/distributor.py b/python/pyspark/ml/torch/distributor.py
index 62a71c5a96af4..ef86f38b716b7 100644
--- a/python/pyspark/ml/torch/distributor.py
+++ b/python/pyspark/ml/torch/distributor.py
@@ -232,10 +232,10 @@ def _get_num_tasks(self) -> int:
def _validate_input_params(self) -> None:
if self.num_processes <= 0:
- raise ValueError("num_proccesses has to be a positive integer")
+ raise ValueError("num_processes has to be a positive integer")
def _check_encryption(self) -> None:
- """Checks to see if the user requires encrpytion of data.
+ """Checks to see if the user requires encryption of data.
If required, throw an exception since we don't support that.
Raises
diff --git a/python/pyspark/pandas/accessors.py b/python/pyspark/pandas/accessors.py
index 4c36f7976af83..77757e4b60873 100644
--- a/python/pyspark/pandas/accessors.py
+++ b/python/pyspark/pandas/accessors.py
@@ -936,7 +936,7 @@ def _transform_batch(
def pandas_concat(*series: pd.Series) -> pd.DataFrame:
# The input can only be a DataFrame for struct from Spark 3.0.
- # This works around makeing the input as a frame. See SPARK-27240
+ # This works around it by making the input a frame. See SPARK-27240
pdf = pd.concat(series, axis=1)
pdf.columns = columns
return pdf
diff --git a/python/pyspark/pandas/base.py b/python/pyspark/pandas/base.py
index bc54d8b9b17cb..01e23214d662d 100644
--- a/python/pyspark/pandas/base.py
+++ b/python/pyspark/pandas/base.py
@@ -1123,7 +1123,7 @@ def shift(
Shift Series/Index by desired number of periods.
.. note:: the current implementation of shift uses Spark's Window without
- specifying partition specification. This leads to moveing all data into
+ specifying partition specification. This leads to moving all data into
a single partition in a single machine and could cause serious
performance degradation. Avoid this method with very large datasets.
diff --git a/python/pyspark/pandas/frame.py b/python/pyspark/pandas/frame.py
index 49aa49f65e35b..f315d59a4fe94 100644
--- a/python/pyspark/pandas/frame.py
+++ b/python/pyspark/pandas/frame.py
@@ -7686,7 +7686,7 @@ def _sort(
if na_position not in ("first", "last"):
raise ValueError("invalid na_position: '{}'".format(na_position))
- # Mapper: Get a spark colum
- # n function for (ascending, na_position) combination
+ # Mapper: Get a spark column function
+ # for (ascending, na_position) combination
mapper = {
(True, "first"): PySparkColumn.asc_nulls_first,
@@ -9808,7 +9808,7 @@ def describe(self, percentiles: Optional[List[float]] = None) -> "DataFrame":
if is_all_string_type:
# Handling string type columns
- # We will retrive the `count`, `unique`, `top` and `freq`.
+ # We will retrieve the `count`, `unique`, `top` and `freq`.
internal = self._internal.resolved_copy
exprs_string = [
internal.spark_column_for(psser._column_label) for psser in psser_string
diff --git a/python/pyspark/sql/connect/plan.py b/python/pyspark/sql/connect/plan.py
index b387ca1d4e508..34f11768bcbc0 100644
--- a/python/pyspark/sql/connect/plan.py
+++ b/python/pyspark/sql/connect/plan.py
@@ -205,7 +205,7 @@ def _parameters_to_print(self, parameters: Mapping[str, Any]) -> Mapping[str, An
try:
params[name] = getattr(self, "_" + name)
except AttributeError:
- pass # Simpy ignore
+ pass # Simply ignore
return params
def print(self, indent: int = 0) -> str:
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 085a1a629634a..0ea0eef50c0f3 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -2549,7 +2549,7 @@ def join(
pyspark.errors.exceptions.captured.AnalysisException: Column name#0 are ambiguous...
A better approach is to assign aliases to the dataframes, and then reference
- the ouptut columns from the join operation using these aliases:
+ the output columns from the join operation using these aliases:
>>> df.alias("a").join(
... df.alias("b"), sf.col("a.name") == sf.col("b.name"), "outer"
@@ -3907,7 +3907,7 @@ def groupingSets(
groupingSets : sequence of sequence of columns or str
Individual set of columns to group on.
cols : :class:`Column` or str
- Addional grouping columns specified by users.
+ Additional grouping columns specified by users.
Those columns are shown as the output columns after aggregation.
Returns
diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py
index 391bc3db7a86f..4b4c164055eaf 100644
--- a/python/pyspark/sql/functions/builtin.py
+++ b/python/pyspark/sql/functions/builtin.py
@@ -15832,7 +15832,7 @@ def split_part(src: "ColumnOrName", delimiter: "ColumnOrName", partNum: "ColumnO
Parameters
----------
src : :class:`~pyspark.sql.Column` or column name
- A column of string to be splited.
+ A column of string to be split.
delimiter : :class:`~pyspark.sql.Column` or column name
A column of string, the delimiter used for split.
partNum : :class:`~pyspark.sql.Column` or column name
@@ -19618,7 +19618,7 @@ def from_json(
"""
Parses a column containing a JSON string into a :class:`MapType` with :class:`StringType`
as keys type, :class:`StructType` or :class:`ArrayType` with
- the specified schema. Returns `null`, in the case of an unparseable string.
+ the specified schema. Returns `null`, in the case of an unparsable string.
.. versionadded:: 2.1.0
@@ -20230,7 +20230,7 @@ def from_xml(
) -> Column:
"""
Parses a column containing a XML string to a row with
- the specified schema. Returns `null`, in the case of an unparseable string.
+ the specified schema. Returns `null`, in the case of an unparsable string.
.. versionadded:: 4.0.0
@@ -22624,7 +22624,7 @@ def transform_keys(col: "ColumnOrName", f: Callable[[Column, Column], Column]) -
Returns
-------
:class:`~pyspark.sql.Column`
- a new map of enties where new keys were calculated by applying given function to
+ a new map of entries where new keys were calculated by applying the given function to
each key value argument.
Examples
@@ -22664,7 +22664,7 @@ def transform_values(col: "ColumnOrName", f: Callable[[Column, Column], Column])
Returns
-------
:class:`~pyspark.sql.Column`
- a new map of enties where new values were calculated by applying given function to
+ a new map of entries where new values were calculated by applying the given function to
each key value argument.
Examples
diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
index 4744bdf861d37..2113f0707f910 100644
--- a/python/pyspark/sql/readwriter.py
+++ b/python/pyspark/sql/readwriter.py
@@ -215,7 +215,7 @@ def options(self, **options: "OptionalPrimitiveType") -> "DataFrameReader":
Parameters
----------
**options : dict
- The dictionary of string keys and prmitive-type values.
+ The dictionary of string keys and primitive-type values.
Examples
--------
diff --git a/python/pyspark/sql/tests/connect/test_connect_function.py b/python/pyspark/sql/tests/connect/test_connect_function.py
index e29873173cc3a..b7a02efcd5e2b 100644
--- a/python/pyspark/sql/tests/connect/test_connect_function.py
+++ b/python/pyspark/sql/tests/connect/test_connect_function.py
@@ -54,7 +54,7 @@ def setUpClass(cls):
# Disable the shared namespace so pyspark.sql.functions, etc point the regular
# PySpark libraries.
os.environ["PYSPARK_NO_NAMESPACE_SHARE"] = "1"
- cls.connect = cls.spark # Switch Spark Connect session and regular PySpark sesion.
+ cls.connect = cls.spark # Switch Spark Connect session and regular PySpark session.
cls.spark = PySparkSession._instantiatedSession
assert cls.spark is not None
diff --git a/python/pyspark/sql/tests/streaming/test_streaming_foreach_batch.py b/python/pyspark/sql/tests/streaming/test_streaming_foreach_batch.py
index de8f30baebca5..9db66aa252ee6 100644
--- a/python/pyspark/sql/tests/streaming/test_streaming_foreach_batch.py
+++ b/python/pyspark/sql/tests/streaming/test_streaming_foreach_batch.py
@@ -146,7 +146,7 @@ def func(df: DataFrame, batch_id: int):
def my_test_function_2():
return 2
- def test_streaming_foreach_batch_fuction_calling(self):
+ def test_streaming_foreach_batch_function_calling(self):
def my_test_function_3():
return 3
diff --git a/python/pyspark/sql/tests/test_udtf.py b/python/pyspark/sql/tests/test_udtf.py
index 206cfd7dc4885..8447edfbbb15d 100644
--- a/python/pyspark/sql/tests/test_udtf.py
+++ b/python/pyspark/sql/tests/test_udtf.py
@@ -1345,7 +1345,7 @@ def eval(self, a, b):
assertSchemaEqual(df.schema, expected_schema)
assertDataFrameEqual(df, expected_results)
- def test_udtf_with_analyze_arbitary_number_arguments(self):
+ def test_udtf_with_analyze_arbitrary_number_arguments(self):
class TestUDTF:
@staticmethod
def analyze(*args: AnalyzeArgument) -> AnalyzeResult:
diff --git a/python/pyspark/sql/udtf.py b/python/pyspark/sql/udtf.py
index 5ce3e2dfd2a9e..cf4f976fd93b3 100644
--- a/python/pyspark/sql/udtf.py
+++ b/python/pyspark/sql/udtf.py
@@ -148,7 +148,7 @@ class AnalyzeResult:
The schema that the Python UDTF will return.
withSinglePartition: bool
If true, the UDTF is specifying for Catalyst to repartition all rows of the input TABLE
- argument to one collection for consumption by exactly one instance of the correpsonding
+ argument to one collection for consumption by exactly one instance of the corresponding
UDTF class.
partitionBy: sequence of :class:`PartitioningColumn`
If non-empty, this is a sequence of expressions that the UDTF is specifying for Catalyst to
diff --git a/python/pyspark/streaming/tests/test_dstream.py b/python/pyspark/streaming/tests/test_dstream.py
index 046247763c0b3..4c9633db311a6 100644
--- a/python/pyspark/streaming/tests/test_dstream.py
+++ b/python/pyspark/streaming/tests/test_dstream.py
@@ -403,7 +403,7 @@ def failed_func(rdd1, rdd2):
self.fail("a failed func should throw an error")
- def test_failed_func_with_reseting_failure(self):
+ def test_failed_func_with_resetting_failure(self):
input = [self.sc.parallelize([d], 1) for d in range(4)]
input_stream = self.ssc.queueStream(input)
diff --git a/python/pyspark/worker_util.py b/python/pyspark/worker_util.py
index 81c05ce94eb65..5c758d3f83fe6 100644
--- a/python/pyspark/worker_util.py
+++ b/python/pyspark/worker_util.py
@@ -107,8 +107,8 @@ def setup_memory_limits(memory_limit_mb: int) -> None:
except (resource.error, OSError, ValueError) as e:
# not all systems support resource limits, so warn instead of failing
- curent = currentframe()
- lineno = getframeinfo(curent).lineno + 1 if curent is not None else 0
+ current = currentframe()
+ lineno = getframeinfo(current).lineno + 1 if current is not None else 0
if "__file__" in globals():
print(
warnings.formatwarning(
diff --git a/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/ExecuteGrpcResponseSender.scala b/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/ExecuteGrpcResponseSender.scala
index 051093fcad277..44b634af95ca9 100644
--- a/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/ExecuteGrpcResponseSender.scala
+++ b/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/execution/ExecuteGrpcResponseSender.scala
@@ -291,7 +291,7 @@ private[connect] class ExecuteGrpcResponseSender[T <: Message](
assert(finished == false)
} else {
// If it wasn't sent, time deadline must have been reached before stream became available,
- // or it was intterupted. Will exit in the next loop iterattion.
+ // or it was interrupted. Will exit in the next loop iteration.
assert(deadlineLimitReached || interrupted)
}
} else if (streamFinished) {