Skip to content

Commit

Permalink
[SPARK-49792][PYTHON][BUILD] Upgrade to numpy 2 for building and test…
Browse files Browse the repository at this point in the history
…ing Spark branches

### What changes were proposed in this pull request?
Upgrade numpy to 2.1.0 for building and testing Spark branches.

Failed tests are categorized into the following groups:
- Most of test failures fixed are related to pandas-dev/pandas#59838 (comment).
- Replaced np.mat with np.asmatrix.
- TODO: SPARK-49793

### Why are the changes needed?
Ensure compatibility with newer NumPy, which is utilized by Pandas (on Spark).

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
Existing tests.

### Was this patch authored or co-authored using generative AI tooling?
No.

Closes #48180 from xinrong-meng/np_upgrade.

Authored-by: Xinrong Meng <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
  • Loading branch information
xinrong-meng authored and dongjoon-hyun committed Oct 15, 2024
1 parent add4a9c commit 0e75d19
Show file tree
Hide file tree
Showing 12 changed files with 107 additions and 99 deletions.
6 changes: 3 additions & 3 deletions dev/infra/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ LABEL org.opencontainers.image.ref.name="Apache Spark Infra Image"
# Overwrite this label to avoid exposing the underlying Ubuntu OS version label
LABEL org.opencontainers.image.version=""

ENV FULL_REFRESH_DATE 20241002
ENV FULL_REFRESH_DATE 20241007

ENV DEBIAN_FRONTEND noninteractive
ENV DEBCONF_NONINTERACTIVE_SEEN true
Expand Down Expand Up @@ -91,10 +91,10 @@ RUN mkdir -p /usr/local/pypy/pypy3.9 && \
ln -sf /usr/local/pypy/pypy3.9/bin/pypy /usr/local/bin/pypy3.9 && \
ln -sf /usr/local/pypy/pypy3.9/bin/pypy /usr/local/bin/pypy3
RUN curl -sS https://bootstrap.pypa.io/get-pip.py | pypy3
RUN pypy3 -m pip install 'numpy==1.26.4' 'six==1.16.0' 'pandas==2.2.3' scipy coverage matplotlib lxml
RUN pypy3 -m pip install numpy 'six==1.16.0' 'pandas==2.2.3' scipy coverage matplotlib lxml


ARG BASIC_PIP_PKGS="numpy==1.26.4 pyarrow>=15.0.0 six==1.16.0 pandas==2.2.3 scipy plotly>=4.8 mlflow>=2.8.1 coverage matplotlib openpyxl memory-profiler>=0.61.0 scikit-learn>=1.3.2"
ARG BASIC_PIP_PKGS="numpy pyarrow>=15.0.0 six==1.16.0 pandas==2.2.3 scipy plotly>=4.8 mlflow>=2.8.1 coverage matplotlib openpyxl memory-profiler>=0.61.0 scikit-learn>=1.3.2"
# Python deps for Spark Connect
ARG CONNECT_PIP_PKGS="grpcio==1.62.0 grpcio-status==1.62.0 protobuf==4.25.1 googleapis-common-protos==1.56.4 graphviz==0.20.3"

Expand Down
8 changes: 4 additions & 4 deletions python/pyspark/ml/classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -699,7 +699,7 @@ class LinearSVC(
>>> model_path = temp_path + "/svm_model"
>>> model.save(model_path)
>>> model2 = LinearSVCModel.load(model_path)
>>> model.coefficients[0] == model2.coefficients[0]
>>> bool(model.coefficients[0] == model2.coefficients[0])
True
>>> model.intercept == model2.intercept
True
Expand Down Expand Up @@ -1210,7 +1210,7 @@ class LogisticRegression(
>>> model_path = temp_path + "/lr_model"
>>> blorModel.save(model_path)
>>> model2 = LogisticRegressionModel.load(model_path)
>>> blorModel.coefficients[0] == model2.coefficients[0]
>>> bool(blorModel.coefficients[0] == model2.coefficients[0])
True
>>> blorModel.intercept == model2.intercept
True
Expand Down Expand Up @@ -2038,9 +2038,9 @@ class RandomForestClassifier(
>>> result = model.transform(test0).head()
>>> result.prediction
0.0
>>> numpy.argmax(result.probability)
>>> int(numpy.argmax(result.probability))
0
>>> numpy.argmax(result.newRawPrediction)
>>> int(numpy.argmax(result.newRawPrediction))
0
>>> result.leafId
DenseVector([0.0, 0.0, 0.0])
Expand Down
10 changes: 5 additions & 5 deletions python/pyspark/ml/regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,7 +266,7 @@ class LinearRegression(
True
>>> abs(model.transform(test0).head().newPrediction - (-1.0)) < 0.001
True
>>> abs(model.coefficients[0] - 1.0) < 0.001
>>> bool(abs(model.coefficients[0] - 1.0) < 0.001)
True
>>> abs(model.intercept - 0.0) < 0.001
True
Expand All @@ -283,11 +283,11 @@ class LinearRegression(
>>> model_path = temp_path + "/lr_model"
>>> model.save(model_path)
>>> model2 = LinearRegressionModel.load(model_path)
>>> model.coefficients[0] == model2.coefficients[0]
>>> bool(model.coefficients[0] == model2.coefficients[0])
True
>>> model.intercept == model2.intercept
>>> bool(model.intercept == model2.intercept)
True
>>> model.transform(test0).take(1) == model2.transform(test0).take(1)
>>> bool(model.transform(test0).take(1) == model2.transform(test0).take(1))
True
>>> model.numFeatures
1
Expand Down Expand Up @@ -2542,7 +2542,7 @@ class GeneralizedLinearRegression(
>>> model2 = GeneralizedLinearRegressionModel.load(model_path)
>>> model.intercept == model2.intercept
True
>>> model.coefficients[0] == model2.coefficients[0]
>>> bool(model.coefficients[0] == model2.coefficients[0])
True
>>> model.transform(df).take(1) == model2.transform(df).take(1)
True
Expand Down
5 changes: 5 additions & 0 deletions python/pyspark/ml/tests/test_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

import numpy as np

from pyspark.loose_version import LooseVersion
from pyspark.ml.functions import predict_batch_udf
from pyspark.sql.functions import array, struct, col
from pyspark.sql.types import ArrayType, DoubleType, IntegerType, StructType, StructField, FloatType
Expand Down Expand Up @@ -193,6 +194,10 @@ def predict(inputs):
batch_sizes = preds["preds"].to_numpy()
self.assertTrue(all(batch_sizes <= batch_size))

# TODO(SPARK-49793): enable the test below
@unittest.skipIf(
LooseVersion(np.__version__) >= LooseVersion("2"), "Caching does not work with numpy 2"
)
def test_caching(self):
def make_predict_fn():
# emulate loading a model, this should only be invoked once (per worker process)
Expand Down
2 changes: 1 addition & 1 deletion python/pyspark/ml/tuning.py
Original file line number Diff line number Diff line change
Expand Up @@ -706,7 +706,7 @@ class CrossValidator(
>>> cvModel = cv.fit(dataset)
>>> cvModel.getNumFolds()
3
>>> cvModel.avgMetrics[0]
>>> float(cvModel.avgMetrics[0])
0.5
>>> path = tempfile.mkdtemp()
>>> model_path = path + "/model"
Expand Down
25 changes: 14 additions & 11 deletions python/pyspark/mllib/classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,9 +172,9 @@ class LogisticRegressionModel(LinearClassificationModel):
>>> path = tempfile.mkdtemp()
>>> lrm.save(sc, path)
>>> sameModel = LogisticRegressionModel.load(sc, path)
>>> sameModel.predict(numpy.array([0.0, 1.0]))
>>> int(sameModel.predict(numpy.array([0.0, 1.0])))
1
>>> sameModel.predict(SparseVector(2, {0: 1.0}))
>>> int(sameModel.predict(SparseVector(2, {0: 1.0})))
0
>>> from shutil import rmtree
>>> try:
Expand Down Expand Up @@ -555,7 +555,7 @@ class SVMModel(LinearClassificationModel):
>>> svm.predict(sc.parallelize([[1.0]])).collect()
[1]
>>> svm.clearThreshold()
>>> svm.predict(numpy.array([1.0]))
>>> float(svm.predict(numpy.array([1.0])))
1.44...
>>> sparse_data = [
Expand All @@ -573,9 +573,9 @@ class SVMModel(LinearClassificationModel):
>>> path = tempfile.mkdtemp()
>>> svm.save(sc, path)
>>> sameModel = SVMModel.load(sc, path)
>>> sameModel.predict(SparseVector(2, {1: 1.0}))
>>> int(sameModel.predict(SparseVector(2, {1: 1.0})))
1
>>> sameModel.predict(SparseVector(2, {0: -1.0}))
>>> int(sameModel.predict(SparseVector(2, {0: -1.0})))
0
>>> from shutil import rmtree
>>> try:
Expand Down Expand Up @@ -756,27 +756,30 @@ class NaiveBayesModel(Saveable, Loader["NaiveBayesModel"]):
... LabeledPoint(1.0, [1.0, 0.0]),
... ]
>>> model = NaiveBayes.train(sc.parallelize(data))
>>> model.predict(numpy.array([0.0, 1.0]))
>>> float(model.predict(numpy.array([0.0, 1.0])))
0.0
>>> model.predict(numpy.array([1.0, 0.0]))
>>> float(model.predict(numpy.array([1.0, 0.0])))
1.0
>>> model.predict(sc.parallelize([[1.0, 0.0]])).collect()
>>> list(map(float, model.predict(sc.parallelize([[1.0, 0.0]])).collect()))
[1.0]
>>> sparse_data = [
... LabeledPoint(0.0, SparseVector(2, {1: 0.0})),
... LabeledPoint(0.0, SparseVector(2, {1: 1.0})),
... LabeledPoint(1.0, SparseVector(2, {0: 1.0}))
... ]
>>> model = NaiveBayes.train(sc.parallelize(sparse_data))
>>> model.predict(SparseVector(2, {1: 1.0}))
>>> float(model.predict(SparseVector(2, {1: 1.0})))
0.0
>>> model.predict(SparseVector(2, {0: 1.0}))
>>> float(model.predict(SparseVector(2, {0: 1.0})))
1.0
>>> import os, tempfile
>>> path = tempfile.mkdtemp()
>>> model.save(sc, path)
>>> sameModel = NaiveBayesModel.load(sc, path)
>>> sameModel.predict(SparseVector(2, {0: 1.0})) == model.predict(SparseVector(2, {0: 1.0}))
>>> bool((
... sameModel.predict(SparseVector(2, {0: 1.0})) ==
... model.predict(SparseVector(2, {0: 1.0}))
... ))
True
>>> from shutil import rmtree
>>> try:
Expand Down
4 changes: 2 additions & 2 deletions python/pyspark/mllib/feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -554,9 +554,9 @@ class PCA:
... Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0])]
>>> model = PCA(2).fit(sc.parallelize(data))
>>> pcArray = model.transform(Vectors.sparse(5, [(1, 1.0), (3, 7.0)])).toArray()
>>> pcArray[0]
>>> float(pcArray[0])
1.648...
>>> pcArray[1]
>>> float(pcArray[1])
-4.013...
"""

Expand Down
42 changes: 21 additions & 21 deletions python/pyspark/mllib/random.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,9 +134,9 @@ def normalRDD(
>>> stats = x.stats()
>>> stats.count()
1000
>>> abs(stats.mean() - 0.0) < 0.1
>>> bool(abs(stats.mean() - 0.0) < 0.1)
True
>>> abs(stats.stdev() - 1.0) < 0.1
>>> bool(abs(stats.stdev() - 1.0) < 0.1)
True
"""
return callMLlibFunc("normalRDD", sc._jsc, size, numPartitions, seed)
Expand Down Expand Up @@ -186,10 +186,10 @@ def logNormalRDD(
>>> stats = x.stats()
>>> stats.count()
1000
>>> abs(stats.mean() - expMean) < 0.5
>>> bool(abs(stats.mean() - expMean) < 0.5)
True
>>> from math import sqrt
>>> abs(stats.stdev() - expStd) < 0.5
>>> bool(abs(stats.stdev() - expStd) < 0.5)
True
"""
return callMLlibFunc(
Expand Down Expand Up @@ -238,7 +238,7 @@ def poissonRDD(
>>> abs(stats.mean() - mean) < 0.5
True
>>> from math import sqrt
>>> abs(stats.stdev() - sqrt(mean)) < 0.5
>>> bool(abs(stats.stdev() - sqrt(mean)) < 0.5)
True
"""
return callMLlibFunc("poissonRDD", sc._jsc, float(mean), size, numPartitions, seed)
Expand Down Expand Up @@ -285,7 +285,7 @@ def exponentialRDD(
>>> abs(stats.mean() - mean) < 0.5
True
>>> from math import sqrt
>>> abs(stats.stdev() - sqrt(mean)) < 0.5
>>> bool(abs(stats.stdev() - sqrt(mean)) < 0.5)
True
"""
return callMLlibFunc("exponentialRDD", sc._jsc, float(mean), size, numPartitions, seed)
Expand Down Expand Up @@ -336,9 +336,9 @@ def gammaRDD(
>>> stats = x.stats()
>>> stats.count()
1000
>>> abs(stats.mean() - expMean) < 0.5
>>> bool(abs(stats.mean() - expMean) < 0.5)
True
>>> abs(stats.stdev() - expStd) < 0.5
>>> bool(abs(stats.stdev() - expStd) < 0.5)
True
"""
return callMLlibFunc(
Expand Down Expand Up @@ -384,7 +384,7 @@ def uniformVectorRDD(
>>> mat = np.matrix(RandomRDDs.uniformVectorRDD(sc, 10, 10).collect())
>>> mat.shape
(10, 10)
>>> mat.max() <= 1.0 and mat.min() >= 0.0
>>> bool(mat.max() <= 1.0 and mat.min() >= 0.0)
True
>>> RandomRDDs.uniformVectorRDD(sc, 10, 10, 4).getNumPartitions()
4
Expand Down Expand Up @@ -430,9 +430,9 @@ def normalVectorRDD(
>>> mat = np.matrix(RandomRDDs.normalVectorRDD(sc, 100, 100, seed=1).collect())
>>> mat.shape
(100, 100)
>>> abs(mat.mean() - 0.0) < 0.1
>>> bool(abs(mat.mean() - 0.0) < 0.1)
True
>>> abs(mat.std() - 1.0) < 0.1
>>> bool(abs(mat.std() - 1.0) < 0.1)
True
"""
return callMLlibFunc("normalVectorRDD", sc._jsc, numRows, numCols, numPartitions, seed)
Expand Down Expand Up @@ -488,9 +488,9 @@ def logNormalVectorRDD(
>>> mat = np.matrix(m)
>>> mat.shape
(100, 100)
>>> abs(mat.mean() - expMean) < 0.1
>>> bool(abs(mat.mean() - expMean) < 0.1)
True
>>> abs(mat.std() - expStd) < 0.1
>>> bool(abs(mat.std() - expStd) < 0.1)
True
"""
return callMLlibFunc(
Expand Down Expand Up @@ -545,13 +545,13 @@ def poissonVectorRDD(
>>> import numpy as np
>>> mean = 100.0
>>> rdd = RandomRDDs.poissonVectorRDD(sc, mean, 100, 100, seed=1)
>>> mat = np.mat(rdd.collect())
>>> mat = np.asmatrix(rdd.collect())
>>> mat.shape
(100, 100)
>>> abs(mat.mean() - mean) < 0.5
>>> bool(abs(mat.mean() - mean) < 0.5)
True
>>> from math import sqrt
>>> abs(mat.std() - sqrt(mean)) < 0.5
>>> bool(abs(mat.std() - sqrt(mean)) < 0.5)
True
"""
return callMLlibFunc(
Expand Down Expand Up @@ -599,13 +599,13 @@ def exponentialVectorRDD(
>>> import numpy as np
>>> mean = 0.5
>>> rdd = RandomRDDs.exponentialVectorRDD(sc, mean, 100, 100, seed=1)
>>> mat = np.mat(rdd.collect())
>>> mat = np.asmatrix(rdd.collect())
>>> mat.shape
(100, 100)
>>> abs(mat.mean() - mean) < 0.5
>>> bool(abs(mat.mean() - mean) < 0.5)
True
>>> from math import sqrt
>>> abs(mat.std() - sqrt(mean)) < 0.5
>>> bool(abs(mat.std() - sqrt(mean)) < 0.5)
True
"""
return callMLlibFunc(
Expand Down Expand Up @@ -662,9 +662,9 @@ def gammaVectorRDD(
>>> mat = np.matrix(RandomRDDs.gammaVectorRDD(sc, shape, scale, 100, 100, seed=1).collect())
>>> mat.shape
(100, 100)
>>> abs(mat.mean() - expMean) < 0.1
>>> bool(abs(mat.mean() - expMean) < 0.1)
True
>>> abs(mat.std() - expStd) < 0.1
>>> bool(abs(mat.std() - expStd) < 0.1)
True
"""
return callMLlibFunc(
Expand Down
Loading

0 comments on commit 0e75d19

Please sign in to comment.