From 812fd437476a92bc3ba91801fff59354abf4a6de Mon Sep 17 00:00:00 2001 From: noahnovsak Date: Mon, 31 Jul 2023 11:25:23 +0200 Subject: [PATCH] dask: distinguish between PCA and IncrementalPCA --- Orange/projection/pca.py | 28 ++++++++++++++++++---------- Orange/tests/test_pca.py | 7 ++++--- 2 files changed, 22 insertions(+), 13 deletions(-) diff --git a/Orange/projection/pca.py b/Orange/projection/pca.py index ae206c9dad5..717c848f1d5 100644 --- a/Orange/projection/pca.py +++ b/Orange/projection/pca.py @@ -11,6 +11,11 @@ from sklearn.utils.extmath import svd_flip, safe_sparse_dot from sklearn.utils.validation import check_is_fitted +try: + import dask_ml.decomposition as dask_decomposition +except ImportError: + dask_decomposition = skl_decomposition + import Orange.data from Orange.statistics import util as ut from Orange.data import Variable @@ -275,18 +280,13 @@ def _initialize_wrapped(self, X=None, Y=None): params["n_components"] = min(*X.shape, params["n_components"]) if isinstance(X, da.Array) or isinstance(Y, da.Array): - try: - import dask_ml.decomposition as dask_decomposition - + if dask_decomposition is skl_decomposition: + warnings.warn("dask_ml is not installed. Using sklearn instead.") + else: if params["iterated_power"] == "auto": params["iterated_power"] = 0 - del params["tol"] - # use IPCA instead of PCA due to memory issues - return dask_decomposition.IncrementalPCA(**params) - - except ImportError: - warnings.warn("dask_ml is not installed. Using sklearn instead.") + return dask_decomposition.PCA(**params) return self.__wraps__(**params) @@ -340,8 +340,16 @@ def __init__(self, n_components=None, whiten=False, copy=True, super().__init__(preprocessors=preprocessors) self.params = vars() + def _initialize_wrapped(self, X=None, Y=None): + if isinstance(X, da.Array) or isinstance(Y, da.Array): + if dask_decomposition is skl_decomposition: + warnings.warn("dask_ml is not installed. Using sklearn instead.") + else: + return dask_decomposition.IncrementalPCA(**self.params) + return self.__wraps__(**self.params) + def fit(self, X, Y=None): - proj = self.__wraps__(**self.params) + proj = self._initialize_wrapped(X, Y) proj = proj.fit(X, Y) return IncrementalPCAModel(proj, self.domain, len(proj.components_)) diff --git a/Orange/tests/test_pca.py b/Orange/tests/test_pca.py index adcc20a3302..1d7ebd89eec 100644 --- a/Orange/tests/test_pca.py +++ b/Orange/tests/test_pca.py @@ -156,8 +156,9 @@ def test_improved_randomized_pca_sparse_data(self): pca.singular_values_, rpca.singular_values_, decimal=8 ) - def test_incremental_pca(self): - data = self.ionosphere + @with_dasktable + def test_incremental_pca(self, prepare_table): + data = prepare_table(self.ionosphere) self.__ipca_test_helper(data, n_com=3, min_xpl_var=0.49) self.__ipca_test_helper(data, n_com=32, min_xpl_var=1) @@ -169,7 +170,7 @@ def __ipca_test_helper(self, data, n_com, min_xpl_var): self.assertEqual(n_com, pca_model.n_components) self.assertEqual((n_com, data.X.shape[1]), pca_model.components_.shape) proj = np.dot(data.X - pca_model.mean_, pca_model.components_.T) - np.testing.assert_almost_equal(pca_model(data).X, proj) + np.testing.assert_almost_equal(pca_model(data).X, np.asarray(proj)) pc1_ipca = pca_model.components_[0] self.assertAlmostEqual(np.linalg.norm(pc1_ipca), 1) pc1_pca = PCA(n_components=n_com)(data).components_[0]