From 8b26c1760fa5c577d62534ad8c827863afb9a055 Mon Sep 17 00:00:00 2001 From: mastoffel Date: Thu, 7 Mar 2024 10:09:37 +0000 Subject: [PATCH 1/4] simplify-fold-strategy --- autoemulate/compare.py | 38 ++++++++------------------------------ autoemulate/cv.py | 23 ----------------------- autoemulate/printing.py | 6 +++--- 3 files changed, 11 insertions(+), 56 deletions(-) delete mode 100644 autoemulate/cv.py diff --git a/autoemulate/compare.py b/autoemulate/compare.py index c09d9d4c..a907106f 100644 --- a/autoemulate/compare.py +++ b/autoemulate/compare.py @@ -5,6 +5,7 @@ from sklearn.decomposition import PCA from sklearn.metrics import make_scorer from sklearn.model_selection import cross_validate +from sklearn.model_selection import KFold from sklearn.model_selection import PredefinedSplit from sklearn.model_selection import train_test_split from sklearn.pipeline import Pipeline @@ -14,7 +15,6 @@ from autoemulate.cross_validate import _run_cv from autoemulate.cross_validate import _update_scores_df -from autoemulate.cv import CV_REGISTRY from autoemulate.data_splitting import _split_data from autoemulate.emulators import MODEL_REGISTRY from autoemulate.hyperparam_searching import _optimize_params @@ -55,8 +55,7 @@ def setup( scaler=StandardScaler(), reduce_dim=False, dim_reducer=PCA(), - fold_strategy="kfold", - folds=5, + cross_validator=KFold(n_splits=5, shuffle=True), n_jobs=None, model_subset=None, log_to_file=False, @@ -90,10 +89,9 @@ def setup( explain 95% of the variance. Other methods can have slightly different n_components parameter inputs, see the sklearn documentation for more details. Dimension reduction is always performed after scaling. - fold_strategy : str - Cross-validation strategy, currently either "kfold" or "stratified_kfold". - folds : int - Number of folds. + cross_validator : sklearn.model_selection object + Cross-validation strategy to use. Defaults to KFold with 5 splits and shuffle=True. + Can be any object in `sklearn.model_selection` that generates train/test indices. n_jobs : int Number of jobs to run in parallel. `None` means 1, `-1` means using all processors. model_subset : list @@ -117,7 +115,7 @@ def setup( dim_reducer, ) self.metrics = self._get_metrics(METRIC_REGISTRY) - self.cv = self._get_cv(CV_REGISTRY, fold_strategy, folds) + self.cross_validator = cross_validator self.param_search = param_search self.search_type = param_search_type self.param_search_iters = param_search_iters @@ -128,7 +126,6 @@ def setup( self.is_set_up = True self.dim_reducer = dim_reducer self.reduce_dim = reduce_dim - self.folds = folds self.cv_results = {} self.print_setup() @@ -170,25 +167,6 @@ def _get_metrics(self, METRIC_REGISTRY): """ return [metric for metric in METRIC_REGISTRY.values()] - def _get_cv(self, CV_REGISTRY, fold_strategy, folds): - """Get cross-validation strategy from REGISTRY - - Parameters - ---------- - CV_REGISTRY : dict - Registry of cross-validation strategies. - fold_strategy : str - Name of the cross-validation strategy. Currently only "kfold" is supported. - folds : int - Number of folds. - - Returns - ------- - cv : sklearn.model_selection.KFold - An instance of the KFold class. - """ - return CV_REGISTRY[fold_strategy](folds=folds, shuffle=True) - def compare(self): """Compares the emulator models on the data. self.setup() must be run first. @@ -220,7 +198,7 @@ def compare(self): self.models[i] = _optimize_params( X=self.X[self.train_idxs], y=self.y[self.train_idxs], - cv=self.cv, + cv=self.cross_validator, model=self.models[i], search_type=self.search_type, niter=self.param_search_iters, @@ -233,7 +211,7 @@ def compare(self): fitted_model, cv_results = _run_cv( X=self.X[self.train_idxs], y=self.y[self.train_idxs], - cv=self.cv, + cv=self.cross_validator, model=self.models[i], metrics=self.metrics, n_jobs=self.n_jobs, diff --git a/autoemulate/cv.py b/autoemulate/cv.py deleted file mode 100644 index 13307899..00000000 --- a/autoemulate/cv.py +++ /dev/null @@ -1,23 +0,0 @@ -from sklearn.model_selection import KFold -from sklearn.model_selection import StratifiedKFold - - -def kfold(folds=None, shuffle=True): - """scikit-learn class for k-fold cross validation. - - Parameters - ---------- - folds : int - Number of folds. - shuffle : bool - Whether or not to shuffle the data before splitting. - - Returns - ------- - kfold : sklearn.model_selection.KFold - An instance of the KFold class. - """ - return KFold(n_splits=folds, shuffle=shuffle) - - -CV_REGISTRY = {"kfold": kfold} diff --git a/autoemulate/printing.py b/autoemulate/printing.py index 933b1d5f..48d999f0 100644 --- a/autoemulate/printing.py +++ b/autoemulate/printing.py @@ -98,13 +98,13 @@ def _print_setup(cls): else "None" ), str(cls.cv.__class__.__name__ if cls.cv is not None else "None"), - str(cls.folds), + str(cls.cv.get_n_splits()), str(cls.n_jobs if cls.n_jobs is not None else "1"), ], index=[ "Simulation input shape (X)", "Simulation output shape (y)", - "# test set samples (test_set_size)", + "# hold-out set samples (test_set_size)", "Do hyperparameter search (param_search)", "Type of hyperparameter search (search_type)", "# sampled parameter settings (param_search_iters)", @@ -112,7 +112,7 @@ def _print_setup(cls): "Scaler (scaler)", "Dimensionality reduction before fitting (reduce_dim)", "Dimensionality reduction method (dim_reducer)", - "Cross-validation strategy (fold_strategy)", + "Cross-validation strategy (cross_validator)", "# folds (folds)", "# parallel jobs (n_jobs)", ], From ba7d9ff30f894d9f78a63d4b635b2011f807c9a8 Mon Sep 17 00:00:00 2001 From: mastoffel Date: Thu, 7 Mar 2024 10:22:30 +0000 Subject: [PATCH 2/4] change cv to cross_validator --- autoemulate/printing.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/autoemulate/printing.py b/autoemulate/printing.py index 48d999f0..1f4bd02a 100644 --- a/autoemulate/printing.py +++ b/autoemulate/printing.py @@ -97,8 +97,12 @@ def _print_setup(cls): if cls.dim_reducer is not None else "None" ), - str(cls.cv.__class__.__name__ if cls.cv is not None else "None"), - str(cls.cv.get_n_splits()), + str( + cls.cross_validator.__class__.__name__ + if cls.cross_validator is not None + else "None" + ), + str(cls.cross_validator.get_n_splits()), str(cls.n_jobs if cls.n_jobs is not None else "1"), ], index=[ @@ -113,7 +117,7 @@ def _print_setup(cls): "Dimensionality reduction before fitting (reduce_dim)", "Dimensionality reduction method (dim_reducer)", "Cross-validation strategy (cross_validator)", - "# folds (folds)", + "# cv-folds", "# parallel jobs (n_jobs)", ], columns=["Values"], From 734ec14c15b2ad8a00785835b2fd9c441d51b03e Mon Sep 17 00:00:00 2001 From: mastoffel Date: Thu, 7 Mar 2024 10:32:32 +0000 Subject: [PATCH 3/4] remove folds printing --- autoemulate/printing.py | 2 -- tests/test_compare.py | 1 - 2 files changed, 3 deletions(-) diff --git a/autoemulate/printing.py b/autoemulate/printing.py index 1f4bd02a..f12b720b 100644 --- a/autoemulate/printing.py +++ b/autoemulate/printing.py @@ -102,7 +102,6 @@ def _print_setup(cls): if cls.cross_validator is not None else "None" ), - str(cls.cross_validator.get_n_splits()), str(cls.n_jobs if cls.n_jobs is not None else "1"), ], index=[ @@ -117,7 +116,6 @@ def _print_setup(cls): "Dimensionality reduction before fitting (reduce_dim)", "Dimensionality reduction method (dim_reducer)", "Cross-validation strategy (cross_validator)", - "# cv-folds", "# parallel jobs (n_jobs)", ], columns=["Values"], diff --git a/tests/test_compare.py b/tests/test_compare.py index d2323fcb..95652a9c 100644 --- a/tests/test_compare.py +++ b/tests/test_compare.py @@ -6,7 +6,6 @@ from sklearn.preprocessing import StandardScaler from autoemulate.compare import AutoEmulate -from autoemulate.cv import CV_REGISTRY from autoemulate.emulators import GaussianProcessMOGP from autoemulate.emulators import MODEL_REGISTRY from autoemulate.emulators import RandomForest From ce049856953cee1792e65bc71c3275fca8387064 Mon Sep 17 00:00:00 2001 From: mastoffel Date: Thu, 7 Mar 2024 10:51:15 +0000 Subject: [PATCH 4/4] add tests for scikit learn plugin classes --- tests/test_ui.py | 59 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 tests/test_ui.py diff --git a/tests/test_ui.py b/tests/test_ui.py new file mode 100644 index 00000000..0e779725 --- /dev/null +++ b/tests/test_ui.py @@ -0,0 +1,59 @@ +import numpy as np +from sklearn.decomposition import KernelPCA +from sklearn.decomposition import PCA +from sklearn.model_selection import KFold +from sklearn.model_selection import TimeSeriesSplit +from sklearn.preprocessing import MinMaxScaler +from sklearn.preprocessing import RobustScaler + +from autoemulate.compare import AutoEmulate + +# take fast fitting models for testing +model_subset = ["SecondOrderPolynomial", "RadialBasisFunctions"] + + +def test_scalers(): + X = np.random.rand(100, 5) + y = np.random.rand(100, 1) + + scalers = [MinMaxScaler(), RobustScaler()] + + for scaler in scalers: + ae = AutoEmulate() + ae.setup(X, y, scaler=scaler, model_subset=model_subset) + ae.compare() + ae.print_results() + + assert ae.best_model is not None + + +def test_dimension_reducers(): + X = np.random.rand(100, 10) + y = np.random.rand(100, 1) + + dim_reducers = [PCA(n_components=5), KernelPCA(n_components=5)] + + for dim_reducer in dim_reducers: + ae = AutoEmulate() + ae.setup( + X, y, reduce_dim=True, dim_reducer=dim_reducer, model_subset=model_subset + ) + ae.compare() + ae.print_results() + + assert ae.best_model is not None + + +def test_cross_validators(): + X = np.random.rand(100, 5) + y = np.random.rand(100, 1) + + cross_validators = [KFold(n_splits=5), TimeSeriesSplit(n_splits=5)] + + for cross_validator in cross_validators: + ae = AutoEmulate() + ae.setup(X, y, cross_validator=cross_validator, model_subset=model_subset) + ae.compare() + ae.print_results() + + assert ae.best_model is not None