Merge pull request #202 from alan-turing-institute/fold-strategy

Fold strategy
alan-turing-institute · Mar 7, 2024 · 07fe895
2 parents 0f906d9 + ce04985
commit 07fe895
Show file tree

Hide file tree

Showing 5 changed files with 74 additions and 59 deletions.
diff --git a/autoemulate/compare.py b/autoemulate/compare.py
@@ -5,6 +5,7 @@
 from sklearn.decomposition import PCA
 from sklearn.metrics import make_scorer
 from sklearn.model_selection import cross_validate
+from sklearn.model_selection import KFold
 from sklearn.model_selection import PredefinedSplit
 from sklearn.model_selection import train_test_split
 from sklearn.pipeline import Pipeline
@@ -14,7 +15,6 @@
 
 from autoemulate.cross_validate import _run_cv
 from autoemulate.cross_validate import _update_scores_df
-from autoemulate.cv import CV_REGISTRY
 from autoemulate.data_splitting import _split_data
 from autoemulate.emulators import MODEL_REGISTRY
 from autoemulate.hyperparam_searching import _optimize_params
@@ -55,8 +55,7 @@ def setup(
         scaler=StandardScaler(),
         reduce_dim=False,
         dim_reducer=PCA(),
-        fold_strategy="kfold",
-        folds=5,
+        cross_validator=KFold(n_splits=5, shuffle=True),
         n_jobs=None,
         model_subset=None,
         log_to_file=False,
@@ -90,10 +89,9 @@ def setup(
             explain 95% of the variance. Other methods can have slightly different n_components
             parameter inputs, see the sklearn documentation for more details. Dimension reduction
             is always performed after scaling.
-        fold_strategy : str
-            Cross-validation strategy, currently either "kfold" or "stratified_kfold".
-        folds : int
-            Number of folds.
+        cross_validator : sklearn.model_selection object
+            Cross-validation strategy to use. Defaults to KFold with 5 splits and shuffle=True.
+            Can be any object in `sklearn.model_selection` that generates train/test indices.
         n_jobs : int
             Number of jobs to run in parallel. `None` means 1, `-1` means using all processors.
         model_subset : list
@@ -117,7 +115,7 @@ def setup(
             dim_reducer,
         )
         self.metrics = self._get_metrics(METRIC_REGISTRY)
-        self.cv = self._get_cv(CV_REGISTRY, fold_strategy, folds)
+        self.cross_validator = cross_validator
         self.param_search = param_search
         self.search_type = param_search_type
         self.param_search_iters = param_search_iters
@@ -128,7 +126,6 @@ def setup(
         self.is_set_up = True
         self.dim_reducer = dim_reducer
         self.reduce_dim = reduce_dim
-        self.folds = folds
         self.cv_results = {}
 
         self.print_setup()
@@ -170,25 +167,6 @@ def _get_metrics(self, METRIC_REGISTRY):
         """
         return [metric for metric in METRIC_REGISTRY.values()]
 
-    def _get_cv(self, CV_REGISTRY, fold_strategy, folds):
-        """Get cross-validation strategy from REGISTRY
-
-        Parameters
-        ----------
-        CV_REGISTRY : dict
-            Registry of cross-validation strategies.
-        fold_strategy : str
-            Name of the cross-validation strategy. Currently only "kfold" is supported.
-        folds : int
-            Number of folds.
-
-        Returns
-        -------
-        cv : sklearn.model_selection.KFold
-            An instance of the KFold class.
-        """
-        return CV_REGISTRY[fold_strategy](folds=folds, shuffle=True)
-
     def compare(self):
         """Compares the emulator models on the data. self.setup() must be run first.
 
@@ -220,7 +198,7 @@ def compare(self):
                     self.models[i] = _optimize_params(
                         X=self.X[self.train_idxs],
                         y=self.y[self.train_idxs],
-                        cv=self.cv,
+                        cv=self.cross_validator,
                         model=self.models[i],
                         search_type=self.search_type,
                         niter=self.param_search_iters,
@@ -233,7 +211,7 @@ def compare(self):
                 fitted_model, cv_results = _run_cv(
                     X=self.X[self.train_idxs],
                     y=self.y[self.train_idxs],
-                    cv=self.cv,
+                    cv=self.cross_validator,
                     model=self.models[i],
                     metrics=self.metrics,
                     n_jobs=self.n_jobs,

diff --git a/autoemulate/cv.py b/autoemulate/cv.py
diff --git a/autoemulate/printing.py b/autoemulate/printing.py
@@ -97,23 +97,25 @@ def _print_setup(cls):
                 if cls.dim_reducer is not None
                 else "None"
             ),
-            str(cls.cv.__class__.__name__ if cls.cv is not None else "None"),
-            str(cls.folds),
+            str(
+                cls.cross_validator.__class__.__name__
+                if cls.cross_validator is not None
+                else "None"
+            ),
             str(cls.n_jobs if cls.n_jobs is not None else "1"),
         ],
         index=[
             "Simulation input shape (X)",
             "Simulation output shape (y)",
-            "# test set samples (test_set_size)",
+            "# hold-out set samples (test_set_size)",
             "Do hyperparameter search (param_search)",
             "Type of hyperparameter search (search_type)",
             "# sampled parameter settings (param_search_iters)",
             "Scale data before fitting (scale)",
             "Scaler (scaler)",
             "Dimensionality reduction before fitting (reduce_dim)",
             "Dimensionality reduction method (dim_reducer)",
-            "Cross-validation strategy (fold_strategy)",
-            "# folds (folds)",
+            "Cross-validation strategy (cross_validator)",
             "# parallel jobs (n_jobs)",
         ],
         columns=["Values"],

diff --git a/tests/test_compare.py b/tests/test_compare.py
@@ -6,7 +6,6 @@
 from sklearn.preprocessing import StandardScaler
 
 from autoemulate.compare import AutoEmulate
-from autoemulate.cv import CV_REGISTRY
 from autoemulate.emulators import GaussianProcessMOGP
 from autoemulate.emulators import MODEL_REGISTRY
 from autoemulate.emulators import RandomForest

diff --git a/tests/test_ui.py b/tests/test_ui.py
@@ -0,0 +1,59 @@
+import numpy as np
+from sklearn.decomposition import KernelPCA
+from sklearn.decomposition import PCA
+from sklearn.model_selection import KFold
+from sklearn.model_selection import TimeSeriesSplit
+from sklearn.preprocessing import MinMaxScaler
+from sklearn.preprocessing import RobustScaler
+
+from autoemulate.compare import AutoEmulate
+
+# take fast fitting models for testing
+model_subset = ["SecondOrderPolynomial", "RadialBasisFunctions"]
+
+
+def test_scalers():
+    X = np.random.rand(100, 5)
+    y = np.random.rand(100, 1)
+
+    scalers = [MinMaxScaler(), RobustScaler()]
+
+    for scaler in scalers:
+        ae = AutoEmulate()
+        ae.setup(X, y, scaler=scaler, model_subset=model_subset)
+        ae.compare()
+        ae.print_results()
+
+        assert ae.best_model is not None
+
+
+def test_dimension_reducers():
+    X = np.random.rand(100, 10)
+    y = np.random.rand(100, 1)
+
+    dim_reducers = [PCA(n_components=5), KernelPCA(n_components=5)]
+
+    for dim_reducer in dim_reducers:
+        ae = AutoEmulate()
+        ae.setup(
+            X, y, reduce_dim=True, dim_reducer=dim_reducer, model_subset=model_subset
+        )
+        ae.compare()
+        ae.print_results()
+
+        assert ae.best_model is not None
+
+
+def test_cross_validators():
+    X = np.random.rand(100, 5)
+    y = np.random.rand(100, 1)
+
+    cross_validators = [KFold(n_splits=5), TimeSeriesSplit(n_splits=5)]
+
+    for cross_validator in cross_validators:
+        ae = AutoEmulate()
+        ae.setup(X, y, cross_validator=cross_validator, model_subset=model_subset)
+        ae.compare()
+        ae.print_results()
+
+        assert ae.best_model is not None