From 8b26c1760fa5c577d62534ad8c827863afb9a055 Mon Sep 17 00:00:00 2001
From: mastoffel <martin.adam.stoffel@gmail.com>
Date: Thu, 7 Mar 2024 10:09:37 +0000
Subject: [PATCH 1/4] simplify-fold-strategy

---
 autoemulate/compare.py  | 38 ++++++++------------------------------
 autoemulate/cv.py       | 23 -----------------------
 autoemulate/printing.py |  6 +++---
 3 files changed, 11 insertions(+), 56 deletions(-)
 delete mode 100644 autoemulate/cv.py

diff --git a/autoemulate/compare.py b/autoemulate/compare.py
index c09d9d4c..a907106f 100644
--- a/autoemulate/compare.py
+++ b/autoemulate/compare.py
@@ -5,6 +5,7 @@
 from sklearn.decomposition import PCA
 from sklearn.metrics import make_scorer
 from sklearn.model_selection import cross_validate
+from sklearn.model_selection import KFold
 from sklearn.model_selection import PredefinedSplit
 from sklearn.model_selection import train_test_split
 from sklearn.pipeline import Pipeline
@@ -14,7 +15,6 @@
 
 from autoemulate.cross_validate import _run_cv
 from autoemulate.cross_validate import _update_scores_df
-from autoemulate.cv import CV_REGISTRY
 from autoemulate.data_splitting import _split_data
 from autoemulate.emulators import MODEL_REGISTRY
 from autoemulate.hyperparam_searching import _optimize_params
@@ -55,8 +55,7 @@ def setup(
         scaler=StandardScaler(),
         reduce_dim=False,
         dim_reducer=PCA(),
-        fold_strategy="kfold",
-        folds=5,
+        cross_validator=KFold(n_splits=5, shuffle=True),
         n_jobs=None,
         model_subset=None,
         log_to_file=False,
@@ -90,10 +89,9 @@ def setup(
             explain 95% of the variance. Other methods can have slightly different n_components
             parameter inputs, see the sklearn documentation for more details. Dimension reduction
             is always performed after scaling.
-        fold_strategy : str
-            Cross-validation strategy, currently either "kfold" or "stratified_kfold".
-        folds : int
-            Number of folds.
+        cross_validator : sklearn.model_selection object
+            Cross-validation strategy to use. Defaults to KFold with 5 splits and shuffle=True.
+            Can be any object in `sklearn.model_selection` that generates train/test indices.
         n_jobs : int
             Number of jobs to run in parallel. `None` means 1, `-1` means using all processors.
         model_subset : list
@@ -117,7 +115,7 @@ def setup(
             dim_reducer,
         )
         self.metrics = self._get_metrics(METRIC_REGISTRY)
-        self.cv = self._get_cv(CV_REGISTRY, fold_strategy, folds)
+        self.cross_validator = cross_validator
         self.param_search = param_search
         self.search_type = param_search_type
         self.param_search_iters = param_search_iters
@@ -128,7 +126,6 @@ def setup(
         self.is_set_up = True
         self.dim_reducer = dim_reducer
         self.reduce_dim = reduce_dim
-        self.folds = folds
         self.cv_results = {}
 
         self.print_setup()
@@ -170,25 +167,6 @@ def _get_metrics(self, METRIC_REGISTRY):
         """
         return [metric for metric in METRIC_REGISTRY.values()]
 
-    def _get_cv(self, CV_REGISTRY, fold_strategy, folds):
-        """Get cross-validation strategy from REGISTRY
-
-        Parameters
-        ----------
-        CV_REGISTRY : dict
-            Registry of cross-validation strategies.
-        fold_strategy : str
-            Name of the cross-validation strategy. Currently only "kfold" is supported.
-        folds : int
-            Number of folds.
-
-        Returns
-        -------
-        cv : sklearn.model_selection.KFold
-            An instance of the KFold class.
-        """
-        return CV_REGISTRY[fold_strategy](folds=folds, shuffle=True)
-
     def compare(self):
         """Compares the emulator models on the data. self.setup() must be run first.
 
@@ -220,7 +198,7 @@ def compare(self):
                     self.models[i] = _optimize_params(
                         X=self.X[self.train_idxs],
                         y=self.y[self.train_idxs],
-                        cv=self.cv,
+                        cv=self.cross_validator,
                         model=self.models[i],
                         search_type=self.search_type,
                         niter=self.param_search_iters,
@@ -233,7 +211,7 @@ def compare(self):
                 fitted_model, cv_results = _run_cv(
                     X=self.X[self.train_idxs],
                     y=self.y[self.train_idxs],
-                    cv=self.cv,
+                    cv=self.cross_validator,
                     model=self.models[i],
                     metrics=self.metrics,
                     n_jobs=self.n_jobs,
diff --git a/autoemulate/cv.py b/autoemulate/cv.py
deleted file mode 100644
index 13307899..00000000
--- a/autoemulate/cv.py
+++ /dev/null
@@ -1,23 +0,0 @@
-from sklearn.model_selection import KFold
-from sklearn.model_selection import StratifiedKFold
-
-
-def kfold(folds=None, shuffle=True):
-    """scikit-learn class for k-fold cross validation.
-
-    Parameters
-    ----------
-    folds : int
-            Number of folds.
-    shuffle : bool
-            Whether or not to shuffle the data before splitting.
-
-    Returns
-    -------
-    kfold : sklearn.model_selection.KFold
-            An instance of the KFold class.
-    """
-    return KFold(n_splits=folds, shuffle=shuffle)
-
-
-CV_REGISTRY = {"kfold": kfold}
diff --git a/autoemulate/printing.py b/autoemulate/printing.py
index 933b1d5f..48d999f0 100644
--- a/autoemulate/printing.py
+++ b/autoemulate/printing.py
@@ -98,13 +98,13 @@ def _print_setup(cls):
                 else "None"
             ),
             str(cls.cv.__class__.__name__ if cls.cv is not None else "None"),
-            str(cls.folds),
+            str(cls.cv.get_n_splits()),
             str(cls.n_jobs if cls.n_jobs is not None else "1"),
         ],
         index=[
             "Simulation input shape (X)",
             "Simulation output shape (y)",
-            "# test set samples (test_set_size)",
+            "# hold-out set samples (test_set_size)",
             "Do hyperparameter search (param_search)",
             "Type of hyperparameter search (search_type)",
             "# sampled parameter settings (param_search_iters)",
@@ -112,7 +112,7 @@ def _print_setup(cls):
             "Scaler (scaler)",
             "Dimensionality reduction before fitting (reduce_dim)",
             "Dimensionality reduction method (dim_reducer)",
-            "Cross-validation strategy (fold_strategy)",
+            "Cross-validation strategy (cross_validator)",
             "# folds (folds)",
             "# parallel jobs (n_jobs)",
         ],

From ba7d9ff30f894d9f78a63d4b635b2011f807c9a8 Mon Sep 17 00:00:00 2001
From: mastoffel <martin.adam.stoffel@gmail.com>
Date: Thu, 7 Mar 2024 10:22:30 +0000
Subject: [PATCH 2/4] change cv to cross_validator

---
 autoemulate/printing.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/autoemulate/printing.py b/autoemulate/printing.py
index 48d999f0..1f4bd02a 100644
--- a/autoemulate/printing.py
+++ b/autoemulate/printing.py
@@ -97,8 +97,12 @@ def _print_setup(cls):
                 if cls.dim_reducer is not None
                 else "None"
             ),
-            str(cls.cv.__class__.__name__ if cls.cv is not None else "None"),
-            str(cls.cv.get_n_splits()),
+            str(
+                cls.cross_validator.__class__.__name__
+                if cls.cross_validator is not None
+                else "None"
+            ),
+            str(cls.cross_validator.get_n_splits()),
             str(cls.n_jobs if cls.n_jobs is not None else "1"),
         ],
         index=[
@@ -113,7 +117,7 @@ def _print_setup(cls):
             "Dimensionality reduction before fitting (reduce_dim)",
             "Dimensionality reduction method (dim_reducer)",
             "Cross-validation strategy (cross_validator)",
-            "# folds (folds)",
+            "# cv-folds",
             "# parallel jobs (n_jobs)",
         ],
         columns=["Values"],

From 734ec14c15b2ad8a00785835b2fd9c441d51b03e Mon Sep 17 00:00:00 2001
From: mastoffel <martin.adam.stoffel@gmail.com>
Date: Thu, 7 Mar 2024 10:32:32 +0000
Subject: [PATCH 3/4] remove folds printing

---
 autoemulate/printing.py | 2 --
 tests/test_compare.py   | 1 -
 2 files changed, 3 deletions(-)

diff --git a/autoemulate/printing.py b/autoemulate/printing.py
index 1f4bd02a..f12b720b 100644
--- a/autoemulate/printing.py
+++ b/autoemulate/printing.py
@@ -102,7 +102,6 @@ def _print_setup(cls):
                 if cls.cross_validator is not None
                 else "None"
             ),
-            str(cls.cross_validator.get_n_splits()),
             str(cls.n_jobs if cls.n_jobs is not None else "1"),
         ],
         index=[
@@ -117,7 +116,6 @@ def _print_setup(cls):
             "Dimensionality reduction before fitting (reduce_dim)",
             "Dimensionality reduction method (dim_reducer)",
             "Cross-validation strategy (cross_validator)",
-            "# cv-folds",
             "# parallel jobs (n_jobs)",
         ],
         columns=["Values"],
diff --git a/tests/test_compare.py b/tests/test_compare.py
index d2323fcb..95652a9c 100644
--- a/tests/test_compare.py
+++ b/tests/test_compare.py
@@ -6,7 +6,6 @@
 from sklearn.preprocessing import StandardScaler
 
 from autoemulate.compare import AutoEmulate
-from autoemulate.cv import CV_REGISTRY
 from autoemulate.emulators import GaussianProcessMOGP
 from autoemulate.emulators import MODEL_REGISTRY
 from autoemulate.emulators import RandomForest

From ce049856953cee1792e65bc71c3275fca8387064 Mon Sep 17 00:00:00 2001
From: mastoffel <martin.adam.stoffel@gmail.com>
Date: Thu, 7 Mar 2024 10:51:15 +0000
Subject: [PATCH 4/4] add tests for scikit learn plugin classes

---
 tests/test_ui.py | 59 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 59 insertions(+)
 create mode 100644 tests/test_ui.py

diff --git a/tests/test_ui.py b/tests/test_ui.py
new file mode 100644
index 00000000..0e779725
--- /dev/null
+++ b/tests/test_ui.py
@@ -0,0 +1,59 @@
+import numpy as np
+from sklearn.decomposition import KernelPCA
+from sklearn.decomposition import PCA
+from sklearn.model_selection import KFold
+from sklearn.model_selection import TimeSeriesSplit
+from sklearn.preprocessing import MinMaxScaler
+from sklearn.preprocessing import RobustScaler
+
+from autoemulate.compare import AutoEmulate
+
+# take fast fitting models for testing
+model_subset = ["SecondOrderPolynomial", "RadialBasisFunctions"]
+
+
+def test_scalers():
+    """Check that a best model is found with each alternative scaler."""
+    rng = np.random.default_rng(0)  # fixed seed: same input data on every run
+    X, y = rng.random((100, 5)), rng.random((100, 1))
+
+    for scaler in (MinMaxScaler(), RobustScaler()):
+        # a new AutoEmulate instance is created for every scaler
+        ae = AutoEmulate()
+        ae.setup(X, y, scaler=scaler, model_subset=model_subset)
+        ae.compare()
+        ae.print_results()
+
+        assert ae.best_model is not None
+
+
+def test_dimension_reducers():
+    """Check that a best model is found with each dimension reducer."""
+    rng = np.random.default_rng(0)  # fixed seed: same input data on every run
+    X, y = rng.random((100, 10)), rng.random((100, 1))
+
+    # both reducers are configured with n_components=5 (X has 10 columns)
+    for dim_reducer in (PCA(n_components=5), KernelPCA(n_components=5)):
+        ae = AutoEmulate()
+        ae.setup(
+            X, y, reduce_dim=True, dim_reducer=dim_reducer, model_subset=model_subset
+        )
+        ae.compare()
+        ae.print_results()
+
+        assert ae.best_model is not None
+
+
+def test_cross_validators():
+    """Check that a best model is found with each cross-validation splitter."""
+    rng = np.random.default_rng(0)  # fixed seed: same input data on every run
+    X, y = rng.random((100, 5)), rng.random((100, 1))
+
+    for cross_validator in (KFold(n_splits=5), TimeSeriesSplit(n_splits=5)):
+        # a new AutoEmulate instance is created for every splitter
+        ae = AutoEmulate()
+        ae.setup(X, y, cross_validator=cross_validator, model_subset=model_subset)
+        ae.compare()
+        ae.print_results()
+
+        assert ae.best_model is not None