Merge pull request #155 from alan-turing-institute/test-set-split
Test set split
mastoffel authored Feb 12, 2024
2 parents a70aa6e + c36ecc8 commit 0139be9
Showing 6 changed files with 181 additions and 38 deletions.
67 changes: 47 additions & 20 deletions autoemulate/compare.py
@@ -1,12 +1,17 @@
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_validate
from sklearn.model_selection import PredefinedSplit
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils.validation import check_X_y

from autoemulate.cross_validate import run_cv
from autoemulate.cross_validate import single_split
from autoemulate.cross_validate import split_data
from autoemulate.cross_validate import update_scores_df
from autoemulate.cv import CV_REGISTRY
from autoemulate.emulators import MODEL_REGISTRY
@@ -35,6 +40,7 @@ def setup(
param_search=False,
param_search_type="random",
param_search_iters=20,
param_search_test_size=0.2,
scale=True,
scaler=StandardScaler(),
reduce_dim=False,
@@ -88,6 +94,9 @@ def setup(
Whether to log to file.
"""
self.X, self.y = self._check_input(X, y)
self.train_idxs, self.test_idxs = split_data(
self.X, test_size=param_search_test_size, param_search=param_search
)
self.models = get_and_process_models(
MODEL_REGISTRY,
model_subset,
@@ -187,8 +196,8 @@ def compare(self):
# hyperparameter search
if self.param_search:
self.models[i] = optimize_params(
X=self.X,
y=self.y,
X=self.X[self.train_idxs],
y=self.y[self.train_idxs],
cv=self.cv,
model=self.models[i],
search_type=self.search_type,
@@ -197,16 +206,28 @@ def compare(self):
n_jobs=self.n_jobs,
logger=self.logger,
)
# run cross validation and store results
self.cv_results[get_model_name(self.models[i])] = run_cv(
X=self.X,
y=self.y,
cv=self.cv,
model=self.models[i],
metrics=self.metrics,
n_jobs=self.n_jobs,
logger=self.logger,
)

self.cv_results[get_model_name(self.models[i])] = run_cv(
X=self.X,
y=self.y,
cv=single_split(self.X, self.test_idxs), # predict on test set
model=self.models[i],
metrics=self.metrics,
n_jobs=self.n_jobs,
logger=self.logger,
)

else:
# run cross validation and store results
self.cv_results[get_model_name(self.models[i])] = run_cv(
X=self.X,
y=self.y,
cv=self.cv,
model=self.models[i],
metrics=self.metrics,
n_jobs=self.n_jobs,
logger=self.logger,
)
# update scores dataframe
self.scores_df = update_scores_df(
self.scores_df,
@@ -275,22 +296,28 @@ def load_model(self, filepath):
serialiser = ModelSerialiser()
return serialiser.load_model(filepath)

def print_results(self, sort_by="r2", model=None):
def print_results(self, model=None, sort_by="r2"):
"""Print cv results.
Parameters
----------
sort_by : str, optional
The metric to sort by. Default is "r2", can also be "rmse".
model : str, optional
The name of the model to print. If None, the best fold from each model will be printed.
If a model name is provided, the scores for that model across all folds will be printed.
sort_by : str, optional
The metric to sort by. Default is "r2", can also be "rmse".
"""
print_cv_results(self.models, self.scores_df, model=model, sort_by=sort_by)
print_cv_results(
self.models,
self.scores_df,
model=model,
sort_by=sort_by,
param_search=self.param_search,
)

def plot_results(
self,
model_name=None,
model=None,
plot_type="actual_vs_predicted",
n_cols=3,
figsize=None,
@@ -300,8 +327,7 @@ def plot_results(
Parameters
----------
model_name : str
model : str
Name of the model to plot. If None, plots best folds of each models.
If a model name is specified, plots all folds of that model.
plot_type : str, optional
@@ -319,9 +345,10 @@
self.cv_results,
self.X,
self.y,
model_name=model_name,
model_name=model,
n_cols=n_cols,
plot_type=plot_type,
figsize=figsize,
output_index=output_index,
param_search=self.param_search,
)
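
For orientation, the user-facing effect of the compare.py changes: when param_search=True, setup() now holds back a test set (a param_search_test_size fraction of the rows), hyperparameters are tuned on the training indices only, and compare() scores each tuned model once on the held-out rows instead of averaging over CV folds. A minimal sketch of calling this, assuming the class that exposes setup()/compare() is AutoEmulate and using made-up toy data (neither appears in this diff):

import numpy as np

from autoemulate.compare import AutoEmulate  # assumed name/location of the class edited above

# toy simulation inputs and outputs (placeholders)
X = np.random.uniform(0.0, 1.0, size=(200, 2))
y = np.sin(X[:, 0]) + np.cos(X[:, 1])

ae = AutoEmulate()
ae.setup(
    X,
    y,
    param_search=True,           # triggers split_data() inside setup()
    param_search_test_size=0.2,  # new argument added in this PR
)
ae.compare()        # tunes on train_idxs, then scores once on the held-out test set
ae.print_results()  # with param_search=True this prints "Test score for each model:"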
58 changes: 56 additions & 2 deletions autoemulate/cross_validate.py
@@ -1,12 +1,14 @@
import numpy as np
import pandas as pd
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_validate
from sklearn.model_selection import PredefinedSplit
from sklearn.model_selection import train_test_split

from autoemulate.utils import get_model_name


def run_cv(X, y, cv, model, metrics, n_jobs, logger):
# Get model name
model_name = get_model_name(model)

# The metrics we want to use for cross-validation
@@ -16,7 +18,6 @@ def run_cv(X, y, cv, model, metrics, n_jobs, logger):
logger.info(f"Parameters: {model.named_steps['model'].get_params()}")

try:
# Cross-validate
cv_results = cross_validate(
model,
X,
@@ -65,3 +66,56 @@ def update_scores_df(scores_df, model, cv_results):
"score": score,
}
return scores_df


def split_data(X, test_size=0.2, random_state=None, param_search=False):
"""Splits the data into training and testing sets.
Parameters
----------
X : array-like, shape (n_samples, n_features)
Simulation input.
test_size : float, default=0.2
Proportion of the dataset to include in the test split.
random_state : int, RandomState instance or None, default=None
Controls the shuffling applied to the data before applying the split.
param_search : bool
Whether to split the data for hyperparameter search.
Returns
-------
train_idx : array-like
Indices of the training set.
test_idx : array-like
Indices of the testing set.
"""

if param_search:
idxs = np.arange(X.shape[0])
train_idxs, test_idxs = train_test_split(
idxs, test_size=test_size, random_state=random_state
)
else:
train_idxs, test_idxs = None, None
return train_idxs, test_idxs


def single_split(X, test_idxs):
"""Create a single split for sklearn's `cross_validate` function.
Parameters
----------
X : array-like, shape (n_samples, n_features)
Simulation input.
test_idxs : array-like
Indices of the testing set.
Returns
-------
split_index : sklearn.model_selection.PredefinedSplit
An instance of the PredefinedSplit class.
"""
split_index = np.full(X.shape[0], -1)
split_index[test_idxs] = 0

return PredefinedSplit(test_fold=split_index)
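
The two helpers above are meant to compose with sklearn's cross_validate: split_data() draws the held-out indices, and single_split() wraps them in a PredefinedSplit in which rows marked -1 never appear in a test fold and rows marked 0 form the single test fold, so cross_validate runs exactly one train/test evaluation. A small self-contained sketch (the toy data and LinearRegression model are placeholders, not part of this PR):

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate

from autoemulate.cross_validate import single_split, split_data

# toy data: 50 samples, 3 features
X = np.random.rand(50, 3)
y = X @ np.array([1.0, -2.0, 0.5])

# hold out 20% of the row indices (param_search=True, otherwise both are None)
train_idxs, test_idxs = split_data(X, test_size=0.2, random_state=0, param_search=True)

# one "fold": train on the rows marked -1, score on the rows marked 0
cv = single_split(X, test_idxs)
results = cross_validate(LinearRegression(), X, y, cv=cv, scoring="r2")
print(results["test_score"])  # a single R^2 value, computed on the held-out 20%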
31 changes: 24 additions & 7 deletions autoemulate/plotting.py
@@ -46,6 +47,7 @@ def plot_single_fold(
plot_type="actual_vs_predicted",
annotation=" ",
output_index=0,
param_search=False,
):
"""Plots a single cv fold for a given model.
@@ -89,7 +90,8 @@
display = PredictionErrorDisplay.from_predictions(
y_true=true_values, y_pred=predicted_values, kind=plot_type, ax=ax
)
ax.set_title(f"{model_name} - {annotation}: {fold_index}")
title_suffix = "Test set" if param_search else f"{annotation}: {fold_index}"
ax.set_title(f"{model_name} - {title_suffix}")


def plot_best_fold_per_model(
Expand All @@ -100,6 +102,7 @@ def plot_best_fold_per_model(
plot_type="actual_vs_predicted",
figsize=None,
output_index=0,
param_search=False,
):
"""Plots results of the best (highest R^2) cv-fold for each model in cv_results.
@@ -130,8 +133,6 @@

plt.figure(figsize=figsize)

if n_models == 1:
axes = [axes]
for i, model_name in enumerate(cv_results):
best_fold_index = np.argmax(cv_results[model_name]["test_r2"])
ax = plt.subplot(n_rows, n_cols, i + 1)
@@ -145,6 +146,7 @@
plot_type=plot_type,
annotation="Best CV-fold",
output_index=output_index,
param_search=param_search,
)
plt.tight_layout()
plt.show()
@@ -159,6 +161,7 @@
plot_type="actual_vs_predicted",
figsize=None,
output_index=0,
param_search=False,
):
"""Plots all the folds for a given model.
@@ -182,6 +185,8 @@
Overrides the default figure size.
output_index : int, optional
The index of the output to plot. Default is 0.
param_search : bool, optional
Whether there was a hyperparameter search.
"""

n_folds = len(cv_results[model_name]["estimator"])
@@ -192,8 +197,6 @@

plt.figure(figsize=figsize)

if n_folds == 1:
axes = [axes]
for i in range(n_folds):
ax = plt.subplot(n_rows, n_cols, i + 1)
plot_single_fold(
@@ -206,6 +209,7 @@
plot_type,
annotation="CV-fold",
output_index=output_index,
param_search=param_search,
)
plt.tight_layout()
plt.show()
@@ -220,6 +224,7 @@
plot_type="actual_vs_predicted",
figsize=None,
output_index=0,
param_search=False,
):
"""Plots the results of cross-validation.
@@ -241,16 +246,28 @@
“residual_vs_predicted” draws the residuals, i.e. difference between observed and predicted values, (y-axis) vs. the predicted values (x-axis).
figsize : tuple, optional
Overrides the default figure size.
output_index : int, optional
For multi-output: Index of the output variable to plot.
param_search : bool, optional
Whether hyperparameter search was done.
"""

validate_inputs(cv_results, model_name)
check_multioutput(y, output_index)

if model_name:
plot_model_folds(
cv_results, X, y, model_name, n_cols, plot_type, figsize, output_index
cv_results,
X,
y,
model_name,
n_cols,
plot_type,
figsize,
output_index,
param_search,
)
else:
plot_best_fold_per_model(
cv_results, X, y, n_cols, plot_type, figsize, output_index
cv_results, X, y, n_cols, plot_type, figsize, output_index, param_search
)
27 changes: 18 additions & 9 deletions autoemulate/printing.py
@@ -1,8 +1,9 @@
from autoemulate.utils import get_mean_scores
from autoemulate.utils import get_model_name
from autoemulate.utils import get_model_scores


def print_cv_results(models, scores_df, model=None, sort_by="r2"):
def print_cv_results(models, scores_df, model=None, sort_by="r2", param_search=False):
"""Print cv results.
Parameters
@@ -26,12 +27,20 @@ def print_cv_results(models, scores_df, model=None, sort_by="r2"):
f"Model {model} not found. Available models are: {model_names}"
)
if model is None:
means = get_mean_scores(scores_df, metric=sort_by)
print("Average scores across all models:")
print(means)
if param_search:
means = get_mean_scores(scores_df, metric=sort_by)
print("Test score for each model:")
print(means)
else:
means = get_mean_scores(scores_df, metric=sort_by)
print("Average scores across all models:")
print(means)
else:
scores = scores_df[scores_df["model"] == model].pivot(
index="fold", columns="metric", values="score"
)
print(f"Scores for {model} across all folds:")
print(scores)
if param_search:
scores = get_model_scores(scores_df, model)
print(f"Test score for {model}:")
print(scores)
else:
scores = get_model_scores(scores_df, model)
print(f"Scores for {model} across all folds:")
print(scores)
8 changes: 8 additions & 0 deletions autoemulate/utils.py
@@ -308,6 +308,14 @@ def get_mean_scores(scores_df, metric):
return means_df


def get_model_scores(scores_df, model_name):
model_scores = scores_df[scores_df["model"] == model_name].pivot(
index="fold", columns="metric", values="score"
)

return model_scores


def set_random_seed(seed: int, deterministic: bool = False):
"""Set random seed for Python, Numpy and PyTorch.
Args:
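
The new get_model_scores() helper simply pivots the long-format scores table into one row per fold, mirroring what print_cv_results() previously did inline. A toy illustration (the model name and score values are made up; only the model/fold/metric/score columns are taken from the code above):

import pandas as pd

from autoemulate.utils import get_model_scores

scores_df = pd.DataFrame(
    {
        "model": ["GaussianProcess", "GaussianProcess"],
        "fold": [0, 0],
        "metric": ["r2", "rmse"],
        "score": [0.93, 0.11],
    }
)

print(get_model_scores(scores_df, "GaussianProcess"))
# metric    r2  rmse
# fold
# 0       0.93  0.11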