From e76fae522b09c5fb33a02fc2e03beb14005eadc0 Mon Sep 17 00:00:00 2001 From: mastoffel Date: Fri, 9 Feb 2024 15:10:25 +0000 Subject: [PATCH 1/8] add split --- autoemulate/compare.py | 72 +++++++++++++++++++++++++++++++++++------- 1 file changed, 60 insertions(+), 12 deletions(-) diff --git a/autoemulate/compare.py b/autoemulate/compare.py index cc26c428..d44dad2b 100644 --- a/autoemulate/compare.py +++ b/autoemulate/compare.py @@ -1,8 +1,11 @@ import matplotlib.pyplot as plt +import numpy as np import pandas as pd from sklearn.decomposition import PCA from sklearn.metrics import make_scorer from sklearn.model_selection import cross_validate +from sklearn.model_selection import PredefinedSplit +from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler from sklearn.utils.validation import check_X_y @@ -88,6 +91,9 @@ def setup( Whether to log to file. """ self.X, self.y = self._check_input(X, y) + self.train_idxs, self.test_idxs = self._split_data( + self.X, test_size=0.2, param_search=param_search + ) self.models = get_and_process_models( MODEL_REGISTRY, model_subset, @@ -130,6 +136,37 @@ def _check_input(self, X, y): y = y.astype("float32") # needed for pytorch models return X, y + def _split_data(self, X, test_size=0.2, random_state=None, param_search=False): + """Splits the data into training and testing sets. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + Simulation input. + test_size : float, default=0.2 + Proportion of the dataset to include in the test split. + random_state : int, RandomState instance or None, default=None + Controls the shuffling applied to the data before applying the split. + param_search : bool + Whether to split the data for hyperparameter search. + + Returns + ------- + train_idx : array-like + Indices of the training set. + test_idx : array-like + Indices of the testing set. 
+ """ + + if param_search: + idxs = np.arange(X.shape[0]) + train_idxs, test_idxs = train_test_split( + idxs, test_size=test_size, random_state=random_state + ) + else: + train_idxs, test_idxs = None, None + return train_idxs, test_idxs + def _get_metrics(self, METRIC_REGISTRY): """ Get metrics from REGISTRY @@ -187,8 +224,8 @@ def compare(self): # hyperparameter search if self.param_search: self.models[i] = optimize_params( - X=self.X, - y=self.y, + X=self.X[self.train_idxs], + y=self.y[self.train_idxs], cv=self.cv, model=self.models[i], search_type=self.search_type, @@ -197,16 +234,27 @@ def compare(self): n_jobs=self.n_jobs, logger=self.logger, ) - # run cross validation and store results - self.cv_results[get_model_name(self.models[i])] = run_cv( - X=self.X, - y=self.y, - cv=self.cv, - model=self.models[i], - metrics=self.metrics, - n_jobs=self.n_jobs, - logger=self.logger, - ) + # only predict on one test set + self.cv_results[get_model_name(self.models[i])] = run_cv( + X=self.X, + y=self.y, + cv=PredefinedSplit(test_fold=self.test_idxs), + model=self.models[i], + metrics=self.metrics, + n_jobs=self.n_jobs, + logger=self.logger, + ) + else: + # run cross validation and store results + self.cv_results[get_model_name(self.models[i])] = run_cv( + X=self.X, + y=self.y, + cv=self.cv, + model=self.models[i], + metrics=self.metrics, + n_jobs=self.n_jobs, + logger=self.logger, + ) # update scores dataframe self.scores_df = update_scores_df( self.scores_df, From 2fe14cee89c1c0953d06f91bb03f74fb1c5ef0c7 Mon Sep 17 00:00:00 2001 From: mastoffel Date: Fri, 9 Feb 2024 15:57:55 +0000 Subject: [PATCH 2/8] fix indexing --- autoemulate/compare.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/autoemulate/compare.py b/autoemulate/compare.py index d44dad2b..07cd4098 100644 --- a/autoemulate/compare.py +++ b/autoemulate/compare.py @@ -202,6 +202,12 @@ def _get_cv(self, CV_REGISTRY, fold_strategy, folds): """ return CV_REGISTRY[fold_strategy](folds=folds, shuffle=True) + def _custom_split(self, X, test_idxs): + split_index = np.full(X.shape[0], -1) + split_index[test_idxs] = 0 + + return PredefinedSplit(test_fold=split_index) + def compare(self): """Compares the emulator models on the data. self.setup() must be run first. 
@@ -238,12 +244,13 @@ def compare(self): self.cv_results[get_model_name(self.models[i])] = run_cv( X=self.X, y=self.y, - cv=PredefinedSplit(test_fold=self.test_idxs), + cv=self._custom_split(self.X, self.test_idxs), model=self.models[i], metrics=self.metrics, n_jobs=self.n_jobs, logger=self.logger, ) + print(f"cv output: {self.cv_results[get_model_name(self.models[i])]}") else: # run cross validation and store results self.cv_results[get_model_name(self.models[i])] = run_cv( From 16d5d606ba8a4988c9869b2946f09b5bdc505b12 Mon Sep 17 00:00:00 2001 From: mastoffel Date: Fri, 9 Feb 2024 16:59:17 +0000 Subject: [PATCH 3/8] refactor single split --- autoemulate/compare.py | 47 ++++------------------------ autoemulate/cross_validate.py | 58 +++++++++++++++++++++++++++++++++-- 2 files changed, 62 insertions(+), 43 deletions(-) diff --git a/autoemulate/compare.py b/autoemulate/compare.py index 07cd4098..b531d4cd 100644 --- a/autoemulate/compare.py +++ b/autoemulate/compare.py @@ -10,6 +10,8 @@ from sklearn.utils.validation import check_X_y from autoemulate.cross_validate import run_cv +from autoemulate.cross_validate import single_split +from autoemulate.cross_validate import split_data from autoemulate.cross_validate import update_scores_df from autoemulate.cv import CV_REGISTRY from autoemulate.emulators import MODEL_REGISTRY @@ -91,7 +93,7 @@ def setup( Whether to log to file. """ self.X, self.y = self._check_input(X, y) - self.train_idxs, self.test_idxs = self._split_data( + self.train_idxs, self.test_idxs = split_data( self.X, test_size=0.2, param_search=param_search ) self.models = get_and_process_models( @@ -136,37 +138,6 @@ def _check_input(self, X, y): y = y.astype("float32") # needed for pytorch models return X, y - def _split_data(self, X, test_size=0.2, random_state=None, param_search=False): - """Splits the data into training and testing sets. - - Parameters - ---------- - X : array-like, shape (n_samples, n_features) - Simulation input. - test_size : float, default=0.2 - Proportion of the dataset to include in the test split. - random_state : int, RandomState instance or None, default=None - Controls the shuffling applied to the data before applying the split. - param_search : bool - Whether to split the data for hyperparameter search. - - Returns - ------- - train_idx : array-like - Indices of the training set. - test_idx : array-like - Indices of the testing set. - """ - - if param_search: - idxs = np.arange(X.shape[0]) - train_idxs, test_idxs = train_test_split( - idxs, test_size=test_size, random_state=random_state - ) - else: - train_idxs, test_idxs = None, None - return train_idxs, test_idxs - def _get_metrics(self, METRIC_REGISTRY): """ Get metrics from REGISTRY @@ -202,12 +173,6 @@ def _get_cv(self, CV_REGISTRY, fold_strategy, folds): """ return CV_REGISTRY[fold_strategy](folds=folds, shuffle=True) - def _custom_split(self, X, test_idxs): - split_index = np.full(X.shape[0], -1) - split_index[test_idxs] = 0 - - return PredefinedSplit(test_fold=split_index) - def compare(self): """Compares the emulator models on the data. self.setup() must be run first. 
@@ -240,17 +205,17 @@ def compare(self): n_jobs=self.n_jobs, logger=self.logger, ) - # only predict on one test set + self.cv_results[get_model_name(self.models[i])] = run_cv( X=self.X, y=self.y, - cv=self._custom_split(self.X, self.test_idxs), + cv=single_split(self.X, self.test_idxs), # predict on test set model=self.models[i], metrics=self.metrics, n_jobs=self.n_jobs, logger=self.logger, ) - print(f"cv output: {self.cv_results[get_model_name(self.models[i])]}") + else: # run cross validation and store results self.cv_results[get_model_name(self.models[i])] = run_cv( diff --git a/autoemulate/cross_validate.py b/autoemulate/cross_validate.py index 1ff0d843..2022b0f0 100644 --- a/autoemulate/cross_validate.py +++ b/autoemulate/cross_validate.py @@ -1,12 +1,14 @@ +import numpy as np import pandas as pd from sklearn.metrics import make_scorer from sklearn.model_selection import cross_validate +from sklearn.model_selection import PredefinedSplit +from sklearn.model_selection import train_test_split from autoemulate.utils import get_model_name def run_cv(X, y, cv, model, metrics, n_jobs, logger): - # Get model name model_name = get_model_name(model) # The metrics we want to use for cross-validation @@ -16,7 +18,6 @@ def run_cv(X, y, cv, model, metrics, n_jobs, logger): logger.info(f"Parameters: {model.named_steps['model'].get_params()}") try: - # Cross-validate cv_results = cross_validate( model, X, @@ -65,3 +66,56 @@ def update_scores_df(scores_df, model, cv_results): "score": score, } return scores_df + + +def split_data(X, test_size=0.2, random_state=None, param_search=False): + """Splits the data into training and testing sets. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + Simulation input. + test_size : float, default=0.2 + Proportion of the dataset to include in the test split. + random_state : int, RandomState instance or None, default=None + Controls the shuffling applied to the data before applying the split. + param_search : bool + Whether to split the data for hyperparameter search. + + Returns + ------- + train_idx : array-like + Indices of the training set. + test_idx : array-like + Indices of the testing set. + """ + + if param_search: + idxs = np.arange(X.shape[0]) + train_idxs, test_idxs = train_test_split( + idxs, test_size=test_size, random_state=random_state + ) + else: + train_idxs, test_idxs = None, None + return train_idxs, test_idxs + + +def single_split(X, test_idxs): + """Create a single split for sklearn's `cross_validate` function. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + Simulation input. + test_idxs : array-like + Indices of the testing set. + + Returns + ------- + split_index : sklearn.model_selection.PredefinedSplit + An instance of the PredefinedSplit class. 
+ """ + split_index = np.full(X.shape[0], -1) + split_index[test_idxs] = 0 + + return PredefinedSplit(test_fold=split_index) From 5ec602f1c4d90787371113fd2c8d58d9f2d43973 Mon Sep 17 00:00:00 2001 From: mastoffel Date: Fri, 9 Feb 2024 17:26:28 +0000 Subject: [PATCH 4/8] change printing text --- autoemulate/compare.py | 11 +++++++++-- autoemulate/printing.py | 32 +++++++++++++++++++++++--------- 2 files changed, 32 insertions(+), 11 deletions(-) diff --git a/autoemulate/compare.py b/autoemulate/compare.py index b531d4cd..fc0542c1 100644 --- a/autoemulate/compare.py +++ b/autoemulate/compare.py @@ -40,6 +40,7 @@ def setup( param_search=False, param_search_type="random", param_search_iters=20, + param_search_test_size=0.2, scale=True, scaler=StandardScaler(), reduce_dim=False, @@ -94,7 +95,7 @@ def setup( """ self.X, self.y = self._check_input(X, y) self.train_idxs, self.test_idxs = split_data( - self.X, test_size=0.2, param_search=param_search + self.X, test_size=param_search_test_size, param_search=param_search ) self.models = get_and_process_models( MODEL_REGISTRY, @@ -306,7 +307,13 @@ def print_results(self, sort_by="r2", model=None): The name of the model to print. If None, the best fold from each model will be printed. If a model name is provided, the scores for that model across all folds will be printed. """ - print_cv_results(self.models, self.scores_df, model=model, sort_by=sort_by) + print_cv_results( + self.models, + self.scores_df, + model=model, + sort_by=sort_by, + param_search=self.param_search, + ) def plot_results( self, diff --git a/autoemulate/printing.py b/autoemulate/printing.py index c74e4da1..f801c994 100644 --- a/autoemulate/printing.py +++ b/autoemulate/printing.py @@ -2,7 +2,7 @@ from autoemulate.utils import get_model_name -def print_cv_results(models, scores_df, model=None, sort_by="r2"): +def print_cv_results(models, scores_df, model=None, sort_by="r2", param_search=False): """Print cv results. Parameters @@ -26,12 +26,26 @@ def print_cv_results(models, scores_df, model=None, sort_by="r2"): f"Model {model} not found. 
Available models are: {model_names}" ) if model is None: - means = get_mean_scores(scores_df, metric=sort_by) - print("Average scores across all models:") - print(means) + if param_search: + means = get_mean_scores(scores_df, metric=sort_by) + print("Test score for each model:") + print(means) + else: + means = get_mean_scores(scores_df, metric=sort_by) + print("Average scores across all models:") + print(means) else: - scores = scores_df[scores_df["model"] == model].pivot( - index="fold", columns="metric", values="score" - ) - print(f"Scores for {model} across all folds:") - print(scores) + if param_search: + scores = scores_df[scores_df["model"] == model].pivot( + index="fold", columns="metric", values="score" + ) + # drop metric column + # get index of scores + print(f"Test score for {model}:") + print(scores) + else: + scores = scores_df[scores_df["model"] == model].pivot( + index="fold", columns="metric", values="score" + ) + print(f"Scores for {model} across all folds:") + print(scores) From 4b14b75ebde1727b7e907048e15a42c2eb333911 Mon Sep 17 00:00:00 2001 From: mastoffel Date: Mon, 12 Feb 2024 08:47:09 +0000 Subject: [PATCH 5/8] modify printing to fit the new hold-out set strategy --- autoemulate/compare.py | 6 +++--- autoemulate/printing.py | 11 +++-------- autoemulate/utils.py | 8 ++++++++ 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/autoemulate/compare.py b/autoemulate/compare.py index fc0542c1..f5fcc6a5 100644 --- a/autoemulate/compare.py +++ b/autoemulate/compare.py @@ -296,16 +296,16 @@ def load_model(self, filepath): serialiser = ModelSerialiser() return serialiser.load_model(filepath) - def print_results(self, sort_by="r2", model=None): + def print_results(self, model=None, sort_by="r2"): """Print cv results. Parameters ---------- - sort_by : str, optional - The metric to sort by. Default is "r2", can also be "rmse". model : str, optional The name of the model to print. If None, the best fold from each model will be printed. If a model name is provided, the scores for that model across all folds will be printed. + sort_by : str, optional + The metric to sort by. Default is "r2", can also be "rmse". 
""" print_cv_results( self.models, diff --git a/autoemulate/printing.py b/autoemulate/printing.py index f801c994..a05eaeb0 100644 --- a/autoemulate/printing.py +++ b/autoemulate/printing.py @@ -1,5 +1,6 @@ from autoemulate.utils import get_mean_scores from autoemulate.utils import get_model_name +from autoemulate.utils import get_model_scores def print_cv_results(models, scores_df, model=None, sort_by="r2", param_search=False): @@ -36,16 +37,10 @@ def print_cv_results(models, scores_df, model=None, sort_by="r2", param_search=F print(means) else: if param_search: - scores = scores_df[scores_df["model"] == model].pivot( - index="fold", columns="metric", values="score" - ) - # drop metric column - # get index of scores + scores = get_model_scores(scores_df, model) print(f"Test score for {model}:") print(scores) else: - scores = scores_df[scores_df["model"] == model].pivot( - index="fold", columns="metric", values="score" - ) + scores = get_model_scores(scores_df, model) print(f"Scores for {model} across all folds:") print(scores) diff --git a/autoemulate/utils.py b/autoemulate/utils.py index 948d5a2a..190c1454 100644 --- a/autoemulate/utils.py +++ b/autoemulate/utils.py @@ -308,6 +308,14 @@ def get_mean_scores(scores_df, metric): return means_df +def get_model_scores(scores_df, model_name): + model_scores = scores_df[scores_df["model"] == model_name].pivot( + index="fold", columns="metric", values="score" + ) + + return model_scores + + def set_random_seed(seed: int, deterministic: bool = False): """Set random seed for Python, Numpy and PyTorch. Args: From 44d81431238c4708510bf08794555f3e03ad147f Mon Sep 17 00:00:00 2001 From: mastoffel Date: Mon, 12 Feb 2024 09:12:59 +0000 Subject: [PATCH 6/8] change plotting text for test set plots --- autoemulate/compare.py | 8 ++++---- autoemulate/plotting.py | 27 ++++++++++++++++++++++++--- 2 files changed, 28 insertions(+), 7 deletions(-) diff --git a/autoemulate/compare.py b/autoemulate/compare.py index f5fcc6a5..d0977b60 100644 --- a/autoemulate/compare.py +++ b/autoemulate/compare.py @@ -317,7 +317,7 @@ def print_results(self, model=None, sort_by="r2"): def plot_results( self, - model_name=None, + model=None, plot_type="actual_vs_predicted", n_cols=3, figsize=None, @@ -327,8 +327,7 @@ def plot_results( Parameters ---------- - model_name : str - + model : str Name of the model to plot. If None, plots best folds of each models. If a model name is specified, plots all folds of that model. plot_type : str, optional @@ -346,9 +345,10 @@ def plot_results( self.cv_results, self.X, self.y, - model_name=model_name, + model_name=model, n_cols=n_cols, plot_type=plot_type, figsize=figsize, output_index=output_index, + param_search=self.param_search, ) diff --git a/autoemulate/plotting.py b/autoemulate/plotting.py index 2984126c..ed54f993 100644 --- a/autoemulate/plotting.py +++ b/autoemulate/plotting.py @@ -46,6 +46,7 @@ def plot_single_fold( plot_type="actual_vs_predicted", annotation=" ", output_index=0, + param_search=False, ): """Plots a single cv fold for a given model. 
@@ -89,7 +90,8 @@ def plot_single_fold( display = PredictionErrorDisplay.from_predictions( y_true=true_values, y_pred=predicted_values, kind=plot_type, ax=ax ) - ax.set_title(f"{model_name} - {annotation}: {fold_index}") + title_suffix = "Test set" if param_search else f"{annotation}: {fold_index}" + ax.set_title(f"{model_name} - {title_suffix}") def plot_best_fold_per_model( @@ -100,6 +102,7 @@ def plot_best_fold_per_model( plot_type="actual_vs_predicted", figsize=None, output_index=0, + param_search=False, ): """Plots results of the best (highest R^2) cv-fold for each model in cv_results. @@ -145,6 +148,7 @@ def plot_best_fold_per_model( plot_type=plot_type, annotation="Best CV-fold", output_index=output_index, + param_search=param_search, ) plt.tight_layout() plt.show() @@ -159,6 +163,7 @@ def plot_model_folds( plot_type="actual_vs_predicted", figsize=None, output_index=0, + param_search=False, ): """Plots all the folds for a given model. @@ -182,6 +187,8 @@ def plot_model_folds( Overrides the default figure size. output_index : int, optional The index of the output to plot. Default is 0. + param_search : bool, optional + Whether there was a hyperparameter search. """ n_folds = len(cv_results[model_name]["estimator"]) @@ -206,6 +213,7 @@ def plot_model_folds( plot_type, annotation="CV-fold", output_index=output_index, + param_search=param_search, ) plt.tight_layout() plt.show() @@ -220,6 +228,7 @@ def plot_results( plot_type="actual_vs_predicted", figsize=None, output_index=0, + param_search=False, ): """Plots the results of cross-validation. @@ -241,6 +250,10 @@ def plot_results( “residual_vs_predicted” draws the residuals, i.e. difference between observed and predicted values, (y-axis) vs. the predicted values (x-axis). figsize : tuple, optional Overrides the default figure size. + output_index : int, optional + For multi-output: Index of the output variable to plot. + param_search : bool, optional + Whether hyperparameter search was done. 
""" validate_inputs(cv_results, model_name) @@ -248,9 +261,17 @@ def plot_results( if model_name: plot_model_folds( - cv_results, X, y, model_name, n_cols, plot_type, figsize, output_index + cv_results, + X, + y, + model_name, + n_cols, + plot_type, + figsize, + output_index, + param_search, ) else: plot_best_fold_per_model( - cv_results, X, y, n_cols, plot_type, figsize, output_index + cv_results, X, y, n_cols, plot_type, figsize, output_index, param_search ) From d97a25596118c7e4e3c526174946968203cfb95e Mon Sep 17 00:00:00 2001 From: mastoffel Date: Mon, 12 Feb 2024 09:19:09 +0000 Subject: [PATCH 7/8] cleanup --- autoemulate/plotting.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/autoemulate/plotting.py b/autoemulate/plotting.py index ed54f993..26001a44 100644 --- a/autoemulate/plotting.py +++ b/autoemulate/plotting.py @@ -133,8 +133,6 @@ def plot_best_fold_per_model( plt.figure(figsize=figsize) - if n_models == 1: - axes = [axes] for i, model_name in enumerate(cv_results): best_fold_index = np.argmax(cv_results[model_name]["test_r2"]) ax = plt.subplot(n_rows, n_cols, i + 1) @@ -199,8 +197,6 @@ def plot_model_folds( plt.figure(figsize=figsize) - if n_folds == 1: - axes = [axes] for i in range(n_folds): ax = plt.subplot(n_rows, n_cols, i + 1) plot_single_fold( From c36ecc8c24d1bffee81add5ca42d99bd230058a2 Mon Sep 17 00:00:00 2001 From: mastoffel Date: Mon, 12 Feb 2024 09:35:53 +0000 Subject: [PATCH 8/8] add tests for hold-out set splitting --- tests/test_cross_validate.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tests/test_cross_validate.py b/tests/test_cross_validate.py index dd23d7a7..041330d3 100644 --- a/tests/test_cross_validate.py +++ b/tests/test_cross_validate.py @@ -6,15 +6,20 @@ import pytest from sklearn.datasets import make_regression from sklearn.model_selection import KFold +from sklearn.model_selection import PredefinedSplit +from sklearn.model_selection import train_test_split from sklearn.pipeline import Pipeline from sklearn.preprocessing import StandardScaler from autoemulate.compare import AutoEmulate from autoemulate.cross_validate import run_cv +from autoemulate.cross_validate import single_split +from autoemulate.cross_validate import split_data from autoemulate.cross_validate import update_scores_df from autoemulate.emulators import RandomForest from autoemulate.metrics import METRIC_REGISTRY + # import make_regression_data X, y = make_regression(n_samples=20, n_features=2, random_state=0) @@ -53,3 +58,26 @@ def test_update_scores_df(cv_results): assert scores_df_new.shape[0] == 10 assert scores_df_new.shape[1] == 4 assert scores_df_new["model"][0] == "RandomForest" + + +def test_single_split(): + X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]]) + test_idxs = [1, 3] + split_index = single_split(X, test_idxs) + + assert isinstance(split_index, PredefinedSplit) + assert np.array_equal(split_index.test_fold, [-1, 0, -1, 0]) + + +def test_split_data(): + X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]]) + test_size = 0.2 + random_state = 42 + param_search = True + + train_idxs, test_idxs = split_data(X, test_size, random_state, param_search) + + assert isinstance(train_idxs, np.ndarray) + assert isinstance(test_idxs, np.ndarray) + assert len(train_idxs) == 3 + assert len(test_idxs) == 1