Merge pull request #155 from alan-turing-institute/test-set-split
Test set split
mastoffel authored Feb 12, 2024
2 parents a70aa6e + c36ecc8 commit 0139be9
Showing 6 changed files with 181 additions and 38 deletions.
67 changes: 47 additions & 20 deletions autoemulate/compare.py
@@ -1,12 +1,17 @@
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_validate
from sklearn.model_selection import PredefinedSplit
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils.validation import check_X_y

from autoemulate.cross_validate import run_cv
from autoemulate.cross_validate import single_split
from autoemulate.cross_validate import split_data
from autoemulate.cross_validate import update_scores_df
from autoemulate.cv import CV_REGISTRY
from autoemulate.emulators import MODEL_REGISTRY
@@ -35,6 +40,7 @@ def setup(
param_search=False,
param_search_type="random",
param_search_iters=20,
param_search_test_size=0.2,
scale=True,
scaler=StandardScaler(),
reduce_dim=False,
@@ -88,6 +94,9 @@ def setup(
Whether to log to file.
"""
self.X, self.y = self._check_input(X, y)
self.train_idxs, self.test_idxs = split_data(
self.X, test_size=param_search_test_size, param_search=param_search
)
self.models = get_and_process_models(
MODEL_REGISTRY,
model_subset,
@@ -187,8 +196,8 @@ def compare(self):
# hyperparameter search
if self.param_search:
self.models[i] = optimize_params(
X=self.X,
y=self.y,
X=self.X[self.train_idxs],
y=self.y[self.train_idxs],
cv=self.cv,
model=self.models[i],
search_type=self.search_type,
@@ -197,16 +206,28 @@ def compare(self):
n_jobs=self.n_jobs,
logger=self.logger,
)
# run cross validation and store results
self.cv_results[get_model_name(self.models[i])] = run_cv(
X=self.X,
y=self.y,
cv=self.cv,
model=self.models[i],
metrics=self.metrics,
n_jobs=self.n_jobs,
logger=self.logger,
)

self.cv_results[get_model_name(self.models[i])] = run_cv(
X=self.X,
y=self.y,
cv=single_split(self.X, self.test_idxs), # predict on test set
model=self.models[i],
metrics=self.metrics,
n_jobs=self.n_jobs,
logger=self.logger,
)

else:
# run cross validation and store results
self.cv_results[get_model_name(self.models[i])] = run_cv(
X=self.X,
y=self.y,
cv=self.cv,
model=self.models[i],
metrics=self.metrics,
n_jobs=self.n_jobs,
logger=self.logger,
)
# update scores dataframe
self.scores_df = update_scores_df(
self.scores_df,
@@ -275,22 +296,28 @@ def load_model(self, filepath):
serialiser = ModelSerialiser()
return serialiser.load_model(filepath)

def print_results(self, sort_by="r2", model=None):
def print_results(self, model=None, sort_by="r2"):
"""Print cv results.
Parameters
----------
sort_by : str, optional
The metric to sort by. Default is "r2", can also be "rmse".
model : str, optional
The name of the model to print. If None, the best fold from each model will be printed.
If a model name is provided, the scores for that model across all folds will be printed.
sort_by : str, optional
The metric to sort by. Default is "r2", can also be "rmse".
"""
print_cv_results(self.models, self.scores_df, model=model, sort_by=sort_by)
print_cv_results(
self.models,
self.scores_df,
model=model,
sort_by=sort_by,
param_search=self.param_search,
)

def plot_results(
self,
model_name=None,
model=None,
plot_type="actual_vs_predicted",
n_cols=3,
figsize=None,
@@ -300,8 +327,7 @@ def plot_results(
Parameters
----------
model_name : str
model : str
Name of the model to plot. If None, plots best folds of each models.
If a model name is specified, plots all folds of that model.
plot_type : str, optional
@@ -319,9 +345,10 @@
self.cv_results,
self.X,
self.y,
model_name=model_name,
model_name=model,
n_cols=n_cols,
plot_type=plot_type,
figsize=figsize,
output_index=output_index,
param_search=self.param_search,
)
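
For orientation, the user-facing effect of the compare.py changes: when param_search=True, setup() now holds back a test set (a param_search_test_size fraction of the rows), hyperparameters are tuned on the training indices only, and compare() scores each tuned model once on the held-out rows instead of averaging over CV folds. A minimal sketch of calling this, assuming the class that exposes setup()/compare() is AutoEmulate and using made-up toy data (neither appears in this diff):

import numpy as np

from autoemulate.compare import AutoEmulate  # assumed name/location of the class edited above

# toy simulation inputs and outputs (placeholders)
X = np.random.uniform(0.0, 1.0, size=(200, 2))
y = np.sin(X[:, 0]) + np.cos(X[:, 1])

ae = AutoEmulate()
ae.setup(
    X,
    y,
    param_search=True,           # triggers split_data() inside setup()
    param_search_test_size=0.2,  # new argument added in this PR
)
ae.compare()        # tunes on train_idxs, then scores once on the held-out test set
ae.print_results()  # with param_search=True this prints "Test score for each model:"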
58 changes: 56 additions & 2 deletions autoemulate/cross_validate.py
@@ -1,12 +1,14 @@
import numpy as np
import pandas as pd
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_validate
from sklearn.model_selection import PredefinedSplit
from sklearn.model_selection import train_test_split

from autoemulate.utils import get_model_name


def run_cv(X, y, cv, model, metrics, n_jobs, logger):
# Get model name
model_name = get_model_name(model)

# The metrics we want to use for cross-validation
@@ -16,7 +18,6 @@ def run_cv(X, y, cv, model, metrics, n_jobs, logger):
logger.info(f"Parameters: {model.named_steps['model'].get_params()}")

try:
# Cross-validate
cv_results = cross_validate(
model,
X,
@@ -65,3 +66,56 @@ def update_scores_df(scores_df, model, cv_results):
"score": score,
}
return scores_df


def split_data(X, test_size=0.2, random_state=None, param_search=False):
"""Splits the data into training and testing sets.
Parameters
----------
X : array-like, shape (n_samples, n_features)
Simulation input.
test_size : float, default=0.2
Proportion of the dataset to include in the test split.
random_state : int, RandomState instance or None, default=None
Controls the shuffling applied to the data before applying the split.
param_search : bool
Whether to split the data for hyperparameter search.
Returns
-------
train_idx : array-like
Indices of the training set.
test_idx : array-like
Indices of the testing set.
"""

if param_search:
idxs = np.arange(X.shape[0])
train_idxs, test_idxs = train_test_split(
idxs, test_size=test_size, random_state=random_state
)
else:
train_idxs, test_idxs = None, None
return train_idxs, test_idxs


def single_split(X, test_idxs):
"""Create a single split for sklearn's `cross_validate` function.
Parameters
----------
X : array-like, shape (n_samples, n_features)
Simulation input.
test_idxs : array-like
Indices of the testing set.
Returns
-------
split_index : sklearn.model_selection.PredefinedSplit
An instance of the PredefinedSplit class.
"""
split_index = np.full(X.shape[0], -1)
split_index[test_idxs] = 0

return PredefinedSplit(test_fold=split_index)
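
The two helpers above are meant to compose with sklearn's cross_validate: split_data() draws the held-out indices, and single_split() wraps them in a PredefinedSplit in which rows marked -1 never appear in a test fold and rows marked 0 form the single test fold, so cross_validate runs exactly one train/test evaluation. A small self-contained sketch (the toy data and LinearRegression model are placeholders, not part of this PR):

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate

from autoemulate.cross_validate import single_split, split_data

# toy data: 50 samples, 3 features
X = np.random.rand(50, 3)
y = X @ np.array([1.0, -2.0, 0.5])

# hold out 20% of the row indices (param_search=True, otherwise both are None)
train_idxs, test_idxs = split_data(X, test_size=0.2, random_state=0, param_search=True)

# one "fold": train on the rows marked -1, score on the rows marked 0
cv = single_split(X, test_idxs)
results = cross_validate(LinearRegression(), X, y, cv=cv, scoring="r2")
print(results["test_score"])  # a single R^2 value, computed on the held-out 20%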
31 changes: 24 additions & 7 deletions autoemulate/plotting.py
@@ -46,6 +47,7 @@ def plot_single_fold(
plot_type="actual_vs_predicted",
annotation=" ",
output_index=0,
param_search=False,
):
"""Plots a single cv fold for a given model.
@@ -89,7 +90,8 @@
display = PredictionErrorDisplay.from_predictions(
y_true=true_values, y_pred=predicted_values, kind=plot_type, ax=ax
)
ax.set_title(f"{model_name} - {annotation}: {fold_index}")
title_suffix = "Test set" if param_search else f"{annotation}: {fold_index}"
ax.set_title(f"{model_name} - {title_suffix}")


def plot_best_fold_per_model(
Expand All @@ -100,6 +102,7 @@ def plot_best_fold_per_model(
plot_type="actual_vs_predicted",
figsize=None,
output_index=0,
param_search=False,
):
"""Plots results of the best (highest R^2) cv-fold for each model in cv_results.
@@ -130,8 +133,6 @@

plt.figure(figsize=figsize)

if n_models == 1:
axes = [axes]
for i, model_name in enumerate(cv_results):
best_fold_index = np.argmax(cv_results[model_name]["test_r2"])
ax = plt.subplot(n_rows, n_cols, i + 1)
@@ -145,6 +146,7 @@
plot_type=plot_type,
annotation="Best CV-fold",
output_index=output_index,
param_search=param_search,
)
plt.tight_layout()
plt.show()
@@ -159,6 +161,7 @@
plot_type="actual_vs_predicted",
figsize=None,
output_index=0,
param_search=False,
):
"""Plots all the folds for a given model.
@@ -182,6 +185,8 @@
Overrides the default figure size.
output_index : int, optional
The index of the output to plot. Default is 0.
param_search : bool, optional
Whether there was a hyperparameter search.
"""

n_folds = len(cv_results[model_name]["estimator"])
@@ -192,8 +197,6 @@

plt.figure(figsize=figsize)

if n_folds == 1:
axes = [axes]
for i in range(n_folds):
ax = plt.subplot(n_rows, n_cols, i + 1)
plot_single_fold(
@@ -206,6 +209,7 @@
plot_type,
annotation="CV-fold",
output_index=output_index,
param_search=param_search,
)
plt.tight_layout()
plt.show()
@@ -220,6 +224,7 @@
plot_type="actual_vs_predicted",
figsize=None,
output_index=0,
param_search=False,
):
"""Plots the results of cross-validation.
@@ -241,16 +246,28 @@
“residual_vs_predicted” draws the residuals, i.e. difference between observed and predicted values, (y-axis) vs. the predicted values (x-axis).
figsize : tuple, optional
Overrides the default figure size.
output_index : int, optional
For multi-output: Index of the output variable to plot.
param_search : bool, optional
Whether hyperparameter search was done.
"""

validate_inputs(cv_results, model_name)
check_multioutput(y, output_index)

if model_name:
plot_model_folds(
cv_results, X, y, model_name, n_cols, plot_type, figsize, output_index
cv_results,
X,
y,
model_name,
n_cols,
plot_type,
figsize,
output_index,
param_search,
)
else:
plot_best_fold_per_model(
cv_results, X, y, n_cols, plot_type, figsize, output_index
cv_results, X, y, n_cols, plot_type, figsize, output_index, param_search
)
27 changes: 18 additions & 9 deletions autoemulate/printing.py
@@ -1,8 +1,9 @@
from autoemulate.utils import get_mean_scores
from autoemulate.utils import get_model_name
from autoemulate.utils import get_model_scores


def print_cv_results(models, scores_df, model=None, sort_by="r2"):
def print_cv_results(models, scores_df, model=None, sort_by="r2", param_search=False):
"""Print cv results.
Parameters
@@ -26,12 +27,20 @@ def print_cv_results(models, scores_df, model=None, sort_by="r2"):
f"Model {model} not found. Available models are: {model_names}"
)
if model is None:
means = get_mean_scores(scores_df, metric=sort_by)
print("Average scores across all models:")
print(means)
if param_search:
means = get_mean_scores(scores_df, metric=sort_by)
print("Test score for each model:")
print(means)
else:
means = get_mean_scores(scores_df, metric=sort_by)
print("Average scores across all models:")
print(means)
else:
scores = scores_df[scores_df["model"] == model].pivot(
index="fold", columns="metric", values="score"
)
print(f"Scores for {model} across all folds:")
print(scores)
if param_search:
scores = get_model_scores(scores_df, model)
print(f"Test score for {model}:")
print(scores)
else:
scores = get_model_scores(scores_df, model)
print(f"Scores for {model} across all folds:")
print(scores)
8 changes: 8 additions & 0 deletions autoemulate/utils.py
@@ -308,6 +308,14 @@ def get_mean_scores(scores_df, metric):
return means_df


def get_model_scores(scores_df, model_name):
model_scores = scores_df[scores_df["model"] == model_name].pivot(
index="fold", columns="metric", values="score"
)

return model_scores


def set_random_seed(seed: int, deterministic: bool = False):
"""Set random seed for Python, Numpy and PyTorch.
Args:
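
The new get_model_scores() helper simply pivots the long-format scores table into one row per fold, mirroring what print_cv_results() previously did inline. A toy illustration (the model name and score values are made up; only the model/fold/metric/score columns are taken from the code above):

import pandas as pd

from autoemulate.utils import get_model_scores

scores_df = pd.DataFrame(
    {
        "model": ["GaussianProcess", "GaussianProcess"],
        "fold": [0, 0],
        "metric": ["r2", "rmse"],
        "score": [0.93, 0.11],
    }
)

print(get_model_scores(scores_df, "GaussianProcess"))
# metric    r2  rmse
# fold
# 0       0.93  0.11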