Skip to content

Commit

Permalink
Merge pull request #202 from alan-turing-institute/fold-strategy
Browse files Browse the repository at this point in the history
Fold strategy
  • Loading branch information
mastoffel authored Mar 7, 2024
2 parents 0f906d9 + ce04985 commit 07fe895
Show file tree
Hide file tree
Showing 5 changed files with 74 additions and 59 deletions.
38 changes: 8 additions & 30 deletions autoemulate/compare.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from sklearn.decomposition import PCA
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import PredefinedSplit
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
Expand All @@ -14,7 +15,6 @@

from autoemulate.cross_validate import _run_cv
from autoemulate.cross_validate import _update_scores_df
from autoemulate.cv import CV_REGISTRY
from autoemulate.data_splitting import _split_data
from autoemulate.emulators import MODEL_REGISTRY
from autoemulate.hyperparam_searching import _optimize_params
Expand Down Expand Up @@ -55,8 +55,7 @@ def setup(
scaler=StandardScaler(),
reduce_dim=False,
dim_reducer=PCA(),
fold_strategy="kfold",
folds=5,
cross_validator=KFold(n_splits=5, shuffle=True),
n_jobs=None,
model_subset=None,
log_to_file=False,
Expand Down Expand Up @@ -90,10 +89,9 @@ def setup(
explain 95% of the variance. Other methods can have slightly different n_components
parameter inputs, see the sklearn documentation for more details. Dimension reduction
is always performed after scaling.
fold_strategy : str
Cross-validation strategy, currently either "kfold" or "stratified_kfold".
folds : int
Number of folds.
cross_validator : sklearn.model_selection object
Cross-validation strategy to use. Defaults to KFold with 5 splits and shuffle=True.
Can be any object in `sklearn.model_selection` that generates train/test indices.
n_jobs : int
Number of jobs to run in parallel. `None` means 1, `-1` means using all processors.
model_subset : list
Expand All @@ -117,7 +115,7 @@ def setup(
dim_reducer,
)
self.metrics = self._get_metrics(METRIC_REGISTRY)
self.cv = self._get_cv(CV_REGISTRY, fold_strategy, folds)
self.cross_validator = cross_validator
self.param_search = param_search
self.search_type = param_search_type
self.param_search_iters = param_search_iters
Expand All @@ -128,7 +126,6 @@ def setup(
self.is_set_up = True
self.dim_reducer = dim_reducer
self.reduce_dim = reduce_dim
self.folds = folds
self.cv_results = {}

self.print_setup()
Expand Down Expand Up @@ -170,25 +167,6 @@ def _get_metrics(self, METRIC_REGISTRY):
"""
return [metric for metric in METRIC_REGISTRY.values()]

def _get_cv(self, CV_REGISTRY, fold_strategy, folds):
"""Get cross-validation strategy from REGISTRY
Parameters
----------
CV_REGISTRY : dict
Registry of cross-validation strategies.
fold_strategy : str
Name of the cross-validation strategy. Currently only "kfold" is supported.
folds : int
Number of folds.
Returns
-------
cv : sklearn.model_selection.KFold
An instance of the KFold class.
"""
return CV_REGISTRY[fold_strategy](folds=folds, shuffle=True)

def compare(self):
"""Compares the emulator models on the data. self.setup() must be run first.
Expand Down Expand Up @@ -220,7 +198,7 @@ def compare(self):
self.models[i] = _optimize_params(
X=self.X[self.train_idxs],
y=self.y[self.train_idxs],
cv=self.cv,
cv=self.cross_validator,
model=self.models[i],
search_type=self.search_type,
niter=self.param_search_iters,
Expand All @@ -233,7 +211,7 @@ def compare(self):
fitted_model, cv_results = _run_cv(
X=self.X[self.train_idxs],
y=self.y[self.train_idxs],
cv=self.cv,
cv=self.cross_validator,
model=self.models[i],
metrics=self.metrics,
n_jobs=self.n_jobs,
Expand Down
23 changes: 0 additions & 23 deletions autoemulate/cv.py

This file was deleted.

12 changes: 7 additions & 5 deletions autoemulate/printing.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,23 +97,25 @@ def _print_setup(cls):
if cls.dim_reducer is not None
else "None"
),
str(cls.cv.__class__.__name__ if cls.cv is not None else "None"),
str(cls.folds),
str(
cls.cross_validator.__class__.__name__
if cls.cross_validator is not None
else "None"
),
str(cls.n_jobs if cls.n_jobs is not None else "1"),
],
index=[
"Simulation input shape (X)",
"Simulation output shape (y)",
"# test set samples (test_set_size)",
"# hold-out set samples (test_set_size)",
"Do hyperparameter search (param_search)",
"Type of hyperparameter search (search_type)",
"# sampled parameter settings (param_search_iters)",
"Scale data before fitting (scale)",
"Scaler (scaler)",
"Dimensionality reduction before fitting (reduce_dim)",
"Dimensionality reduction method (dim_reducer)",
"Cross-validation strategy (fold_strategy)",
"# folds (folds)",
"Cross-validation strategy (cross_validator)",
"# parallel jobs (n_jobs)",
],
columns=["Values"],
Expand Down
1 change: 0 additions & 1 deletion tests/test_compare.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
from sklearn.preprocessing import StandardScaler

from autoemulate.compare import AutoEmulate
from autoemulate.cv import CV_REGISTRY
from autoemulate.emulators import GaussianProcessMOGP
from autoemulate.emulators import MODEL_REGISTRY
from autoemulate.emulators import RandomForest
Expand Down
59 changes: 59 additions & 0 deletions tests/test_ui.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import numpy as np
from sklearn.decomposition import KernelPCA
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler

from autoemulate.compare import AutoEmulate

# Restrict every test in this module to two fast-fitting models so the
# setup/compare round trips stay quick.
model_subset = ["SecondOrderPolynomial", "RadialBasisFunctions"]


def test_scalers():
    """Run setup/compare/print_results with several non-default scalers."""
    # Local seeded generator: the input data is identical on every run, so a
    # failure can be reproduced without touching NumPy's global RNG state.
    rng = np.random.default_rng(0)
    X = rng.random((100, 5))
    y = rng.random((100, 1))

    for scaler in (MinMaxScaler(), RobustScaler()):
        ae = AutoEmulate()
        ae.setup(X, y, scaler=scaler, model_subset=model_subset)
        ae.compare()
        ae.print_results()

        # Checked per scaler, with the scaler named in the message, so a
        # failure points at the configuration that broke.
        assert ae.best_model is not None, f"no best model for {scaler!r}"


def test_dimension_reducers():
    """Run setup/compare/print_results with several dimension reducers."""
    # Local seeded generator: the input data is identical on every run, so a
    # failure can be reproduced without touching NumPy's global RNG state.
    rng = np.random.default_rng(0)
    X = rng.random((100, 10))
    y = rng.random((100, 1))

    for dim_reducer in (PCA(n_components=5), KernelPCA(n_components=5)):
        ae = AutoEmulate()
        ae.setup(
            X,
            y,
            reduce_dim=True,
            dim_reducer=dim_reducer,
            model_subset=model_subset,
        )
        ae.compare()
        ae.print_results()

        # Checked per reducer, with the reducer named in the message, so a
        # failure points at the configuration that broke.
        assert ae.best_model is not None, f"no best model for {dim_reducer!r}"


def test_cross_validators():
    """Run setup/compare/print_results with several cross-validators."""
    # Local seeded generator: the input data is identical on every run, so a
    # failure can be reproduced without touching NumPy's global RNG state.
    rng = np.random.default_rng(0)
    X = rng.random((100, 5))
    y = rng.random((100, 1))

    for cross_validator in (KFold(n_splits=5), TimeSeriesSplit(n_splits=5)):
        ae = AutoEmulate()
        ae.setup(X, y, cross_validator=cross_validator, model_subset=model_subset)
        ae.compare()
        ae.print_results()

        # Checked per splitter, with the splitter named in the message, so a
        # failure points at the configuration that broke.
        assert (
            ae.best_model is not None
        ), f"no best model for {cross_validator!r}"

0 comments on commit 07fe895

Please sign in to comment.