
Commit

refactor cv and update scores df into separate functions

switch arguments

add tests for cv
mastoffel committed Jan 31, 2024
1 parent dfeb81f commit cbb966c
Showing 4 changed files with 142 additions and 88 deletions.
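At a glance, the commit moves the cross-validation logic out of the AutoEmulate class and into two module-level helpers. A minimal orientation sketch of their interfaces follows; the bodies are elided here, and the committed definitions are in autoemulate/cross_validate.py below.

# Interfaces of the two helpers introduced by this commit (orientation only;
# see autoemulate/cross_validate.py below for the committed definitions).
def run_cv(X, y, cv, model, metrics, n_jobs, logger):
    """Cross-validate an sklearn pipeline `model` on (X, y) and return the cv_results dict."""


def update_scores_df(scores_df, model, cv_results):
    """Append one row per metric and fold from cv_results to scores_df and return it."""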
105 changes: 19 additions & 86 deletions autoemulate/compare.py
@@ -6,6 +6,8 @@
from sklearn.preprocessing import StandardScaler
from sklearn.utils.validation import check_X_y

from autoemulate.cross_validate import run_cv
from autoemulate.cross_validate import update_scores_df
from autoemulate.cv import CV_REGISTRY
from autoemulate.emulators import MODEL_REGISTRY
from autoemulate.hyperparam_searching import optimize_params
@@ -181,6 +183,7 @@ def compare(self):
        )

        for i in range(len(self.models)):
            # hyperparameter search
            if self.use_grid_search:
                self.models[i] = optimize_params(
                    X=self.X,
@@ -193,96 +196,26 @@
                    n_jobs=self.n_jobs,
                    logger=self.logger,
                )

Removed (the cross-validation logic previously defined on the class):

            self._cross_validate(self.models[i])

        # returns best model fitted on full data
        self.best_model = self._get_best_model(metric="r2")
        return self.best_model

    def _cross_validate(self, model):
        """Perform cross-validation on a given model using the specified metrics.
        Parameters
        ----------
        model: A scikit-learn estimator object.
        Class attributes used
        ---------------------
        self.X : array-like, shape (n_samples, n_features)
            Simulation input.
        self.y : array-like, shape (n_samples, n_outputs)
            Simulation output.
        self.cv : scikit-learn cross-validation object
            Cross-validation strategy.
        self.metrics : list of str
            List of metrics to use for cross-validation.
        self.n_jobs : int
            Number of jobs to run in parallel. `None` means 1, `-1` means using all processors.
        Returns
        -------
        scores_df : pandas.DataFrame
            Updates dataframe containing the cv scores the model.
        """

        # Get model name
        model_name = get_model_name(model)

        # The metrics we want to use for cross-validation
        scorers = {metric.__name__: make_scorer(metric) for metric in self.metrics}

        self.logger.info(f"Cross-validating {model_name}...")
        self.logger.info(f"Parameters: {model.named_steps['model'].get_params()}")

        try:
            # Cross-validate
            cv_results = cross_validate(
                model,
                self.X,
                self.y,
                cv=self.cv,
                scoring=scorers,
                n_jobs=self.n_jobs,
                return_estimator=True,
                return_indices=True,
            )
            # updates pandas dataframe with model cv scores
            self._update_scores_df(model_name, cv_results)
            # save results for plotting etc.
            self.cv_results[model_name] = cv_results

        except Exception as e:
            self.logger.error(f"Failed to cross-validate {model_name}")
            self.logger.error(e)

    def _update_scores_df(self, model_name, cv_results):
        """Updates the scores dataframe with the results of the cross-validation.
        Parameters
        ----------
        model_name : str
            Name of the model.
        cv_results : dict
            Results of the cross-validation.
        Returns
        -------
        None
            Modifies the self.scores_df DataFrame in-place.

        """
        # Gather scores from each metric
        # Initialise scores dataframe
        for key in cv_results.keys():
            if key.startswith("test_"):
                for fold, score in enumerate(cv_results[key]):
                    self.scores_df.loc[len(self.scores_df.index)] = {
                        "model": model_name,
                        "metric": key.split("test_", 1)[1],
                        "fold": fold,
                        "score": score,
                    }

Added (compare() now calls the new cross_validate helpers):

            # run cross validation and store results
            self.cv_results[get_model_name(self.models[i])] = run_cv(
                X=self.X,
                y=self.y,
                cv=self.cv,
                model=self.models[i],
                metrics=self.metrics,
                n_jobs=self.n_jobs,
                logger=self.logger,
            )
            # update scores dataframe
            self.scores_df = update_scores_df(
                self.scores_df,
                self.models[i],
                self.cv_results[get_model_name(self.models[i])],
            )

        # returns best model fitted on full data
        self.best_model = self._get_best_model(metric="r2")
        return self.best_model

    def _get_best_model(self, metric="r2"):
        """Determine the best model using average cv score
67 changes: 67 additions & 0 deletions autoemulate/cross_validate.py
@@ -0,0 +1,67 @@
import pandas as pd
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_validate

from autoemulate.utils import get_model_name


def run_cv(X, y, cv, model, metrics, n_jobs, logger):
    # Get model name
    model_name = get_model_name(model)

    # The metrics we want to use for cross-validation
    scorers = {metric.__name__: make_scorer(metric) for metric in metrics}

    logger.info(f"Cross-validating {model_name}...")
    logger.info(f"Parameters: {model.named_steps['model'].get_params()}")

    try:
        # Cross-validate
        cv_results = cross_validate(
            model,
            X,
            y,
            cv=cv,
            scoring=scorers,
            n_jobs=n_jobs,
            return_estimator=True,
            return_indices=True,
        )

    except Exception as e:
        logger.error(f"Failed to cross-validate {model_name}")
        logger.error(e)

    return cv_results


def update_scores_df(scores_df, model, cv_results):
    """Updates the scores dataframe with the results of the cross-validation.
    Parameters
    ----------
    scores_df : pandas.DataFrame
        DataFrame with columns "model", "metric", "fold", "score".
    model_name : str
        Name of the model.
    cv_results : dict
        Results of the cross-validation.
    Returns
    -------
    None
        Modifies the self.scores_df DataFrame in-place.
    """
    # Gather scores from each metric
    # Initialise scores dataframe
    for key in cv_results.keys():
        if key.startswith("test_"):
            for fold, score in enumerate(cv_results[key]):
                scores_df.loc[len(scores_df.index)] = {
                    "model": get_model_name(model),
                    "metric": key.split("test_", 1)[1],
                    "fold": fold,
                    "score": score,
                }
    return scores_df
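
The new helpers can also be used on their own. Below is a minimal usage sketch mirroring the setup in the new test file further down; the RandomForest emulator, METRIC_REGISTRY and the scores_df column layout are taken from there, the rest is illustrative, and run_cv passes return_indices=True, which needs scikit-learn >= 1.3.

import logging

import pandas as pd
from sklearn.datasets import make_regression
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from autoemulate.cross_validate import run_cv, update_scores_df
from autoemulate.emulators import RandomForest
from autoemulate.metrics import METRIC_REGISTRY

X, y = make_regression(n_samples=20, n_features=2, random_state=0)
model = Pipeline([("scaler", StandardScaler()), ("model", RandomForest())])

# cross-validate the pipeline with every registered metric
cv_results = run_cv(
    X=X,
    y=y,
    cv=KFold(n_splits=5, shuffle=True),
    model=model,
    metrics=list(METRIC_REGISTRY.values()),
    n_jobs=1,
    logger=logging.getLogger(__name__),
)

# append one row per metric and fold, then summarise
scores_df = pd.DataFrame(columns=["model", "metric", "fold", "score"]).astype(
    {"model": "object", "metric": "object", "fold": "int64", "score": "float64"}
)
scores_df = update_scores_df(scores_df, model, cv_results)
print(scores_df.groupby("metric")["score"].mean())

The resulting dataframe is in long format, one row per model, metric and fold, which is the layout AutoEmulate's scores_df uses.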
3 changes: 1 addition & 2 deletions autoemulate/hyperparam_searching.py
@@ -49,8 +49,7 @@ def optimize_params(
    Returns
    -------
    searcher : sklearn.model_selection._search.BaseSearchCV or skopt.searchcv.BayesSearchCV
        Searcher instance.
        Refitted estimator on the whole dataset with best parameters.
    """
    model_name = get_model_name(model)
    logger.info(f"Performing grid search for {model_name}...")
55 changes: 55 additions & 0 deletions tests/test_cross_validate.py
@@ -0,0 +1,55 @@
import logging
from typing import List

import numpy as np
import pandas as pd
import pytest
from sklearn.datasets import make_regression
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from autoemulate.compare import AutoEmulate
from autoemulate.cross_validate import run_cv
from autoemulate.cross_validate import update_scores_df
from autoemulate.emulators import RandomForest
from autoemulate.metrics import METRIC_REGISTRY

# import make_regression_data

X, y = make_regression(n_samples=20, n_features=2, random_state=0)
cv = KFold(n_splits=5, shuffle=True)
model = Pipeline([("scaler", StandardScaler()), ("model", RandomForest())])
metrics = [metric for metric in METRIC_REGISTRY.values()]
n_jobs = 1
logger = logging.getLogger(__name__)
scores_df = pd.DataFrame(columns=["model", "metric", "fold", "score"]).astype(
    {"model": "object", "metric": "object", "fold": "int64", "score": "float64"}
)


@pytest.fixture()
def cv_results():
    return run_cv(X, y, cv, model, metrics, n_jobs, logger)


def test_cv(cv_results):
    assert isinstance(cv_results, dict)
    # check that it contains scores
    assert "test_r2" in cv_results.keys()
    assert "test_rsme" in cv_results.keys()

    assert isinstance(cv_results["test_r2"], np.ndarray)
    assert isinstance(cv_results["test_rsme"], np.ndarray)

    assert len(cv_results["test_r2"]) == 5
    assert len(cv_results["test_rsme"]) == 5


def test_update_scores_df(cv_results):
    scores_df_new = update_scores_df(scores_df, model, cv_results)
    assert isinstance(scores_df_new, pd.DataFrame)

    assert scores_df_new.shape[0] == 10
    assert scores_df_new.shape[1] == 4
    assert scores_df_new["model"][0] == "RandomForest"
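
Assuming a standard pytest setup (nothing beyond what the test module itself imports), the new tests can be run in isolation, for example from Python:

# Equivalent to running `python -m pytest tests/test_cross_validate.py -q` from the shell.
import pytest

pytest.main(["tests/test_cross_validate.py", "-q"])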
