
Commit

fix merge conflicts
mastoffel committed Feb 27, 2024
2 parents 27e51c1 + 6416e51 commit 80c746a
Showing 34 changed files with 1,065 additions and 332 deletions.
17 changes: 12 additions & 5 deletions README.md
@@ -7,7 +7,7 @@

<!-- SPHINX-START -->

Simulations of physical systems are often slow and need lots of compute, which makes them unpractical for applications like digital twins, or in situations where they have to run thousands of times, like sensitivity analyses. The goal of `AutoEmulate` is to make it easy to replace simulations with fast, accurate emulators. To do this, `AutoEmulate` automatically fits and compares lots of models, like *Radial Basis Functions*, *Gaussian Processes* or *Neural Networks* to find the best emulator for a simulation.
Simulations of physical systems are often slow and need lots of compute, which makes them impractical for applications like digital twins, or when they have to run thousands of times for uncertainty quantification or sensitivity analyses. The goal of `AutoEmulate` is to make it easy to replace simulations with fast, accurate emulators. To do this, `AutoEmulate` automatically fits and compares lots of models, like *Radial Basis Functions*, *Gaussian Processes* or *Neural Networks*, to find the best emulator for a simulation.

The project is in very early development.

@@ -46,14 +46,21 @@ y = np.array([simulator(x) for x in X])
# compare emulator models
ae = AutoEmulate()
ae.setup(X, y)
ae.compare()
best_model = ae.compare()

# evaluate
# training set cv results
ae.print_results()
ae.plot_results()

# predict on test set
ae.evaluate_model(best_model)

# refit on full data
best_emulator = ae.refit_model(best_model)

# save & load best model
ae.save_model("best_model")
best_emulator = ae.load_model("best_model")
ae.save_model("best_emulator")
best_emulator = ae.load_model("best_emulator")

# emulate
best_emulator.predict(X)
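
The README hunk above starts mid-example, so for orientation here is a self-contained sketch of the intended workflow. The toy `simulator`, the random design `X`, and the `from autoemulate.compare import AutoEmulate` import path are illustrative assumptions, not part of this diff.

```python
# Self-contained sketch of the workflow shown in the README (illustrative only).
import numpy as np
from autoemulate.compare import AutoEmulate

def simulator(x):
    # hypothetical cheap stand-in for an expensive physical simulation
    return np.array([np.sin(x[0]) + x[1], np.cos(x[1]) * x[0]])

rng = np.random.default_rng(42)
X = rng.uniform(-3, 3, size=(100, 2))      # design points
y = np.array([simulator(x) for x in X])    # simulator outputs

ae = AutoEmulate()
ae.setup(X, y)                     # split data, build model pipelines
best_model = ae.compare()          # cross-validate all emulators, keep the best

ae.print_results()                 # training-set CV results
ae.evaluate_model(best_model)      # score on the held-out test set

best_emulator = ae.refit_model(best_model)  # refit on the full data
best_emulator.predict(X)           # fast predictions in place of the simulator
```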
117 changes: 80 additions & 37 deletions autoemulate/compare.py
@@ -1,24 +1,26 @@
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.decomposition import PCA
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_validate
from sklearn.model_selection import PredefinedSplit
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.utils.validation import check_is_fitted
from sklearn.utils.validation import check_X_y

from autoemulate.cross_validate import run_cv
from autoemulate.cross_validate import update_scores_df
from autoemulate.cross_validate import _run_cv
from autoemulate.cross_validate import _update_scores_df
from autoemulate.cv import CV_REGISTRY
from autoemulate.data_splitting import split_data
from autoemulate.data_splitting import _split_data
from autoemulate.emulators import MODEL_REGISTRY
from autoemulate.hyperparam_searching import optimize_params
from autoemulate.logging_config import configure_logging
from autoemulate.hyperparam_searching import _optimize_params
from autoemulate.logging_config import _configure_logging
from autoemulate.metrics import METRIC_REGISTRY
from autoemulate.model_processing import get_and_process_models
from autoemulate.model_processing import _get_and_process_models
from autoemulate.plotting import _plot_model
from autoemulate.plotting import _plot_results
from autoemulate.printing import _print_cv_results
@@ -101,10 +103,10 @@ def setup(
Whether to log to file.
"""
self.X, self.y = self._check_input(X, y)
self.train_idxs, self.test_idxs = split_data(
self.train_idxs, self.test_idxs = _split_data(
self.X, test_size=test_set_size, random_state=42
)
self.models = get_and_process_models(
self.models = _get_and_process_models(
MODEL_REGISTRY,
model_subset,
self.y,
@@ -121,7 +123,7 @@
self.scale = scale
self.scaler = scaler
self.n_jobs = n_jobs
self.logger = configure_logging(log_to_file=log_to_file)
self.logger = _configure_logging(log_to_file=log_to_file)
self.is_set_up = True
self.cv_results = {}

@@ -200,36 +202,41 @@ def compare(self):
)

for i in range(len(self.models)):
# hyperparameter search
if self.param_search:
self.models[i] = optimize_params(
try:
# hyperparameter search
if self.param_search:
self.models[i] = _optimize_params(
X=self.X[self.train_idxs],
y=self.y[self.train_idxs],
cv=self.cv,
model=self.models[i],
search_type=self.search_type,
niter=self.param_search_iters,
param_space=None,
n_jobs=self.n_jobs,
logger=self.logger,
)

# run cross validation
fitted_model, cv_results = _run_cv(
X=self.X[self.train_idxs],
y=self.y[self.train_idxs],
cv=self.cv,
model=self.models[i],
search_type=self.search_type,
niter=self.param_search_iters,
param_space=None,
metrics=self.metrics,
n_jobs=self.n_jobs,
logger=self.logger,
)

# run cross validation
fitted_model, cv_results = run_cv(
X=self.X[self.train_idxs],
y=self.y[self.train_idxs],
cv=self.cv,
model=self.models[i],
metrics=self.metrics,
n_jobs=self.n_jobs,
logger=self.logger,
)
except Exception as e:
print(f"Error fitting model {get_model_name(self.models[i])}")
print(e) # should be replaced with logging
continue

self.models[i] = fitted_model
self.cv_results[get_model_name(self.models[i])] = cv_results

# update scores dataframe
self.scores_df = update_scores_df(
self.scores_df = _update_scores_df(
self.scores_df,
self.models[i],
self.cv_results[get_model_name(self.models[i])],
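
The hunk above wraps each model's hyperparameter search and cross-validation in a `try`/`except`, so one failing emulator is reported and skipped instead of aborting the whole comparison. A stripped-down sketch of that pattern, with stand-in `DummyModel`/`BrokenModel` classes and a hypothetical `fit_and_score` helper in place of `_optimize_params` and `_run_cv`:

```python
# Sketch of the per-model error handling introduced in compare().
# DummyModel, BrokenModel and fit_and_score are illustrative stand-ins.

class DummyModel:
    def fit(self):
        return "fitted", {"r2": 0.9}

class BrokenModel:
    def fit(self):
        raise ValueError("hyperparameters out of range")

def fit_and_score(model):
    # stands in for _optimize_params(...) followed by _run_cv(...)
    return model.fit()

models = [DummyModel(), BrokenModel(), DummyModel()]
cv_results = {}

for i in range(len(models)):
    model_name = type(models[i]).__name__
    try:
        fitted_model, scores = fit_and_score(models[i])
    except Exception as e:
        print(f"Error fitting model {model_name}")
        print(e)  # the diff notes this should eventually go through the logger
        continue
    models[i] = fitted_model
    cv_results[model_name] = scores

print(cv_results)  # {'DummyModel': {'r2': 0.9}}; BrokenModel was skipped
```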
@@ -304,26 +311,62 @@ def refit_model(self, model):
model.fit(self.X, self.y)
return model

def save_model(self, model=None, filepath=None):
"""Saves the best model to disk."""
def refit_models(self):
"""(Re-) fits all models on the full data.
Returns
-------
models : list
List of refitted models.
"""
if not hasattr(self, "X"):
raise RuntimeError("Must run setup() before refit_models()")
for i in range(len(self.models)):
self.models[i] = self.refit_model(self.models[i])
return self.models

def save_model(self, model=None, path=None):
"""Saves model to disk.
Parameters
----------
model : object, optional
Model to save. If None, saves the best model.
If "all", saves all models.
path : str
Path to save the model.
"""
if not hasattr(self, "best_model"):
raise RuntimeError("Must run compare() before save_model()")
serialiser = ModelSerialiser()

if model is None:
model = self.best_model
if filepath is None:
raise ValueError("Filepath must be provided")
if model is None or not isinstance(model, (Pipeline, BaseEstimator)):
raise ValueError(
"Model must be provided and should be a scikit-learn pipeline or model"
)
serialiser._save_model(model, path)

def save_models(self, path=None):
"""Saves all models to disk.
serialiser.save_model(model, filepath)
Parameters
----------
path : str
Directory to save the models.
If None, saves to the current working directory.
"""
if not hasattr(self, "best_model"):
raise RuntimeError("Must run compare() before save_models()")
serialiser = ModelSerialiser()
serialiser._save_models(self.models, path)

def load_model(self, filepath=None):
def load_model(self, path=None):
"""Loads a model from disk."""
serialiser = ModelSerialiser()
if filepath is None:
if path is None:
raise ValueError("Filepath must be provided")

return serialiser.load_model(filepath)
return serialiser._load_model(path)

def print_results(self, model=None, sort_by="r2"):
"""Print cv results.
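`ModelSerialiser` itself is not shown in this diff; the methods called above (`_save_model`, `_save_models`, `_load_model`) suggest a small persistence helper. A plausible minimal version, assuming joblib-based serialisation and illustrative file naming:

```python
# Hypothetical minimal ModelSerialiser matching the calls in compare.py.
# The real class is not in this diff; joblib persistence is an assumption.
import os
import joblib

class ModelSerialiser:
    def _save_model(self, model, path):
        """Save a single fitted pipeline/estimator to disk."""
        joblib.dump(model, path)

    def _save_models(self, models, path=None):
        """Save a list of models, one file per model, into `path` (or the cwd)."""
        directory = path or os.getcwd()
        os.makedirs(directory, exist_ok=True)
        for i, model in enumerate(models):
            joblib.dump(model, os.path.join(directory, f"model_{i}.joblib"))

    def _load_model(self, path):
        """Load a previously saved model."""
        return joblib.load(path)
```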
6 changes: 3 additions & 3 deletions autoemulate/cross_validate.py
@@ -8,7 +8,7 @@
from autoemulate.utils import get_model_name


def run_cv(X, y, cv, model, metrics, n_jobs, logger):
def _run_cv(X, y, cv, model, metrics, n_jobs, logger):
model_name = get_model_name(model)

# The metrics we want to use for cross-validation
@@ -17,6 +17,7 @@ def run_cv(X, y, cv, model, metrics, n_jobs, logger):
logger.info(f"Cross-validating {model_name}...")
logger.info(f"Parameters: {model.named_steps['model'].get_params()}")

cv_results = None
try:
cv_results = cross_validate(
model,
@@ -28,7 +29,6 @@
return_estimator=True,
return_indices=True,
)

except Exception as e:
logger.error(f"Failed to cross-validate {model_name}")
logger.error(e)
@@ -39,7 +39,7 @@ def run_cv(X, y, cv, model, metrics, n_jobs, logger):
return fitted_model, cv_results


def update_scores_df(scores_df, model, cv_results):
def _update_scores_df(scores_df, model, cv_results):
"""Updates the scores dataframe with the results of the cross-validation.
Parameters
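`_run_cv` is essentially a thin wrapper around scikit-learn's `cross_validate` with a dictionary of scorers and `return_estimator=True`. A self-contained sketch of the same pattern, with `METRIC_REGISTRY` replaced by a hard-coded scorer dict and a Ridge pipeline standing in for an emulator:

```python
# Sketch of the _run_cv pattern: cross-validate a pipeline with several scorers
# and keep the fitted estimators. The pipeline and metrics here are stand-ins.
import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge
from sklearn.metrics import make_scorer, mean_squared_error, r2_score
from sklearn.model_selection import KFold, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

X, y = make_regression(n_samples=100, n_features=5, random_state=0)
model = Pipeline([("scaler", StandardScaler()), ("model", Ridge())])

scorers = {
    "r2": make_scorer(r2_score),
    "mse": make_scorer(mean_squared_error, greater_is_better=False),  # negated
}

cv_results = cross_validate(
    model,
    X,
    y,
    cv=KFold(n_splits=5, shuffle=True, random_state=0),
    scoring=scorers,
    return_estimator=True,  # keep the fitted pipeline from each fold
)
fitted_model = cv_results["estimator"][0]  # e.g. keep the first fold's fit
print({k: np.mean(v) for k, v in cv_results.items() if k.startswith("test_")})
```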
2 changes: 1 addition & 1 deletion autoemulate/data_splitting.py
@@ -2,7 +2,7 @@
from sklearn.model_selection import train_test_split


def split_data(X, test_size=0.2, random_state=None):
def _split_data(X, test_size=0.2, random_state=None):
"""Splits the data into training and testing sets.
Parameters
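The body of `_split_data` is outside this hunk; given how `compare.py` uses the returned `train_idxs`/`test_idxs`, a plausible minimal implementation splits row indices rather than the arrays themselves:

```python
# Plausible minimal _split_data: split row indices so the same X and y arrays
# can later be indexed with train_idxs / test_idxs, as setup() does.
import numpy as np
from sklearn.model_selection import train_test_split

def _split_data(X, test_size=0.2, random_state=None):
    idxs = np.arange(X.shape[0])
    train_idxs, test_idxs = train_test_split(
        idxs, test_size=test_size, random_state=random_state
    )
    return train_idxs, test_idxs

# usage mirroring setup() in compare.py
X = np.random.rand(50, 3)
train_idxs, test_idxs = _split_data(X, test_size=0.2, random_state=42)
```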
18 changes: 9 additions & 9 deletions autoemulate/emulators/__init__.py
@@ -11,14 +11,14 @@

# REGISTRY keys are the class names (i.e. type(model).__name__)
MODEL_REGISTRY = {
"SecondOrderPolynomial": SecondOrderPolynomial,
"RBF": RBF,
"RandomForest": RandomForest,
"GradientBoosting": GradientBoosting,
"GaussianProcessSk": GaussianProcessSk,
"SupportVectorMachines": SupportVectorMachines,
"XGBoost": XGBoost,
"NeuralNetSk": NeuralNetSk,
"NeuralNetTorch": NeuralNetTorch,
"SecondOrderPolynomial": SecondOrderPolynomial(),
"RBF": RBF(),
"RandomForest": RandomForest(),
"GradientBoosting": GradientBoosting(),
"GaussianProcessSk": GaussianProcessSk(),
"SupportVectorMachines": SupportVectorMachines(),
"XGBoost": XGBoost(),
"NeuralNetSk": NeuralNetSk(),
"NeuralNetTorch": NeuralNetTorch(module="mlp"),
# "GaussianProcess": GaussianProcess,
}
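
`MODEL_REGISTRY` now maps names to configured model *instances* (note the trailing parentheses and the `module="mlp"` argument for `NeuralNetTorch`) rather than to classes. A small sketch of how such a registry can be consumed, using scikit-learn stand-ins and `clone` to obtain fresh, unfitted copies:

```python
# Sketch: consuming a registry of configured instances with clone().
# Ridge and RandomForestRegressor stand in for the AutoEmulate emulators.
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge

MODEL_REGISTRY = {
    "Ridge": Ridge(alpha=1.0),
    "RandomForest": RandomForestRegressor(n_estimators=50),
}

def get_models(registry, subset=None):
    names = subset if subset is not None else list(registry)
    return [clone(registry[name]) for name in names]  # fresh, unfitted copies

models = get_models(MODEL_REGISTRY, subset=["Ridge"])
```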
4 changes: 2 additions & 2 deletions autoemulate/emulators/gaussian_process_sk.py
@@ -13,7 +13,7 @@
from skopt.space import Integer
from skopt.space import Real

from autoemulate.utils import suppress_convergence_warnings
from autoemulate.utils import _suppress_convergence_warnings


class GaussianProcessSk(BaseEstimator, RegressorMixin):
@@ -68,7 +68,7 @@ def fit(self, X, y):
copy_X_train=self.copy_X_train,
random_state=self.random_state,
)
with suppress_convergence_warnings():
with _suppress_convergence_warnings():
self.model_.fit(X, y)
self.is_fitted_ = True
return self
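`_suppress_convergence_warnings` lives in `autoemulate/utils.py` and is not part of this diff; a typical shape for such a context manager, given here purely as an assumption, filters scikit-learn's `ConvergenceWarning` while the wrapped block runs:

```python
# Assumed shape of the convergence-warning suppressor (the real helper is in
# autoemulate/utils.py and is not shown in this diff).
import warnings
from contextlib import contextmanager

from sklearn.exceptions import ConvergenceWarning

@contextmanager
def _suppress_convergence_warnings():
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=ConvergenceWarning)
        yield

# usage, as in GaussianProcessSk.fit and NeuralNetSk.fit
with _suppress_convergence_warnings():
    pass  # self.model_.fit(X, y) would run here
```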
4 changes: 2 additions & 2 deletions autoemulate/emulators/neural_net_sk.py
@@ -9,7 +9,7 @@
from skopt.space import Categorical
from skopt.space import Real

from autoemulate.utils import suppress_convergence_warnings
from autoemulate.utils import _suppress_convergence_warnings


class NeuralNetSk(BaseEstimator, RegressorMixin):
@@ -72,7 +72,7 @@ def fit(self, X, y):
max_iter=self.max_iter,
random_state=self.random_state,
)
with suppress_convergence_warnings():
with _suppress_convergence_warnings():
self.model_.fit(X, y)
# expose n_iter_ attribute to be consistent with sklearn estimators
self.n_iter_ = self.model_.n_iter_