
Commit

fix merge conflicts
mastoffel committed Feb 27, 2024
2 parents 27e51c1 + 6416e51 commit 80c746a
Showing 34 changed files with 1,065 additions and 332 deletions.
17 changes: 12 additions & 5 deletions README.md
@@ -7,7 +7,7 @@

<!-- SPHINX-START -->

Simulations of physical systems are often slow and need lots of compute, which makes them unpractical for applications like digital twins, or in situations where they have to run thousands of times, like sensitivity analyses. The goal of `AutoEmulate` is to make it easy to replace simulations with fast, accurate emulators. To do this, `AutoEmulate` automatically fits and compares lots of models, like *Radial Basis Functions*, *Gaussian Processes* or *Neural Networks* to find the best emulator for a simulation.
Simulations of physical systems are often slow and need lots of compute, which makes them impractical for applications like digital twins, or when they have to run thousands of times for uncertainty quantification or sensitivity analyses. The goal of `AutoEmulate` is to make it easy to replace simulations with fast, accurate emulators. To do this, `AutoEmulate` automatically fits and compares lots of models, like *Radial Basis Functions*, *Gaussian Processes* or *Neural Networks*, to find the best emulator for a simulation.

The project is in very early development.

@@ -46,14 +46,21 @@ y = np.array([simulator(x) for x in X])
# compare emulator models
ae = AutoEmulate()
ae.setup(X, y)
ae.compare()
best_model = ae.compare()

# evaluate
# training set cv results
ae.print_results()
ae.plot_results()

# predict on test set
ae.evaluate_model(best_model)

# refit on full data
best_emulator = ae.refit_model(best_model)

# save & load best model
ae.save_model("best_model")
best_emulator = ae.load_model("best_model")
ae.save_model("best_emulator")
best_emulator = ae.load_model("best_emulator")

# emulate
best_emulator.predict(X)
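
The README hunk above starts mid-example, so for orientation here is a self-contained sketch of the intended workflow. The toy `simulator`, the random design `X`, and the `from autoemulate.compare import AutoEmulate` import path are illustrative assumptions, not part of this diff.

```python
# Self-contained sketch of the workflow shown in the README (illustrative only).
import numpy as np
from autoemulate.compare import AutoEmulate

def simulator(x):
    # hypothetical cheap stand-in for an expensive physical simulation
    return np.array([np.sin(x[0]) + x[1], np.cos(x[1]) * x[0]])

rng = np.random.default_rng(42)
X = rng.uniform(-3, 3, size=(100, 2))      # design points
y = np.array([simulator(x) for x in X])    # simulator outputs

ae = AutoEmulate()
ae.setup(X, y)                     # split data, build model pipelines
best_model = ae.compare()          # cross-validate all emulators, keep the best

ae.print_results()                 # training-set CV results
ae.evaluate_model(best_model)      # score on the held-out test set

best_emulator = ae.refit_model(best_model)  # refit on the full data
best_emulator.predict(X)           # fast predictions in place of the simulator
```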
117 changes: 80 additions & 37 deletions autoemulate/compare.py
@@ -1,24 +1,26 @@
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.decomposition import PCA
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_validate
from sklearn.model_selection import PredefinedSplit
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.utils.validation import check_is_fitted
from sklearn.utils.validation import check_X_y

from autoemulate.cross_validate import run_cv
from autoemulate.cross_validate import update_scores_df
from autoemulate.cross_validate import _run_cv
from autoemulate.cross_validate import _update_scores_df
from autoemulate.cv import CV_REGISTRY
from autoemulate.data_splitting import split_data
from autoemulate.data_splitting import _split_data
from autoemulate.emulators import MODEL_REGISTRY
from autoemulate.hyperparam_searching import optimize_params
from autoemulate.logging_config import configure_logging
from autoemulate.hyperparam_searching import _optimize_params
from autoemulate.logging_config import _configure_logging
from autoemulate.metrics import METRIC_REGISTRY
from autoemulate.model_processing import get_and_process_models
from autoemulate.model_processing import _get_and_process_models
from autoemulate.plotting import _plot_model
from autoemulate.plotting import _plot_results
from autoemulate.printing import _print_cv_results
@@ -101,10 +103,10 @@ def setup(
Whether to log to file.
"""
self.X, self.y = self._check_input(X, y)
self.train_idxs, self.test_idxs = split_data(
self.train_idxs, self.test_idxs = _split_data(
self.X, test_size=test_set_size, random_state=42
)
self.models = get_and_process_models(
self.models = _get_and_process_models(
MODEL_REGISTRY,
model_subset,
self.y,
@@ -121,7 +123,7 @@
self.scale = scale
self.scaler = scaler
self.n_jobs = n_jobs
self.logger = configure_logging(log_to_file=log_to_file)
self.logger = _configure_logging(log_to_file=log_to_file)
self.is_set_up = True
self.cv_results = {}

@@ -200,36 +202,41 @@ def compare(self):
)

for i in range(len(self.models)):
# hyperparameter search
if self.param_search:
self.models[i] = optimize_params(
try:
# hyperparameter search
if self.param_search:
self.models[i] = _optimize_params(
X=self.X[self.train_idxs],
y=self.y[self.train_idxs],
cv=self.cv,
model=self.models[i],
search_type=self.search_type,
niter=self.param_search_iters,
param_space=None,
n_jobs=self.n_jobs,
logger=self.logger,
)

# run cross validation
fitted_model, cv_results = _run_cv(
X=self.X[self.train_idxs],
y=self.y[self.train_idxs],
cv=self.cv,
model=self.models[i],
search_type=self.search_type,
niter=self.param_search_iters,
param_space=None,
metrics=self.metrics,
n_jobs=self.n_jobs,
logger=self.logger,
)

# run cross validation
fitted_model, cv_results = run_cv(
X=self.X[self.train_idxs],
y=self.y[self.train_idxs],
cv=self.cv,
model=self.models[i],
metrics=self.metrics,
n_jobs=self.n_jobs,
logger=self.logger,
)
except Exception as e:
print(f"Error fitting model {get_model_name(self.models[i])}")
print(e) # should be replaced with logging
continue

self.models[i] = fitted_model
self.cv_results[get_model_name(self.models[i])] = cv_results

# update scores dataframe
self.scores_df = update_scores_df(
self.scores_df = _update_scores_df(
self.scores_df,
self.models[i],
self.cv_results[get_model_name(self.models[i])],
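
The hunk above wraps each model's hyperparameter search and cross-validation in a `try`/`except`, so one failing emulator is reported and skipped instead of aborting the whole comparison. A stripped-down sketch of that pattern, with stand-in `DummyModel`/`BrokenModel` classes and a hypothetical `fit_and_score` helper in place of `_optimize_params` and `_run_cv`:

```python
# Sketch of the per-model error handling introduced in compare().
# DummyModel, BrokenModel and fit_and_score are illustrative stand-ins.

class DummyModel:
    def fit(self):
        return "fitted", {"r2": 0.9}

class BrokenModel:
    def fit(self):
        raise ValueError("hyperparameters out of range")

def fit_and_score(model):
    # stands in for _optimize_params(...) followed by _run_cv(...)
    return model.fit()

models = [DummyModel(), BrokenModel(), DummyModel()]
cv_results = {}

for i in range(len(models)):
    model_name = type(models[i]).__name__
    try:
        fitted_model, scores = fit_and_score(models[i])
    except Exception as e:
        print(f"Error fitting model {model_name}")
        print(e)  # the diff notes this should eventually go through the logger
        continue
    models[i] = fitted_model
    cv_results[model_name] = scores

print(cv_results)  # {'DummyModel': {'r2': 0.9}}; BrokenModel was skipped
```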
@@ -304,26 +311,62 @@ def refit_model(self, model):
model.fit(self.X, self.y)
return model

def save_model(self, model=None, filepath=None):
"""Saves the best model to disk."""
def refit_models(self):
"""(Re-) fits all models on the full data.
Returns
-------
models : list
List of refitted models.
"""
if not hasattr(self, "X"):
raise RuntimeError("Must run setup() before refit_models()")
for i in range(len(self.models)):
self.models[i] = self.refit_model(self.models[i])
return self.models

def save_model(self, model=None, path=None):
"""Saves model to disk.
Parameters
----------
model : object, optional
Model to save. If None, saves the best model.
If "all", saves all models.
path : str
Path to save the model.
"""
if not hasattr(self, "best_model"):
raise RuntimeError("Must run compare() before save_model()")
serialiser = ModelSerialiser()

if model is None:
model = self.best_model
if filepath is None:
raise ValueError("Filepath must be provided")
if model is None or not isinstance(model, (Pipeline, BaseEstimator)):
raise ValueError(
"Model must be provided and should be a scikit-learn pipeline or model"
)
serialiser._save_model(model, path)

def save_models(self, path=None):
"""Saves all models to disk.
serialiser.save_model(model, filepath)
Parameters
----------
path : str
Directory to save the models.
If None, saves to the current working directory.
"""
if not hasattr(self, "best_model"):
raise RuntimeError("Must run compare() before save_models()")
serialiser = ModelSerialiser()
serialiser._save_models(self.models, path)

def load_model(self, filepath=None):
def load_model(self, path=None):
"""Loads a model from disk."""
serialiser = ModelSerialiser()
if filepath is None:
if path is None:
raise ValueError("Filepath must be provided")

return serialiser.load_model(filepath)
return serialiser._load_model(path)

def print_results(self, model=None, sort_by="r2"):
"""Print cv results.
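`ModelSerialiser` itself is not shown in this diff; the methods called above (`_save_model`, `_save_models`, `_load_model`) suggest a small persistence helper. A plausible minimal version, assuming joblib-based serialisation and illustrative file naming:

```python
# Hypothetical minimal ModelSerialiser matching the calls in compare.py.
# The real class is not in this diff; joblib persistence is an assumption.
import os
import joblib

class ModelSerialiser:
    def _save_model(self, model, path):
        """Save a single fitted pipeline/estimator to disk."""
        joblib.dump(model, path)

    def _save_models(self, models, path=None):
        """Save a list of models, one file per model, into `path` (or the cwd)."""
        directory = path or os.getcwd()
        os.makedirs(directory, exist_ok=True)
        for i, model in enumerate(models):
            joblib.dump(model, os.path.join(directory, f"model_{i}.joblib"))

    def _load_model(self, path):
        """Load a previously saved model."""
        return joblib.load(path)
```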
6 changes: 3 additions & 3 deletions autoemulate/cross_validate.py
@@ -8,7 +8,7 @@
from autoemulate.utils import get_model_name


def run_cv(X, y, cv, model, metrics, n_jobs, logger):
def _run_cv(X, y, cv, model, metrics, n_jobs, logger):
model_name = get_model_name(model)

# The metrics we want to use for cross-validation
@@ -17,6 +17,7 @@ def run_cv(X, y, cv, model, metrics, n_jobs, logger):
logger.info(f"Cross-validating {model_name}...")
logger.info(f"Parameters: {model.named_steps['model'].get_params()}")

cv_results = None
try:
cv_results = cross_validate(
model,
@@ -28,7 +29,6 @@
return_estimator=True,
return_indices=True,
)

except Exception as e:
logger.error(f"Failed to cross-validate {model_name}")
logger.error(e)
@@ -39,7 +39,7 @@ def run_cv(X, y, cv, model, metrics, n_jobs, logger):
return fitted_model, cv_results


def update_scores_df(scores_df, model, cv_results):
def _update_scores_df(scores_df, model, cv_results):
"""Updates the scores dataframe with the results of the cross-validation.
Parameters
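`_run_cv` is essentially a thin wrapper around scikit-learn's `cross_validate` with a dictionary of scorers and `return_estimator=True`. A self-contained sketch of the same pattern, with `METRIC_REGISTRY` replaced by a hard-coded scorer dict and a Ridge pipeline standing in for an emulator:

```python
# Sketch of the _run_cv pattern: cross-validate a pipeline with several scorers
# and keep the fitted estimators. The pipeline and metrics here are stand-ins.
import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge
from sklearn.metrics import make_scorer, mean_squared_error, r2_score
from sklearn.model_selection import KFold, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

X, y = make_regression(n_samples=100, n_features=5, random_state=0)
model = Pipeline([("scaler", StandardScaler()), ("model", Ridge())])

scorers = {
    "r2": make_scorer(r2_score),
    "mse": make_scorer(mean_squared_error, greater_is_better=False),  # negated
}

cv_results = cross_validate(
    model,
    X,
    y,
    cv=KFold(n_splits=5, shuffle=True, random_state=0),
    scoring=scorers,
    return_estimator=True,  # keep the fitted pipeline from each fold
)
fitted_model = cv_results["estimator"][0]  # e.g. keep the first fold's fit
print({k: np.mean(v) for k, v in cv_results.items() if k.startswith("test_")})
```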
2 changes: 1 addition & 1 deletion autoemulate/data_splitting.py
@@ -2,7 +2,7 @@
from sklearn.model_selection import train_test_split


def split_data(X, test_size=0.2, random_state=None):
def _split_data(X, test_size=0.2, random_state=None):
"""Splits the data into training and testing sets.
Parameters
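The body of `_split_data` is outside this hunk; given how `compare.py` uses the returned `train_idxs`/`test_idxs`, a plausible minimal implementation splits row indices rather than the arrays themselves:

```python
# Plausible minimal _split_data: split row indices so the same X and y arrays
# can later be indexed with train_idxs / test_idxs, as setup() does.
import numpy as np
from sklearn.model_selection import train_test_split

def _split_data(X, test_size=0.2, random_state=None):
    idxs = np.arange(X.shape[0])
    train_idxs, test_idxs = train_test_split(
        idxs, test_size=test_size, random_state=random_state
    )
    return train_idxs, test_idxs

# usage mirroring setup() in compare.py
X = np.random.rand(50, 3)
train_idxs, test_idxs = _split_data(X, test_size=0.2, random_state=42)
```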
18 changes: 9 additions & 9 deletions autoemulate/emulators/__init__.py
@@ -11,14 +11,14 @@

# REGISTRY keys are the class names (i.e. type(model).__name__)
MODEL_REGISTRY = {
"SecondOrderPolynomial": SecondOrderPolynomial,
"RBF": RBF,
"RandomForest": RandomForest,
"GradientBoosting": GradientBoosting,
"GaussianProcessSk": GaussianProcessSk,
"SupportVectorMachines": SupportVectorMachines,
"XGBoost": XGBoost,
"NeuralNetSk": NeuralNetSk,
"NeuralNetTorch": NeuralNetTorch,
"SecondOrderPolynomial": SecondOrderPolynomial(),
"RBF": RBF(),
"RandomForest": RandomForest(),
"GradientBoosting": GradientBoosting(),
"GaussianProcessSk": GaussianProcessSk(),
"SupportVectorMachines": SupportVectorMachines(),
"XGBoost": XGBoost(),
"NeuralNetSk": NeuralNetSk(),
"NeuralNetTorch": NeuralNetTorch(module="mlp"),
# "GaussianProcess": GaussianProcess,
}
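
`MODEL_REGISTRY` now maps names to configured model *instances* (note the trailing parentheses and the `module="mlp"` argument for `NeuralNetTorch`) rather than to classes. A small sketch of how such a registry can be consumed, using scikit-learn stand-ins and `clone` to obtain fresh, unfitted copies:

```python
# Sketch: consuming a registry of configured instances with clone().
# Ridge and RandomForestRegressor stand in for the AutoEmulate emulators.
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge

MODEL_REGISTRY = {
    "Ridge": Ridge(alpha=1.0),
    "RandomForest": RandomForestRegressor(n_estimators=50),
}

def get_models(registry, subset=None):
    names = subset if subset is not None else list(registry)
    return [clone(registry[name]) for name in names]  # fresh, unfitted copies

models = get_models(MODEL_REGISTRY, subset=["Ridge"])
```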
4 changes: 2 additions & 2 deletions autoemulate/emulators/gaussian_process_sk.py
@@ -13,7 +13,7 @@
from skopt.space import Integer
from skopt.space import Real

from autoemulate.utils import suppress_convergence_warnings
from autoemulate.utils import _suppress_convergence_warnings


class GaussianProcessSk(BaseEstimator, RegressorMixin):
@@ -68,7 +68,7 @@ def fit(self, X, y):
copy_X_train=self.copy_X_train,
random_state=self.random_state,
)
with suppress_convergence_warnings():
with _suppress_convergence_warnings():
self.model_.fit(X, y)
self.is_fitted_ = True
return self
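`_suppress_convergence_warnings` lives in `autoemulate/utils.py` and is not part of this diff; a typical shape for such a context manager, given here purely as an assumption, filters scikit-learn's `ConvergenceWarning` while the wrapped block runs:

```python
# Assumed shape of the convergence-warning suppressor (the real helper is in
# autoemulate/utils.py and is not shown in this diff).
import warnings
from contextlib import contextmanager

from sklearn.exceptions import ConvergenceWarning

@contextmanager
def _suppress_convergence_warnings():
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=ConvergenceWarning)
        yield

# usage, as in GaussianProcessSk.fit and NeuralNetSk.fit
with _suppress_convergence_warnings():
    pass  # self.model_.fit(X, y) would run here
```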
4 changes: 2 additions & 2 deletions autoemulate/emulators/neural_net_sk.py
@@ -9,7 +9,7 @@
from skopt.space import Categorical
from skopt.space import Real

from autoemulate.utils import suppress_convergence_warnings
from autoemulate.utils import _suppress_convergence_warnings


class NeuralNetSk(BaseEstimator, RegressorMixin):
@@ -72,7 +72,7 @@ def fit(self, X, y):
max_iter=self.max_iter,
random_state=self.random_state,
)
with suppress_convergence_warnings():
with _suppress_convergence_warnings():
self.model_.fit(X, y)
# expose n_iter_ attribute to be consistent with sklearn estimators
self.n_iter_ = self.model_.n_iter_