Merge pull request #35 from alan-turing-institute/scikit-learn-estimators

convert models to scikit learn estimators
mastoffel authored Oct 30, 2023
2 parents ee19589 + ac90661 commit ecac96e
Showing 15 changed files with 541 additions and 505 deletions.
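The core of this change is making each emulator conform to the scikit-learn estimator API, so that generic utilities such as cross_val_score and GridSearchCV can drive them directly. A minimal sketch of that pattern, wrapping a scikit-learn regressor; the class and attribute names here are illustrative, not the repository's actual code:

import numpy as np
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.ensemble import RandomForestRegressor
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted

class ExampleEmulator(BaseEstimator, RegressorMixin):
    """Illustrative emulator following the scikit-learn estimator API."""

    def __init__(self, n_estimators=100):
        # The constructor only stores hyperparameters, with no validation,
        # so that get_params()/set_params() work with GridSearchCV.
        self.n_estimators = n_estimators

    def fit(self, X, y):
        # Same validation call the refactored setup() uses below.
        X, y = check_X_y(X, y, multi_output=True, y_numeric=True)
        self.model_ = RandomForestRegressor(n_estimators=self.n_estimators)
        self.model_.fit(X, y)
        return self

    def predict(self, X):
        check_is_fitted(self, "model_")
        X = check_array(X)
        return self.model_.predict(X)

RegressorMixin supplies a default score() (R²), and BaseEstimator supplies get_params()/set_params(), which is what lets the comparison loop treat every registered model uniformly.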
9 changes: 9 additions & 0 deletions autoemulate/__init__.py
@@ -0,0 +1,9 @@
import logging

logging.basicConfig(
level=logging.INFO,
# format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
format="%(message)s",
# filename='autoemulate.log'
)
logging.captureWarnings(True)
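logging.captureWarnings(True) redirects messages issued via warnings.warn() to a logger named py.warnings, so they pass through the format configured above instead of going straight to stderr. A quick standalone illustration, not part of the package:

import logging
import warnings

logging.basicConfig(level=logging.INFO, format="%(message)s")
logging.captureWarnings(True)

# This is emitted through the py.warnings logger rather than raw stderr.
warnings.warn("convergence not reached")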
274 changes: 58 additions & 216 deletions autoemulate/compare.py
@@ -1,16 +1,24 @@
from sklearn.model_selection import KFold
from autoemulate.experimental_design import LatinHypercube
from autoemulate.metrics import METRIC_REGISTRY
from autoemulate.emulators import MODEL_REGISTRY
from autoemulate.cv import CV_REGISTRY
from autoemulate.logging_config import configure_logging

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# import check_X_y from sklearn
from sklearn.utils.validation import check_X_y
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
import logging

verbose = 0 # or 1, based on user input or command-line argument
configure_logging(verbose)

class AutoEmulate:
"""Automatically compares emulators."""

class AutoEmulate:
def __init__(self):
"""Initializes an AutoEmulate object."""
self.X = None
@@ -21,9 +29,8 @@ def __init__(self):
{"model": "object", "metric": "object", "fold": "int64", "score": "float64"}
)
self.is_set_up = False
self.predictions_data = {}

def setup(self, X, y, fold_strategy="kfold", folds=5):
def setup(self, X, y, hyperparameter_search=False, fold_strategy="kfold", folds=5):
"""Sets up the AutoEmulate object.
Parameters
@@ -38,236 +45,71 @@ def setup(self, X, y, fold_strategy="kfold", folds=5):
Number of folds.
"""
self._check_data(X, y)
self._preprocess_data(X, y)
self.models = [
MODEL_REGISTRY[model_name]() for model_name in MODEL_REGISTRY.keys()
]
self.metrics = [metric_name for metric_name in METRIC_REGISTRY.keys()]
self.X, self.y = check_X_y(X, y, multi_output=True, y_numeric=True)
self.models = [model() for model in MODEL_REGISTRY.values()]
self.metrics = [metric for metric in METRIC_REGISTRY.keys()]
self.cv = CV_REGISTRY[fold_strategy](folds=folds, shuffle=True)
self.is_set_up = True
self.hyperparameter_search = hyperparameter_search
self.logger = logging.getLogger(type(self).__name__)

def compare(self):
"""Compares the emulators."""
if not self.is_set_up:
raise RuntimeError("Must run setup() before compare()")

print(f"Starting {self.cv}-fold cross-validation...")

for model in self.models:
model_name = type(model).__name__
print(f"Training {model_name}...")
self._score_model_with_cv(model)

def print_scores(self, model=None):
"""Prints the scores of the emulators.
if self.hyperparameter_search:
self.perform_hyperparameter_search_for_model(model)

Parameters
----------
model : str, optional
If model is None, prints the average scores across all models.
Otherwise, prints the scores for the specified model across folds.
self.logger.info(f"Training {model_name}...")
self.logger.info(f"Parameters: {model.get_params()}")
for metric_name in self.metrics:
scorer = make_scorer(METRIC_REGISTRY[metric_name])
scores = cross_val_score(
model, self.X, self.y, cv=self.cv, scoring=scorer
)
for fold, score in enumerate(scores):
new_row = pd.DataFrame(
{
"model": [model_name],
"metric": [metric_name],
"fold": [fold],
"score": [score],
}
)
self.scores_df = pd.concat(
[self.scores_df, new_row], ignore_index=True
)

def perform_hyperparameter_search_for_model(self, model):
model_name = type(model).__name__
self.logger.info(f"Performing grid search for {model_name}...")
param_grid = (
model.get_grid_params()
) # Assumes that each model has a `get_grid_params` method
grid_search = GridSearchCV(model, param_grid, cv=self.cv)
grid_search.fit(self.X, self.y)
best_params = grid_search.best_params_
self.logger.info(f"Best parameters for {model_name}: {best_params}")
model.set_params(**best_params) # Update the model with the best parameters

"""
def print_scores(self, model=None):
if model is None:
means = (
self.scores_df.groupby(["model", "metric"])["score"]
.mean()
.unstack()
.reset_index()
.sort_values(by="r2", ascending=False)
)
print("Average scores across all models:")
print(means.to_string(index=False))

print(means)
else:
specific_model_scores = self.scores_df[self.scores_df["model"] == model]
folds = (
specific_model_scores.groupby(["metric", "fold"])["score"]
.mean()
.unstack()
.transpose()
scores = (
self.scores_df[self.scores_df["model"] == model]
.pivot(index="fold", columns="metric", values="score")
.pipe(print)
)
folds.columns.name = None
folds.index.name = None
folds.loc["Mean"] = folds.mean()
folds.loc["Std Dev"] = folds.std()
print(f"Scores for {model} across all folds:")
print(folds.to_string())

def _check_data(self, X, y):
"""Validates data.
Parameters
----------
X : array-like, shape (n_samples, n_features)
Simulation input.
y : array-like, shape (n_samples, n_outputs)
Simulation output.
"""
if X.shape[0] != y.shape[0]:
raise ValueError("X and y must have the same number of samples.")
if np.isnan(X).any() or np.isnan(y).any():
raise ValueError("X and y should not contain NaNs.")

def _preprocess_data(self, X, y):
"""Preprocesses data.
Parameters
----------
X : array-like, shape (n_samples, n_features)
Simulation input.
y : array-like, shape (n_samples, n_outputs)
Simulation output.
"""
self.X = np.array(X)
self.y = np.array(y)

def _train_model(self, model, X, y):
"""Trains the model.
Parameters
----------
model : object
The model to train.
X : array-like, shape (n_samples, n_features)
Simulation input.
Returns
-------
model : object
The trained model.
"""
model.fit(X, y)
return model

def _evaluate_model(self, trained_model, X, y):
"""Evaluates the model.
Parameters
----------
trained_model : object
The trained model.
X : array-like, shape (n_samples, n_features)
Simulation input.
y : array-like, shape (n_samples, n_outputs)
Simulation output.
Returns
-------
scores : dict
The scores of the model.
"""
scores = {}
for metric in self.metrics:
metric_func = METRIC_REGISTRY[metric]
score = trained_model.score(X, y, metric=metric_func)
scores[metric] = score
return scores

def _score_model_with_cv(self, model):
"""Scores the model using cross-validation.
Parameters
----------
model : object
The model to score.
Returns
-------
scores_df : pandas.DataFrame
The scores of the model.
"""
cv = self.cv
model_name = type(model).__name__
if model_name not in self.predictions_data:
self.predictions_data[model_name] = {}

for fold, (train_index, test_index) in enumerate(cv.split(self.X)):
X_train, X_test = self.X[train_index], self.X[test_index]
y_train, y_test = self.y[train_index], self.y[test_index]

trained_model = self._train_model(model, X_train, y_train)
fold_scores = self._evaluate_model(trained_model, X_test, y_test)

y_pred = trained_model.predict(X_test)
self.predictions_data[model_name][fold] = {
"y_true": y_test,
"y_pred": y_pred,
}

for metric, score in fold_scores.items():
new_row = pd.DataFrame(
{
"model": [model_name],
"metric": [metric],
"fold": [fold], # Now correctly included
"score": [score],
}
)
self.scores_df = pd.concat([self.scores_df, new_row], ignore_index=True)

def plot_predictions(self, model=None):
"""Plot predictions vs ground truth based on the condition.
Parameters
----------
model : str, optional
Name of the model to plot. If None, plots the best fold for each model.
"""
if not self.predictions_data:
print("No prediction data available for plotting.")
return

if model:
# Plot all folds for the specified model
if model not in self.predictions_data:
print(f"No prediction data available for model {model}.")
return

model_data = self.predictions_data[model]
num_folds = len(model_data)

fig, axes = plt.subplots(1, num_folds, figsize=(15, 3))
if num_folds == 1:
axes = [axes] # Make it iterable

for fold, ax in zip(model_data.keys(), axes):
data = model_data[fold]
y_true = np.array(data["y_true"])
y_pred = np.array(data["y_pred"])

ax.scatter(y_true, y_pred, alpha=0.5)
ax.set_xlabel("Simulation output")
ax.set_ylabel("Predictions")
ax.set_title(f"Model: {model}, Fold: {fold}")

else:
# Plot the best fold for each model
best_folds = (
self.scores_df[self.scores_df["metric"] == "r2"]
.groupby(["model"])["score"]
.idxmax()
.map(lambda x: self.scores_df.loc[x, "fold"])
.to_dict()
)

fig, axes = plt.subplots(1, len(best_folds), figsize=(20, 3))
if len(best_folds) == 1:
axes = [axes] # Make it iterable

for ax, (model, best_fold) in zip(axes, best_folds.items()):
model_data = self.predictions_data.get(model, {})
data = model_data.get(best_fold, {})

y_true = np.array(data.get("y_true", []))
y_pred = np.array(data.get("y_pred", []))

ax.scatter(y_true, y_pred, alpha=0.5)
ax.set_xlabel("Simulation output")
ax.set_ylabel("Predictions")
ax.set_title(f"Model: {model}, Best Fold: {best_fold}")

plt.tight_layout()
plt.show()
print(scores)
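After this refactor, scoring goes through scikit-learn's generic machinery: each metric in METRIC_REGISTRY is wrapped with make_scorer and handed to cross_val_score together with the cross-validator. A standalone sketch of that path, using a plain KFold and r2_score as stand-ins since the registry contents are not shown in this diff:

import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer, r2_score
from sklearn.model_selection import KFold, cross_val_score

# Synthetic data standing in for simulation inputs/outputs.
rng = np.random.default_rng(0)
X = rng.normal(size=(100, 3))
y = X @ np.array([1.0, -2.0, 0.5]) + rng.normal(scale=0.1, size=100)

model = RandomForestRegressor(n_estimators=50)
cv = KFold(n_splits=5, shuffle=True, random_state=0)
scorer = make_scorer(r2_score)  # stand-in for a METRIC_REGISTRY entry

# One score per fold, exactly what the compare() loop appends to scores_df.
scores = cross_val_score(model, X, y, cv=cv, scoring=scorer)
for fold, score in enumerate(scores):
    print(f"fold {fold}: r2 = {score:.3f}")

This replaces the hand-rolled _train_model/_evaluate_model/_score_model_with_cv loop deleted above with a single library call per metric.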
4 changes: 2 additions & 2 deletions autoemulate/emulators/__init__.py
@@ -1,12 +1,12 @@
from .base import Emulator
from .gaussian_process import GaussianProcess
from .gaussian_process2 import GaussianProcess2
from .gaussian_process_sk import GaussianProcessSk
from .neural_network import NeuralNetwork
from .random_forest import RandomForest

MODEL_REGISTRY = {
"GaussianProcess": GaussianProcess,
"GaussianProcess2": GaussianProcess2,
"GaussianProcessSk": GaussianProcessSk,
"NeuralNetwork": NeuralNetwork,
"RandomForest": RandomForest,
}
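The grid-search branch in compare.py assumes every registered emulator exposes a get_grid_params() method returning a parameter grid (the comment in the diff says as much). A hedged sketch of how a model could satisfy that contract; the method name comes from the diff, while the ridge-style fit and grid values are invented for illustration:

import numpy as np
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.model_selection import GridSearchCV, KFold

class SketchEmulator(BaseEstimator, RegressorMixin):
    """Illustrative model exposing the get_grid_params contract."""

    def __init__(self, alpha=1.0):
        self.alpha = alpha

    def fit(self, X, y):
        # Closed-form ridge fit, kept deliberately simple.
        X, y = np.asarray(X), np.asarray(y)
        A = X.T @ X + self.alpha * np.eye(X.shape[1])
        self.coef_ = np.linalg.solve(A, X.T @ y)
        return self

    def predict(self, X):
        return np.asarray(X) @ self.coef_

    def get_grid_params(self):
        # Grid values are illustrative; the real emulators define their own.
        return {"alpha": [0.1, 1.0, 10.0]}

# Mirrors perform_hyperparameter_search_for_model in compare.py:
X = np.random.default_rng(1).normal(size=(50, 4))
y = X @ np.array([2.0, 0.0, -1.0, 0.5])
search = GridSearchCV(SketchEmulator(), SketchEmulator().get_grid_params(),
                      cv=KFold(n_splits=3))
search.fit(X, y)
print(search.best_params_)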
(The remaining 12 changed files are not shown.)
