First pass at adding type hints, see #69
kallewesterling committed Feb 26, 2024
1 parent bdfc97b commit 2ec3fb2
Showing 29 changed files with 1,453 additions and 314 deletions.
122 changes: 86 additions & 36 deletions autoemulate/compare.py
@@ -1,9 +1,12 @@
from typing import Optional

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import PredefinedSplit
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
@@ -36,22 +39,22 @@ def __init__(self):

def setup(
self,
X,
y,
param_search=False,
param_search_type="random",
param_search_iters=20,
test_set_size=0.2,
scale=True,
scaler=StandardScaler(),
reduce_dim=False,
dim_reducer=PCA(),
fold_strategy="kfold",
folds=5,
n_jobs=None,
model_subset=None,
log_to_file=False,
):
X: np.ndarray,
y: np.ndarray,
param_search: bool = False,
param_search_type: str = "random",
param_search_iters: int = 20,
test_set_size: float = 0.2,
scale: bool = True,
scaler: StandardScaler = StandardScaler(),
reduce_dim: bool = False,
dim_reducer: PCA = PCA(),
fold_strategy: str = "kfold",
folds: int = 5,
n_jobs: Optional[int] = None,
model_subset: Optional[list] = None,
log_to_file: bool = False,
) -> None:
"""Sets up the automatic emulation.
Parameters
@@ -119,7 +122,9 @@ def setup(
self.is_set_up = True
self.cv_results = {}

def _check_input(self, X, y):
def _check_input(
self, X: np.ndarray, y: np.ndarray
) -> tuple[np.ndarray, np.ndarray]:
"""Checks and possibly converts the input data.
Parameters
@@ -140,7 +145,7 @@ def _check_input(self, X, y):
y = y.astype("float32") # needed for pytorch models
return X, y

def _get_metrics(self, METRIC_REGISTRY):
def _get_metrics(self, METRIC_REGISTRY: dict) -> list:
"""
Get metrics from REGISTRY
@@ -156,7 +161,7 @@ def _get_metrics(self, METRIC_REGISTRY):
"""
return [metric for metric in METRIC_REGISTRY.values()]

def _get_cv(self, CV_REGISTRY, fold_strategy, folds):
def _get_cv(self, CV_REGISTRY: dict, fold_strategy: str, folds: int) -> KFold:
"""Get cross-validation strategy from REGISTRY
Parameters
@@ -175,7 +180,7 @@ def _get_cv(self, CV_REGISTRY, fold_strategy, folds):
"""
return CV_REGISTRY[fold_strategy](folds=folds, shuffle=True)

def compare(self):
def compare(self) -> pd.DataFrame:
"""Compares the emulator models on the data. self.setup() must be run first.
Returns
@@ -190,7 +195,12 @@ def compare(self):
self.scores_df = pd.DataFrame(
columns=["model", "metric", "fold", "score"]
).astype(
{"model": "object", "metric": "object", "fold": "int64", "score": "float64"}
{
"model": "object",
"metric": "object",
"fold": "int64",
"score": "float64",
}
)

for i in range(len(self.models)):
@@ -246,7 +256,7 @@ def compare(self):

return self.best_model

def get_model(self, rank=1, metric="r2"):
def get_model(self, rank: int = 1, metric: str = "r2"): # TODO: add return type
"""Get a fitted model based on it's rank in the comparison.
Parameters
@@ -284,7 +294,7 @@ def get_model(self, rank=1, metric="r2"):

return chosen_model

def refit_model(self, model):
def refit_model(self, model): # TODO: add model type
"""Refits a model on the full data.
Parameters
@@ -303,8 +313,22 @@ def refit_model(self, model):
model.fit(self.X, self.y)
return model

def save_model(self, model=None, filepath=None):
"""Saves the best model to disk."""
def save_model(
self, model=None, filepath: str = None
) -> None: # TODO add model type
"""Saves the best model to disk.
Parameters
----------
model : object
Fitted model.
filepath : str
Path to the model file.
Returns
-------
None
"""
if not hasattr(self, "best_model"):
raise RuntimeError("Must run compare() before save_model()")
serialiser = ModelSerialiser()
@@ -316,15 +340,27 @@ def save_model(self, model=None, filepath=None):

serialiser.save_model(model, filepath)

def load_model(self, filepath=None):
"""Loads a model from disk."""
def load_model(self, filepath: str = None): # TODO add return type (model type)
"""Loads a model from disk.
Parameters
----------
filepath : str
Path to the model file.
Returns
-------
model : object
Loaded model.
"""
serialiser = ModelSerialiser()
if filepath is None:
raise ValueError("Filepath must be provided")

return serialiser.load_model(filepath)

def print_results(self, model=None, sort_by="r2"):
# TODO for print_results: suggestion, rename model to model_name here to not confuse with other references to the model object
def print_results(self, model: Optional[str] = None, sort_by: str = "r2") -> None:
"""Print cv results.
Parameters
Expand All @@ -342,13 +378,14 @@ def print_results(self, model=None, sort_by="r2"):
sort_by=sort_by,
)

# TODO for plot_results: suggestion, rename model to model_name here to not confuse with other references to the model object
def plot_results(
self,
model=None,
plot_type="actual_vs_predicted",
n_cols=3,
figsize=None,
output_index=0,
model: Optional[str] = None,
plot_type: str = "actual_vs_predicted",
n_cols: int = 3,
figsize: Optional[tuple] = None,
output_index: int = 0,
):
"""Plots the results of the cross-validation.
@@ -379,7 +416,7 @@ def plot_results(
output_index=output_index,
)

def evaluate_model(self, model=None):
def evaluate_model(self, model=None) -> pd.DataFrame: # TODO add model type
"""
Evaluates the model on the hold-out set.
@@ -411,7 +448,13 @@ def plot_model(self, model, plot="standard", n_cols=2, figsize=None):

return scores_df

def plot_model(self, model, plot="standard", n_cols=2, figsize=None):
def plot_model(
self,
model,
plot: str = "standard",
n_cols: int = 2,
figsize: Optional[tuple] = None,
) -> None: # TODO add model type
"""Plots the model predictions vs. the true values.
Parameters
Expand All @@ -424,7 +467,14 @@ def plot_model(self, model, plot="standard", n_cols=2, figsize=None):
“residual” draws the residuals, i.e. difference between observed and predicted values, (y-axis) vs. the predicted values (x-axis).
n_cols : int, optional
Number of columns in the plot grid for multi-output. Default is 2.
figsize : tuple, optional
Overrides the default figure size.
"""
_plot_model(
model, self.X[self.test_idxs], self.y[self.test_idxs], plot, n_cols, figsize
model,
self.X[self.test_idxs],
self.y[self.test_idxs],
plot,
n_cols,
figsize,
)
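
For orientation, a minimal usage sketch of the API these hints now describe. The AutoEmulate class name and import path are assumptions here, since the diff shows only method bodies from autoemulate/compare.py:

import numpy as np
from autoemulate.compare import AutoEmulate  # assumed class name

X = np.random.uniform(size=(100, 5))                       # 100 samples, 5 inputs
y = X.sum(axis=1) + np.random.normal(scale=0.1, size=100)  # 1 output

ae = AutoEmulate()
ae.setup(X, y, test_set_size=0.2, fold_strategy="kfold", folds=5)
best = ae.compare()               # cross-validates all models, returns the best
ae.print_results(sort_by="r2")    # per-model cv scores
scores = ae.evaluate_model(best)  # pd.DataFrame of hold-out-set scores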
81 changes: 71 additions & 10 deletions autoemulate/cross_validate.py
@@ -1,14 +1,74 @@
from typing import TYPE_CHECKING

import numpy as np
import pandas as pd
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_validate
from sklearn.model_selection import PredefinedSplit
from sklearn.model_selection import train_test_split

from autoemulate.types import ArrayLike
from autoemulate.types import MatrixLike
from autoemulate.types import Union
from autoemulate.utils import get_model_name

if TYPE_CHECKING:
from logging import Logger
from .types import Iterable
from sklearn.model_selection import BaseCrossValidator
from sklearn.model_selection import BaseShuffleSplit
from sklearn.pipeline import Pipeline


def run_cv(X, y, cv, model, metrics, n_jobs, logger):
def run_cv(
X: MatrixLike,
y: Union[MatrixLike, ArrayLike],
cv: Union[int, BaseCrossValidator, Iterable, BaseShuffleSplit],
model: Pipeline, # TODO: Verify that this is correct
metrics: list,
n_jobs: int,
logger: Logger,
):
"""Runs cross-validation on a model.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
The data to fit. Can be for example a list, or an array.
y : array-like of shape (n_samples,) or (n_samples, n_outputs), default=None
The target variable to try to predict in the case of supervised learning.
cv : int, cross-validation generator or an iterable, default=None
Determines the cross-validation splitting strategy.
Possible inputs for cv are:
- None, to use the default 5-fold cross validation,
- int, to specify the number of folds in a `(Stratified)KFold`,
- CV splitter,
- An iterable yielding (train, test) splits as arrays of indices.
For int/None inputs, if the estimator is a classifier and ``y`` is
either binary or multiclass, :class:`StratifiedKFold` is used. In all
other cases, :class:`KFold` is used. These splitters are instantiated
with `shuffle=False` so the splits will be the same across calls.
Refer :ref:`User Guide <cross_validation>` for the various
cross-validation strategies that can be used here.
model : sklearn.pipeline.Pipeline
Model to cross-validate.
metrics : list
List of metrics to use for cross-validation.
n_jobs : int
Number of jobs to run in parallel.
logger : logging.Logger
Logger object.
Returns
-------
fitted_model : sklearn.pipeline.Pipeline
Fitted model.
cv_results : dict
Results of the cross-validation.
"""
model_name = get_model_name(model)

# The metrics we want to use for cross-validation
@@ -39,22 +99,23 @@ def run_cv(X, y, cv, model, metrics, n_jobs, logger):
return fitted_model, cv_results


def update_scores_df(scores_df, model, cv_results):
# TODO for update_scores_df: suggestion, rename model to model_name here to not confuse with other references to the model object
def update_scores_df(scores_df: pd.DataFrame, model: str, cv_results: dict) -> None:
"""Updates the scores dataframe with the results of the cross-validation.
Parameters
----------
scores_df : pandas.DataFrame
DataFrame with columns "model", "metric", "fold", "score".
model_name : str
Name of the model.
cv_results : dict
Results of the cross-validation.
scores_df : pandas.DataFrame
DataFrame with columns "model", "metric", "fold", "score".
model_name : str
Name of the model.
cv_results : dict
Results of the cross-validation.
Returns
-------
None
Modifies the self.scores_df DataFrame in-place.
None
Modifies the self.scores_df DataFrame in-place.
"""
# Gather scores from each metric
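
A sketch of calling run_cv directly. The metric entries are assumed to be plain sklearn-style callables, as _get_metrics in compare.py suggests; run_cv's internals are not shown in this hunk, so that format is an assumption:

import logging
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from autoemulate.cross_validate import run_cv

X = np.random.uniform(size=(60, 4))
y = X.sum(axis=1)

# cv accepts an int, a CV splitter, or an iterable of (train, test) indices
model = Pipeline([("scaler", StandardScaler()), ("rf", RandomForestRegressor())])
fitted_model, cv_results = run_cv(
    X, y, cv=KFold(n_splits=5, shuffle=True), model=model,
    metrics=[r2_score], n_jobs=1, logger=logging.getLogger("autoemulate"),
)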
11 changes: 7 additions & 4 deletions autoemulate/cv.py
@@ -1,21 +1,24 @@
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

from .types import Optional

def kfold(folds=None, shuffle=True):

# TODO: KFold seems to only accept a n_splits parameter of Int type so we should also enforce that here to avoid bugs
def kfold(folds: Optional[int] = None, shuffle: bool = True) -> KFold:
"""scikit-learn class for k-fold cross validation.
Parameters
----------
folds : int
Number of folds.
Number of folds. Must be at least 2.
shuffle : bool
Whether or not to shuffle the data before splitting.
Whether or not to shuffle the data before splitting.
Returns
-------
kfold : sklearn.model_selection.KFold
An instance of the KFold class.
An instance of the KFold class.
"""
return KFold(n_splits=folds, shuffle=shuffle)

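
The TODO above is easy to verify: scikit-learn's KFold rejects a non-integer n_splits at construction time, so the Optional[int] default of None fails as soon as kfold() is called without folds. A quick sketch:

from autoemulate.cv import kfold

cv = kfold(folds=5)  # KFold(n_splits=5, shuffle=True)

try:
    kfold()  # folds defaults to None, which KFold rejects
except ValueError as err:
    print(err)  # "The number of folds must be of Integral type..."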
7 changes: 6 additions & 1 deletion autoemulate/data_splitting.py
@@ -1,8 +1,13 @@
import numpy as np
from sklearn.model_selection import train_test_split

from .types import ArrayLike
from .types import Optional

def split_data(X, test_size=0.2, random_state=None):

def split_data(
X: ArrayLike, test_size: float = 0.2, random_state: Optional[int] = None
) -> tuple[ArrayLike, ArrayLike]:
"""Splits the data into training and testing sets.
Parameters
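
The docstring is truncated in this view, but the tuple[ArrayLike, ArrayLike] return hint plus compare.py's use of self.test_idxs suggest split_data returns index arrays rather than data arrays — an assumption flagged in this sketch:

import numpy as np
from autoemulate.data_splitting import split_data

X = np.random.uniform(size=(50, 3))
train_idx, test_idx = split_data(X, test_size=0.2, random_state=0)

# compare.py indexes with self.X[self.test_idxs], so the returned values
# are presumably row indices (assumption), usable like this:
X_train, X_test = X[train_idx], X[test_idx]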