First pass at adding type hints, see #69
kallewesterling committed Feb 26, 2024
1 parent bdfc97b commit 2ec3fb2
Showing 29 changed files with 1,453 additions and 314 deletions.
122 changes: 86 additions & 36 deletions autoemulate/compare.py
@@ -1,9 +1,12 @@
from typing import Optional

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import PredefinedSplit
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
@@ -36,22 +39,22 @@ def __init__(self):

def setup(
self,
X,
y,
param_search=False,
param_search_type="random",
param_search_iters=20,
test_set_size=0.2,
scale=True,
scaler=StandardScaler(),
reduce_dim=False,
dim_reducer=PCA(),
fold_strategy="kfold",
folds=5,
n_jobs=None,
model_subset=None,
log_to_file=False,
):
X: np.ndarray,
y: np.ndarray,
param_search: bool = False,
param_search_type: str = "random",
param_search_iters: int = 20,
test_set_size: float = 0.2,
scale: bool = True,
scaler: StandardScaler = StandardScaler(),
reduce_dim: bool = False,
dim_reducer: PCA = PCA(),
fold_strategy: str = "kfold",
folds: int = 5,
n_jobs: Optional[int] = None,
model_subset: Optional[list] = None,
log_to_file: bool = False,
) -> None:
"""Sets up the automatic emulation.
Parameters
@@ -119,7 +122,9 @@ def setup(
self.is_set_up = True
self.cv_results = {}

def _check_input(self, X, y):
def _check_input(
self, X: np.ndarray, y: np.ndarray
) -> tuple[np.ndarray, np.ndarray]:
"""Checks and possibly converts the input data.
Parameters
@@ -140,7 +145,7 @@ def _check_input(self, X, y):
y = y.astype("float32") # needed for pytorch models
return X, y

def _get_metrics(self, METRIC_REGISTRY):
def _get_metrics(self, METRIC_REGISTRY: dict) -> list:
"""
Get metrics from REGISTRY
@@ -156,7 +161,7 @@ def _get_metrics(self, METRIC_REGISTRY):
"""
return [metric for metric in METRIC_REGISTRY.values()]

def _get_cv(self, CV_REGISTRY, fold_strategy, folds):
def _get_cv(self, CV_REGISTRY: dict, fold_strategy: str, folds: int) -> KFold:
"""Get cross-validation strategy from REGISTRY
Parameters
@@ -175,7 +180,7 @@ def _get_cv(self, CV_REGISTRY, fold_strategy, folds):
"""
return CV_REGISTRY[fold_strategy](folds=folds, shuffle=True)

def compare(self):
def compare(self) -> pd.DataFrame:
"""Compares the emulator models on the data. self.setup() must be run first.
Returns
@@ -190,7 +195,12 @@ def compare(self):
self.scores_df = pd.DataFrame(
columns=["model", "metric", "fold", "score"]
).astype(
{"model": "object", "metric": "object", "fold": "int64", "score": "float64"}
{
"model": "object",
"metric": "object",
"fold": "int64",
"score": "float64",
}
)

for i in range(len(self.models)):
@@ -246,7 +256,7 @@ def compare(self):

return self.best_model

def get_model(self, rank=1, metric="r2"):
def get_model(self, rank: int = 1, metric: str = "r2"): # TODO: add return type
"""Get a fitted model based on it's rank in the comparison.
Parameters
@@ -284,7 +294,7 @@ def get_model(self, rank=1, metric="r2"):

return chosen_model

def refit_model(self, model):
def refit_model(self, model): # TODO: add model type
"""Refits a model on the full data.
Parameters
@@ -303,8 +313,22 @@ def refit_model(self, model):
model.fit(self.X, self.y)
return model

def save_model(self, model=None, filepath=None):
"""Saves the best model to disk."""
def save_model(
self, model=None, filepath: str = None
) -> None: # TODO add model type
"""Saves the best model to disk.
Parameters
----------
model : object
Fitted model.
filepath : str
Path to the model file.
Returns
-------
None
"""
if not hasattr(self, "best_model"):
raise RuntimeError("Must run compare() before save_model()")
serialiser = ModelSerialiser()
@@ -316,15 +340,27 @@ def save_model(self, model=None, filepath=None):

serialiser.save_model(model, filepath)

def load_model(self, filepath=None):
"""Loads a model from disk."""
def load_model(self, filepath: str = None): # TODO add return type (model type)
"""Loads a model from disk.
Parameters
----------
filepath : str
Path to the model file.
Returns
-------
model : object
Loaded model.
"""
serialiser = ModelSerialiser()
if filepath is None:
raise ValueError("Filepath must be provided")

return serialiser.load_model(filepath)

def print_results(self, model=None, sort_by="r2"):
# TODO for print_results: suggestion, rename model to model_name here to not confuse with other references to the model object
def print_results(self, model: Optional[str] = None, sort_by: str = "r2") -> None:
"""Print cv results.
Parameters
Expand All @@ -342,13 +378,14 @@ def print_results(self, model=None, sort_by="r2"):
sort_by=sort_by,
)

# TODO for plot_results: suggestion, rename model to model_name here to not confuse with other references to the model object
def plot_results(
self,
model=None,
plot_type="actual_vs_predicted",
n_cols=3,
figsize=None,
output_index=0,
model: Optional[str] = None,
plot_type: str = "actual_vs_predicted",
n_cols: int = 3,
figsize: Optional[tuple] = None,
output_index: int = 0,
):
"""Plots the results of the cross-validation.
@@ -379,7 +416,7 @@ def plot_results(
output_index=output_index,
)

def evaluate_model(self, model=None):
def evaluate_model(self, model=None) -> pd.DataFrame: # TODO add model type
"""
Evaluates the model on the hold-out set.
@@ -411,7 +448,13 @@ def plot_model(self, model, plot="standard", n_cols=2, figsize=None):

return scores_df

def plot_model(self, model, plot="standard", n_cols=2, figsize=None):
def plot_model(
self,
model,
plot: str = "standard",
n_cols: int = 2,
figsize: Optional[tuple] = None,
) -> None: # TODO add model type
"""Plots the model predictions vs. the true values.
Parameters
Expand All @@ -424,7 +467,14 @@ def plot_model(self, model, plot="standard", n_cols=2, figsize=None):
“residual” draws the residuals, i.e. difference between observed and predicted values, (y-axis) vs. the predicted values (x-axis).
n_cols : int, optional
Number of columns in the plot grid for multi-output. Default is 2.
figsize : tuple, optional
Overrides the default figure size.
"""
_plot_model(
model, self.X[self.test_idxs], self.y[self.test_idxs], plot, n_cols, figsize
model,
self.X[self.test_idxs],
self.y[self.test_idxs],
plot,
n_cols,
figsize,
)
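
For orientation, a minimal usage sketch of the API these hints now describe. The AutoEmulate class name and import path are assumptions here, since the diff shows only method bodies from autoemulate/compare.py:

import numpy as np
from autoemulate.compare import AutoEmulate  # assumed class name

X = np.random.uniform(size=(100, 5))                       # 100 samples, 5 inputs
y = X.sum(axis=1) + np.random.normal(scale=0.1, size=100)  # 1 output

ae = AutoEmulate()
ae.setup(X, y, test_set_size=0.2, fold_strategy="kfold", folds=5)
best = ae.compare()               # cross-validates all models, returns the best
ae.print_results(sort_by="r2")    # per-model cv scores
scores = ae.evaluate_model(best)  # pd.DataFrame of hold-out-set scores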
81 changes: 71 additions & 10 deletions autoemulate/cross_validate.py
@@ -1,14 +1,74 @@
from typing import TYPE_CHECKING

import numpy as np
import pandas as pd
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_validate
from sklearn.model_selection import PredefinedSplit
from sklearn.model_selection import train_test_split

from autoemulate.types import ArrayLike
from autoemulate.types import MatrixLike
from autoemulate.types import Union
from autoemulate.utils import get_model_name

if TYPE_CHECKING:
from logging import Logger
from .types import Iterable
from sklearn.model_selection import BaseCrossValidator
from sklearn.model_selection import BaseShuffleSplit
from sklearn.pipeline import Pipeline


def run_cv(X, y, cv, model, metrics, n_jobs, logger):
def run_cv(
X: MatrixLike,
y: Union[MatrixLike, ArrayLike],
cv: Union[int, BaseCrossValidator, Iterable, BaseShuffleSplit],
model: Pipeline, # TODO: Verify that this is correct
metrics: list,
n_jobs: int,
logger: Logger,
):
"""Runs cross-validation on a model.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
The data to fit. Can be for example a list, or an array.
y : array-like of shape (n_samples,) or (n_samples, n_outputs), default=None
The target variable to try to predict in the case of supervised learning.
cv : int, cross-validation generator or an iterable, default=None
Determines the cross-validation splitting strategy.
Possible inputs for cv are:
- None, to use the default 5-fold cross validation,
- int, to specify the number of folds in a `(Stratified)KFold`,
- CV splitter,
- An iterable yielding (train, test) splits as arrays of indices.
For int/None inputs, if the estimator is a classifier and ``y`` is
either binary or multiclass, :class:`StratifiedKFold` is used. In all
other cases, :class:`KFold` is used. These splitters are instantiated
with `shuffle=False` so the splits will be the same across calls.
Refer :ref:`User Guide <cross_validation>` for the various
cross-validation strategies that can be used here.
model : sklearn.pipeline.Pipeline
Model to cross-validate.
metrics : list
List of metrics to use for cross-validation.
n_jobs : int
Number of jobs to run in parallel.
logger : logging.Logger
Logger object.
Returns
-------
fitted_model : sklearn.pipeline.Pipeline
Fitted model.
cv_results : dict
Results of the cross-validation.
"""
model_name = get_model_name(model)

# The metrics we want to use for cross-validation
@@ -39,22 +99,23 @@ def run_cv(X, y, cv, model, metrics, n_jobs, logger):
return fitted_model, cv_results


def update_scores_df(scores_df, model, cv_results):
# TODO for update_scores_df: suggestion, rename model to model_name here to not confuse with other references to the model object
def update_scores_df(scores_df: pd.DataFrame, model: str, cv_results: dict) -> None:
"""Updates the scores dataframe with the results of the cross-validation.
Parameters
----------
scores_df : pandas.DataFrame
DataFrame with columns "model", "metric", "fold", "score".
model_name : str
Name of the model.
cv_results : dict
Results of the cross-validation.
scores_df : pandas.DataFrame
DataFrame with columns "model", "metric", "fold", "score".
model_name : str
Name of the model.
cv_results : dict
Results of the cross-validation.
Returns
-------
None
Modifies the self.scores_df DataFrame in-place.
None
Modifies the self.scores_df DataFrame in-place.
"""
# Gather scores from each metric
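
A sketch of calling run_cv directly. The metric entries are assumed to be plain sklearn-style callables, as _get_metrics in compare.py suggests; run_cv's internals are not shown in this hunk, so that format is an assumption:

import logging
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from autoemulate.cross_validate import run_cv

X = np.random.uniform(size=(60, 4))
y = X.sum(axis=1)

# cv accepts an int, a CV splitter, or an iterable of (train, test) indices
model = Pipeline([("scaler", StandardScaler()), ("rf", RandomForestRegressor())])
fitted_model, cv_results = run_cv(
    X, y, cv=KFold(n_splits=5, shuffle=True), model=model,
    metrics=[r2_score], n_jobs=1, logger=logging.getLogger("autoemulate"),
)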
11 changes: 7 additions & 4 deletions autoemulate/cv.py
@@ -1,21 +1,24 @@
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

from .types import Optional

def kfold(folds=None, shuffle=True):

# TODO: KFold seems to only accept a n_splits parameter of Int type so we should also enforce that here to avoid bugs
def kfold(folds: Optional[int] = None, shuffle: bool = True) -> KFold:
"""scikit-learn class for k-fold cross validation.
Parameters
----------
folds : int
Number of folds.
Number of folds. Must be at least 2.
shuffle : bool
Whether or not to shuffle the data before splitting.
Whether or not to shuffle the data before splitting.
Returns
-------
kfold : sklearn.model_selection.KFold
An instance of the KFold class.
An instance of the KFold class.
"""
return KFold(n_splits=folds, shuffle=shuffle)

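
The TODO above is easy to verify: scikit-learn's KFold rejects a non-integer n_splits at construction time, so the Optional[int] default of None fails as soon as kfold() is called without folds. A quick sketch:

from autoemulate.cv import kfold

cv = kfold(folds=5)  # KFold(n_splits=5, shuffle=True)

try:
    kfold()  # folds defaults to None, which KFold rejects
except ValueError as err:
    print(err)  # "The number of folds must be of Integral type..."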
7 changes: 6 additions & 1 deletion autoemulate/data_splitting.py
@@ -1,8 +1,13 @@
import numpy as np
from sklearn.model_selection import train_test_split

from .types import ArrayLike
from .types import Optional

def split_data(X, test_size=0.2, random_state=None):

def split_data(
X: ArrayLike, test_size: float = 0.2, random_state: Optional[int] = None
) -> tuple[ArrayLike, ArrayLike]:
"""Splits the data into training and testing sets.
Parameters
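
The docstring is truncated in this view, but the tuple[ArrayLike, ArrayLike] return hint plus compare.py's use of self.test_idxs suggest split_data returns index arrays rather than data arrays — an assumption flagged in this sketch:

import numpy as np
from autoemulate.data_splitting import split_data

X = np.random.uniform(size=(50, 3))
train_idx, test_idx = split_data(X, test_size=0.2, random_state=0)

# compare.py indexes with self.X[self.test_idxs], so the returned values
# are presumably row indices (assumption), usable like this:
X_train, X_test = X[train_idx], X[test_idx]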