ENH: add estimator module
Candice Moyet committed Jul 7, 2023
1 parent 90eb152 commit 1dbf7c6
Showing 6 changed files with 348 additions and 7 deletions.
8 changes: 4 additions & 4 deletions mapie/conformity_scores/conformity_scores.py
@@ -2,10 +2,10 @@

import numpy as np
from typing import Tuple
-from sklearn.base import RegressorMixin

from mapie._compatibility import np_nanquantile
from mapie._typing import ArrayLike, NDArray
+from mapie.estimator.interface import EnsembleEstimator


class ConformityScore(metaclass=ABCMeta):
@@ -257,22 +257,22 @@ def get_quantile(
def get_bounds(
self,
X: ArrayLike,
-estimator: RegressorMixin,
+estimator: EnsembleEstimator,
conformity_scores: NDArray,
alpha_np: NDArray,
ensemble: bool,
method: str
) -> Tuple[NDArray, NDArray, NDArray]:
"""
Compute bounds of the prediction intervals from the observed values,
-the estimator of type ``EnsembleRegressor`` and the conformity scores.
+the estimator of type ``EnsembleEstimator`` and the conformity scores.
Parameters
----------
X: ArrayLike of shape (n_samples_test, n_features)
Observed feature values.
-estimator: RegressorMixin
+estimator: EnsembleEstimator
Estimator that is fitted to predict y from X.
conformity_scores: ArrayLike of shape (n_samples_calib,)
Empty file added mapie/estimator/__init__.py
Empty file.
@@ -13,9 +13,10 @@
from mapie.aggregation_functions import aggregate_all, phi2D
from mapie.utils import (check_nan_in_aposteriori_prediction,
fit_estimator)
+from mapie.estimator.interface import EnsembleEstimator


-class EnsembleRegressor(RegressorMixin):
+class EnsembleRegressor(EnsembleEstimator):
"""
This class implements methods to handle the training and usage of the
estimator. This estimator can be unique or composed of cross-validated
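For orientation, a minimal sketch (not part of this diff) of what the change above means in practice: after this commit ``EnsembleRegressor`` derives from the new ``EnsembleEstimator`` interface, so it can be passed anywhere the interface is expected. The import paths are taken directly from the hunks in this commit; running it assumes a checkout at or after this commit.

# Sketch: check the inheritance relationship introduced by this commit.
from mapie.estimator.interface import EnsembleEstimator
from mapie.estimator.estimator import EnsembleRegressor

assert issubclass(EnsembleRegressor, EnsembleEstimator)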
340 changes: 340 additions & 0 deletions mapie/estimator/interface.py
@@ -0,0 +1,340 @@
from __future__ import annotations

from typing import Optional, Tuple, Union

from sklearn.base import RegressorMixin

from mapie._typing import ArrayLike, NDArray


class EnsembleEstimator(RegressorMixin):
"""
This class implements methods to handle the training and usage of the
estimator. This estimator can be unique or composed of cross-validated
estimators.
Parameters
----------
estimator: Optional[RegressorMixin]
Any regressor with scikit-learn API
(i.e. with ``fit`` and ``predict`` methods).
If ``None``, estimator defaults to a ``LinearRegression`` instance.
By default ``None``.
method: str
Method to choose for prediction interval estimates.
Choose among:
- ``"naive"``, based on training set conformity scores,
- ``"base"``, based on validation sets conformity scores,
- ``"plus"``, based on validation conformity scores and
testing predictions,
- ``"minmax"``, based on validation conformity scores and
testing predictions (min/max among cross-validation clones).
By default ``"plus"``.
cv: Optional[Union[int, str, BaseCrossValidator]]
The cross-validation strategy for computing conformity scores.
It directly drives the distinction between jackknife and cv variants.
Choose among:
- ``None``, to use the default 5-fold cross-validation
- integer, to specify the number of folds.
If equal to ``-1``, equivalent to
``sklearn.model_selection.LeaveOneOut()``.
- CV splitter: any ``sklearn.model_selection.BaseCrossValidator``
Main variants are:
- ``sklearn.model_selection.LeaveOneOut`` (jackknife),
- ``sklearn.model_selection.KFold`` (cross-validation),
- ``subsample.Subsample`` object (bootstrap).
- ``"split"``, does not involve cross-validation but a division
of the data into training and calibration subsets. The splitter
used is the following: ``sklearn.model_selection.ShuffleSplit``.
- ``"prefit"``, assumes that ``estimator`` has been fitted already,
and the ``method`` parameter is ignored.
All data provided in the ``fit`` method is then used
for computing conformity scores only.
At prediction time, quantiles of these conformity scores are used
to provide a prediction interval with fixed width.
The user must manually ensure that the data used for model fitting and
the data used to estimate conformity scores are disjoint.
By default ``None``.
test_size: Optional[Union[int, float]]
If ``float``, should be between ``0.0`` and ``1.0`` and represent the
proportion of the dataset to include in the test split. If ``int``,
represents the absolute number of test samples. If ``None``,
it will be set to ``0.1``.
If cv is not ``"split"``, ``test_size`` is ignored.
By default ``None``.
n_jobs: Optional[int]
Number of jobs for parallel processing using joblib
via the "locky" backend.
If ``-1`` all CPUs are used.
If ``1`` is given, no parallel computing code is used at all,
which is useful for debugging.
For ``n_jobs`` below ``-1``, ``(n_cpus + 1 + n_jobs)`` are used.
``None`` is a marker for `unset` that will be interpreted as
``n_jobs=1`` (sequential execution).
By default ``None``.
agg_function: Optional[str]
Determines how to aggregate predictions from perturbed models, both at
training and prediction time.
If ``None``, it is ignored except if the ``cv`` class is ``Subsample``,
in which case an error is raised.
If ``"mean"`` or ``"median"``, returns the mean or median of the
predictions computed from the out-of-folds models.
Note: if you plan to set the ``ensemble`` argument to ``True`` in the
``predict`` method, you have to specify an aggregation function;
otherwise an error is raised.
The Jackknife+ interval can be interpreted as an interval around the
median prediction, which is guaranteed to lie inside the interval,
unlike the single-estimator predictions.
When the cross-validation strategy is ``Subsample`` (i.e. for the
Jackknife+-after-Bootstrap method), this function is also used to
aggregate the training set in-sample predictions.
If ``cv`` is ``"prefit"`` or ``"split"``, ``agg_function`` is ignored.
By default ``"mean"``.
verbose: int
The verbosity level, used with joblib for multiprocessing.
The frequency of the messages increases with the verbosity level.
If it is more than ``10``, all iterations are reported.
Above ``50``, the output is sent to stdout.
By default ``0``.
random_state: Optional[Union[int, RandomState]]
Pseudo random number generator state used for random sampling.
Pass an int for reproducible output across multiple function calls.
By default ``None``.
Attributes
----------
single_estimator_: sklearn.RegressorMixin
Estimator fitted on the whole training set.
estimators_: list
List of out-of-folds estimators.
k_: ArrayLike
- Array of nans, of shape (len(y), 1) if ``cv`` is ``"prefit"``
(defined but not used)
- Dummy array of folds containing each training sample, otherwise.
Of shape (n_samples_train, cv.get_n_splits(X_train, y_train)).
"""
no_agg_cv_ = ["prefit", "split"]
no_agg_methods_ = ["naive", "base"]
fit_attributes = [
"single_estimator_",
"estimators_",
"k_",
]

@staticmethod
def _fit_oof_estimator(
estimator: RegressorMixin,
X: ArrayLike,
y: ArrayLike,
train_index: ArrayLike,
sample_weight: Optional[ArrayLike] = None,
) -> RegressorMixin:
"""
Fit a single out-of-fold model on a given training set.
Parameters
----------
estimator: RegressorMixin
Estimator to train.
X: ArrayLike of shape (n_samples, n_features)
Input data.
y: ArrayLike of shape (n_samples,)
Input labels.
train_index: ArrayLike of shape (n_samples_train)
Training data indices.
sample_weight: Optional[ArrayLike] of shape (n_samples,)
Sample weights. If None, then samples are equally weighted.
By default ``None``.
Returns
-------
RegressorMixin
The fitted estimator.
"""

@staticmethod
def _predict_oof_estimator(
estimator: RegressorMixin,
X: ArrayLike,
val_index: ArrayLike,
):
"""
Perform predictions on a single out-of-fold model on a validation set.
Parameters
----------
estimator: RegressorMixin
Estimator used for prediction.
X: ArrayLike of shape (n_samples, n_features)
Input data.
val_index: ArrayLike of shape (n_samples_val)
Validation data indices.
Returns
-------
Tuple[NDArray, ArrayLike]
Predictions of estimator from val_index of X.
"""

def _aggregate_with_mask(
self,
x: NDArray,
k: NDArray
) -> NDArray:
"""
Take the array of predictions made by the refitted estimators on the
testing set, together with the 1-or-nan array indicating, for each
training sample, which estimators to integrate, and aggregate them to
produce phi-{t}(x_t) for each training sample x_t.
Parameters
----------
x: ArrayLike of shape (n_samples_test, n_estimators)
Array of predictions, made by the refitted estimators,
for each sample of the testing set.
k: ArrayLike of shape (n_samples_training, n_estimators)
1-or-nan array: indicates whether to integrate the prediction
of a given estimator into the aggregation, for each training
sample.
Returns
-------
ArrayLike of shape (n_samples_test,)
Array of aggregated predictions for each testing sample.
"""

def _pred_multi(self, X: ArrayLike) -> NDArray:
"""
Return a prediction per train sample for each test sample, by
aggregation with matrix ``k_``.
Parameters
----------
X: ArrayLike of shape (n_samples_test, n_features)
Input data
Returns
-------
NDArray of shape (n_samples_test, n_samples_train)
"""

def predict_calib(self, X: ArrayLike) -> NDArray:
"""
Perform predictions on X, the calibration set. This method is
called by the ConformityScore class to compute the conformity scores.
Parameters
----------
X: ArrayLike of shape (n_samples_test, n_features)
Input data
Returns
-------
NDArray of shape (n_samples_test, 1)
The predictions.
"""

def fit(
self,
X: ArrayLike,
y: ArrayLike,
sample_weight: Optional[ArrayLike] = None,
) -> EnsembleEstimator:
"""
Fit the base estimator under the ``single_estimator_`` attribute.
Fit all cross-validated estimator clones
and rearrange them into a list, the ``estimators_`` attribute.
Out-of-fold conformity scores are stored under
the ``conformity_scores_`` attribute.
Parameters
----------
X: ArrayLike of shape (n_samples, n_features)
Input data.
y: ArrayLike of shape (n_samples,)
Input labels.
sample_weight: Optional[ArrayLike] of shape (n_samples,)
Sample weights. If None, then samples are equally weighted.
By default ``None``.
Returns
-------
EnsembleEstimator
The fitted estimator.
"""

def predict(
self,
X: ArrayLike,
ensemble: bool = False,
return_multi_pred: bool = True
) -> Union[NDArray, Tuple[NDArray, NDArray, NDArray]]:
"""
Predict target from X. It also computes the prediction per train sample
for each test sample according to ``self.method``.
Parameters
----------
X: ArrayLike of shape (n_samples, n_features)
Test data.
ensemble: bool
Boolean determining whether the predictions are ensembled or not.
If ``False``, predictions are those of the model trained on the
whole training set.
If ``True``, predictions from perturbed models are aggregated by
the aggregation function specified in the ``agg_function``
attribute.
If ``cv`` is ``"prefit"`` or ``"split"``, ``ensemble`` is ignored.
By default ``False``.
return_multi_pred: bool
Whether to return, in addition to the point predictions, the multiple
lower- and upper-bound predictions.
By default ``True``.
Returns
-------
Tuple[NDArray, NDArray, NDArray]
- Predictions
- The multiple predictions for the lower bound of the intervals.
- The multiple predictions for the upper bound of the intervals.
"""
2 changes: 1 addition & 1 deletion mapie/regression/regression.py
@@ -13,7 +13,7 @@

from mapie._typing import ArrayLike, NDArray
from mapie.conformity_scores import ConformityScore
-from .estimator import EnsembleRegressor
+from mapie.estimator.estimator import EnsembleRegressor
from mapie.utils import (check_alpha, check_alpha_and_n_samples,
check_conformity_score, check_cv,
check_estimator_fit_predict, check_n_features_in,
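The public API is not changed by this import move. For context, a short end-to-end sketch (not in this diff) that exercises the refactored ``EnsembleRegressor`` path internally; the ``make_regression`` data and the default estimator are illustrative choices, and the usual ``MapieRegressor`` fit/predict usage is assumed.

# End-to-end sketch: MapieRegressor drives the EnsembleRegressor/EnsembleEstimator
# machinery internally; the public fit/predict API is unchanged by this commit.
from sklearn.datasets import make_regression
from mapie.regression import MapieRegressor

X, y = make_regression(n_samples=200, n_features=3, noise=1.0, random_state=0)
mapie = MapieRegressor(method="plus", cv=5)
mapie.fit(X, y)
y_pred, y_pis = mapie.predict(X, alpha=0.1)
# y_pis has shape (n_samples, 2, 1): lower and upper interval bounds per alpha.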
