ENH: add estimator module
Candice Moyet committed Jul 7, 2023
1 parent 90eb152 commit 1dbf7c6
Showing 6 changed files with 348 additions and 7 deletions.
8 changes: 4 additions & 4 deletions mapie/conformity_scores/conformity_scores.py
@@ -2,10 +2,10 @@

import numpy as np
from typing import Tuple
-from sklearn.base import RegressorMixin

from mapie._compatibility import np_nanquantile
from mapie._typing import ArrayLike, NDArray
+from mapie.estimator.interface import EnsembleEstimator


class ConformityScore(metaclass=ABCMeta):
@@ -257,22 +257,22 @@ def get_quantile(
def get_bounds(
self,
X: ArrayLike,
-estimator: RegressorMixin,
+estimator: EnsembleEstimator,
conformity_scores: NDArray,
alpha_np: NDArray,
ensemble: bool,
method: str
) -> Tuple[NDArray, NDArray, NDArray]:
"""
Compute bounds of the prediction intervals from the observed values,
-the estimator of type ``EnsembleRegressor`` and the conformity scores.
+the estimator of type ``EnsembleEstimator`` and the conformity scores.
Parameters
----------
X: ArrayLike of shape (n_samples_test, n_features)
Observed feature values.
-estimator: RegressorMixin
+estimator: EnsembleEstimator
Estimator that is fitted to predict y from X.
conformity_scores: ArrayLike of shape (n_samples_calib,)
Empty file added mapie/estimator/__init__.py
Empty file.
@@ -13,9 +13,10 @@
from mapie.aggregation_functions import aggregate_all, phi2D
from mapie.utils import (check_nan_in_aposteriori_prediction,
fit_estimator)
+from mapie.estimator.interface import EnsembleEstimator


-class EnsembleRegressor(RegressorMixin):
+class EnsembleRegressor(EnsembleEstimator):
"""
This class implements methods to handle the training and usage of the
estimator. This estimator can be unique or composed of cross-validated
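For orientation, a minimal sketch (not part of this diff) of what the change above means in practice: after this commit ``EnsembleRegressor`` derives from the new ``EnsembleEstimator`` interface, so it can be passed anywhere the interface is expected. The import paths are taken directly from the hunks in this commit; running it assumes a checkout at or after this commit.

# Sketch: check the inheritance relationship introduced by this commit.
from mapie.estimator.interface import EnsembleEstimator
from mapie.estimator.estimator import EnsembleRegressor

assert issubclass(EnsembleRegressor, EnsembleEstimator)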
340 changes: 340 additions & 0 deletions mapie/estimator/interface.py
@@ -0,0 +1,340 @@
from __future__ import annotations

from typing import Optional, Tuple, Union

from sklearn.base import RegressorMixin

from mapie._typing import ArrayLike, NDArray


class EnsembleEstimator(RegressorMixin):
"""
This class implements methods to handle the training and usage of the
estimator. This estimator can be unique or composed of cross-validated
estimators.
Parameters
----------
estimator: Optional[RegressorMixin]
Any regressor with scikit-learn API
(i.e. with ``fit`` and ``predict`` methods).
If ``None``, estimator defaults to a ``LinearRegression`` instance.
By default ``None``.
method: str
Method to choose for prediction interval estimates.
Choose among:
- ``"naive"``, based on training set conformity scores,
- ``"base"``, based on validation sets conformity scores,
- ``"plus"``, based on validation conformity scores and
testing predictions,
- ``"minmax"``, based on validation conformity scores and
testing predictions (min/max among cross-validation clones).
By default ``"plus"``.
cv: Optional[Union[int, str, BaseCrossValidator]]
The cross-validation strategy for computing conformity scores.
It directly drives the distinction between jackknife and cv variants.
Choose among:
- ``None``, to use the default 5-fold cross-validation
- integer, to specify the number of folds.
If equal to ``-1``, equivalent to
``sklearn.model_selection.LeaveOneOut()``.
- CV splitter: any ``sklearn.model_selection.BaseCrossValidator``
Main variants are:
- ``sklearn.model_selection.LeaveOneOut`` (jackknife),
- ``sklearn.model_selection.KFold`` (cross-validation),
- ``subsample.Subsample`` object (bootstrap).
- ``"split"``, does not involve cross-validation but a division
of the data into training and calibration subsets. The splitter
used is the following: ``sklearn.model_selection.ShuffleSplit``.
- ``"prefit"``, assumes that ``estimator`` has been fitted already,
and the ``method`` parameter is ignored.
All data provided in the ``fit`` method is then used
for computing conformity scores only.
At prediction time, quantiles of these conformity scores are used
to provide a prediction interval with fixed width.
The user must manually ensure that the data used for model fitting and
the data used to estimate conformity scores are disjoint.
By default ``None``.
test_size: Optional[Union[int, float]]
If ``float``, should be between ``0.0`` and ``1.0`` and represent the
proportion of the dataset to include in the test split. If ``int``,
represents the absolute number of test samples. If ``None``,
it will be set to ``0.1``.
If cv is not ``"split"``, ``test_size`` is ignored.
By default ``None``.
n_jobs: Optional[int]
Number of jobs for parallel processing using joblib
via the "locky" backend.
If ``-1`` all CPUs are used.
If ``1`` is given, no parallel computing code is used at all,
which is useful for debugging.
For ``n_jobs`` below ``-1``, ``(n_cpus + 1 + n_jobs)`` are used.
``None`` is a marker for `unset` that will be interpreted as
``n_jobs=1`` (sequential execution).
By default ``None``.
agg_function: Optional[str]
Determines how to aggregate predictions from perturbed models, both at
training and prediction time.
If ``None``, it is ignored except if the ``cv`` class is ``Subsample``,
in which case an error is raised.
If ``"mean"`` or ``"median"``, returns the mean or median of the
predictions computed from the out-of-folds models.
Note: if you plan to set the ``ensemble`` argument to ``True`` in the
``predict`` method, you have to specify an aggregation function;
otherwise an error is raised.
The Jackknife+ interval can be interpreted as an interval around the
median prediction, which is guaranteed to lie inside the interval,
unlike the single-estimator predictions.
When the cross-validation strategy is ``Subsample`` (i.e. for the
Jackknife+-after-Bootstrap method), this function is also used to
aggregate the training set in-sample predictions.
If ``cv`` is ``"prefit"`` or ``"split"``, ``agg_function`` is ignored.
By default ``"mean"``.
verbose: int
The verbosity level, used with joblib for multiprocessing.
The frequency of the messages increases with the verbosity level.
If it is more than ``10``, all iterations are reported.
Above ``50``, the output is sent to stdout.
By default ``0``.
random_state: Optional[Union[int, RandomState]]
Pseudo random number generator state used for random sampling.
Pass an int for reproducible output across multiple function calls.
By default ``None``.
Attributes
----------
single_estimator_: sklearn.RegressorMixin
Estimator fitted on the whole training set.
estimators_: list
List of out-of-folds estimators.
k_: ArrayLike
- Array of nans, of shape (len(y), 1) if ``cv`` is ``"prefit"``
(defined but not used)
- Dummy array of folds containing each training sample, otherwise.
Of shape (n_samples_train, cv.get_n_splits(X_train, y_train)).
"""
no_agg_cv_ = ["prefit", "split"]
no_agg_methods_ = ["naive", "base"]
fit_attributes = [
"single_estimator_",
"estimators_",
"k_",
]

@staticmethod
def _fit_oof_estimator(
estimator: RegressorMixin,
X: ArrayLike,
y: ArrayLike,
train_index: ArrayLike,
sample_weight: Optional[ArrayLike] = None,
) -> RegressorMixin:
"""
Fit a single out-of-fold model on a given training set.
Parameters
----------
estimator: RegressorMixin
Estimator to train.
X: ArrayLike of shape (n_samples, n_features)
Input data.
y: ArrayLike of shape (n_samples,)
Input labels.
train_index: ArrayLike of shape (n_samples_train)
Training data indices.
sample_weight: Optional[ArrayLike] of shape (n_samples,)
Sample weights. If None, then samples are equally weighted.
By default ``None``.
Returns
-------
RegressorMixin
The fitted estimator.
"""

@staticmethod
def _predict_oof_estimator(
estimator: RegressorMixin,
X: ArrayLike,
val_index: ArrayLike,
):
"""
Perform predictions on a single out-of-fold model on a validation set.
Parameters
----------
estimator: RegressorMixin
Estimator used for prediction.
X: ArrayLike of shape (n_samples, n_features)
Input data.
val_index: ArrayLike of shape (n_samples_val)
Validation data indices.
Returns
-------
Tuple[NDArray, ArrayLike]
Predictions of estimator from val_index of X.
"""

def _aggregate_with_mask(
self,
x: NDArray,
k: NDArray
) -> NDArray:
"""
Take the array of predictions made by the refitted estimators on the
testing set, together with the 1-or-nan array indicating, for each
training sample, which estimators to integrate, and aggregate them to
produce phi-{t}(x_t) for each training sample x_t.
Parameters
----------
x: ArrayLike of shape (n_samples_test, n_estimators)
Array of predictions, made by the refitted estimators,
for each sample of the testing set.
k: ArrayLike of shape (n_samples_training, n_estimators)
1-or-nan array: indicates whether to integrate the prediction
of a given estimator into the aggregation, for each training
sample.
Returns
-------
ArrayLike of shape (n_samples_test,)
Array of aggregated predictions for each testing sample.
"""

def _pred_multi(self, X: ArrayLike) -> NDArray:
"""
Return a prediction per train sample for each test sample, by
aggregation with matrix ``k_``.
Parameters
----------
X: ArrayLike of shape (n_samples_test, n_features)
Input data
Returns
-------
NDArray of shape (n_samples_test, n_samples_train)
"""

def predict_calib(self, X: ArrayLike) -> NDArray:
"""
Perform predictions on X, the calibration set. This method is
called by the ConformityScore class to compute the conformity scores.
Parameters
----------
X: ArrayLike of shape (n_samples_test, n_features)
Input data
Returns
-------
NDArray of shape (n_samples_test, 1)
The predictions.
"""

def fit(
self,
X: ArrayLike,
y: ArrayLike,
sample_weight: Optional[ArrayLike] = None,
) -> EnsembleEstimator:
"""
Fit the base estimator under the ``single_estimator_`` attribute.
Fit all cross-validated estimator clones
and rearrange them into a list, the ``estimators_`` attribute.
Out-of-fold conformity scores are stored under
the ``conformity_scores_`` attribute.
Parameters
----------
X: ArrayLike of shape (n_samples, n_features)
Input data.
y: ArrayLike of shape (n_samples,)
Input labels.
sample_weight: Optional[ArrayLike] of shape (n_samples,)
Sample weights. If None, then samples are equally weighted.
By default ``None``.
Returns
-------
EnsembleEstimator
The fitted estimator.
"""

def predict(
self,
X: ArrayLike,
ensemble: bool = False,
return_multi_pred: bool = True
) -> Union[NDArray, Tuple[NDArray, NDArray, NDArray]]:
"""
Predict target from X. It also computes the prediction per train sample
for each test sample according to ``self.method``.
Parameters
----------
X: ArrayLike of shape (n_samples, n_features)
Test data.
ensemble: bool
Boolean determining whether the predictions are ensembled or not.
If ``False``, predictions are those of the model trained on the
whole training set.
If ``True``, predictions from perturbed models are aggregated by
the aggregation function specified in the ``agg_function``
attribute.
If ``cv`` is ``"prefit"`` or ``"split"``, ``ensemble`` is ignored.
By default ``False``.
return_multi_pred: bool
Whether to return, in addition to the point predictions, the multiple
lower- and upper-bound predictions.
By default ``True``.
Returns
-------
Tuple[NDArray, NDArray, NDArray]
- Predictions
- The multiple predictions for the lower bound of the intervals.
- The multiple predictions for the upper bound of the intervals.
"""
2 changes: 1 addition & 1 deletion mapie/regression/regression.py
@@ -13,7 +13,7 @@

from mapie._typing import ArrayLike, NDArray
from mapie.conformity_scores import ConformityScore
-from .estimator import EnsembleRegressor
+from mapie.estimator.estimator import EnsembleRegressor
from mapie.utils import (check_alpha, check_alpha_and_n_samples,
check_conformity_score, check_cv,
check_estimator_fit_predict, check_n_features_in,
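The public API is not changed by this import move. For context, a short end-to-end sketch (not in this diff) that exercises the refactored ``EnsembleRegressor`` path internally; the ``make_regression`` data and the default estimator are illustrative choices, and the usual ``MapieRegressor`` fit/predict usage is assumed.

# End-to-end sketch: MapieRegressor drives the EnsembleRegressor/EnsembleEstimator
# machinery internally; the public fit/predict API is unchanged by this commit.
from sklearn.datasets import make_regression
from mapie.regression import MapieRegressor

X, y = make_regression(n_samples=200, n_features=3, noise=1.0, random_state=0)
mapie = MapieRegressor(method="plus", cv=5)
mapie.fit(X, y)
y_pred, y_pis = mapie.predict(X, alpha=0.1)
# y_pis has shape (n_samples, 2, 1): lower and upper interval bounds per alpha.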
