Skip to content

Commit

Permalink
FIX: estimator interface
Browse files Browse the repository at this point in the history
  • Loading branch information
Candice Moyet committed Jul 10, 2023
1 parent 5cc17fe commit 1eadfa5
Show file tree
Hide file tree
Showing 2 changed files with 3 additions and 264 deletions.
11 changes: 3 additions & 8 deletions mapie/estimator/estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,13 +205,8 @@ def _fit_oof_estimator(
Returns
-------
Tuple[RegressorMixin, NDArray, ArrayLike]
- [0]: RegressorMixin, fitted estimator
- [1]: NDArray of shape (n_samples_val,),
estimator predictions on the validation fold.
- [2]: ArrayLike of shape (n_samples_val,),
validation data indices.
RegressorMixin
Fitted estimator.
"""
X_train = _safe_indexing(X, train_index)
y_train = _safe_indexing(y, train_index)
Expand All @@ -229,7 +224,7 @@ def _predict_oof_estimator(
estimator: RegressorMixin,
X: ArrayLike,
val_index: ArrayLike,
):
) -> Tuple[NDArray, ArrayLike]:
"""
Perform predictions on a single out-of-fold model on a validation set.
Expand Down
256 changes: 0 additions & 256 deletions mapie/estimator/interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,263 +12,7 @@ class EnsembleEstimator(RegressorMixin):
This class implements methods to handle the training and usage of the
estimator. This estimator can be a single estimator or an ensemble of
cross-validated estimators.
Parameters
----------
estimator: Optional[RegressorMixin]
Any regressor with scikit-learn API
(i.e. with ``fit`` and ``predict`` methods).
If ``None``, estimator defaults to a ``LinearRegression`` instance.
By default ``None``.
method: str
Method to choose for prediction interval estimates.
Choose among:
- ``"naive"``, based on training set conformity scores,
- ``"base"``, based on validation sets conformity scores,
- ``"plus"``, based on validation conformity scores and
testing predictions,
- ``"minmax"``, based on validation conformity scores and
testing predictions (min/max among cross-validation clones).
By default ``"plus"``.
cv: Optional[Union[int, str, BaseCrossValidator]]
The cross-validation strategy for computing conformity scores.
It directly drives the distinction between jackknife and cv variants.
Choose among:
- ``None``, to use the default 5-fold cross-validation
- integer, to specify the number of folds.
If equal to ``-1``, equivalent to
``sklearn.model_selection.LeaveOneOut()``.
- CV splitter: any ``sklearn.model_selection.BaseCrossValidator``
Main variants are:
- ``sklearn.model_selection.LeaveOneOut`` (jackknife),
- ``sklearn.model_selection.KFold`` (cross-validation),
- ``subsample.Subsample`` object (bootstrap).
- ``"split"``, does not involve cross-validation but a division
of the data into training and calibration subsets. The splitter
used is the following: ``sklearn.model_selection.ShuffleSplit``.
- ``"prefit"``, assumes that ``estimator`` has been fitted already,
and the ``method`` parameter is ignored.
All data provided in the ``fit`` method is then used
for computing conformity scores only.
At prediction time, quantiles of these conformity scores are used
to provide a prediction interval with fixed width.
The user has to take care manually that data for model fitting and
conformity scores estimate are disjoint.
By default ``None``.
test_size: Optional[Union[int, float]]
If ``float``, should be between ``0.0`` and ``1.0`` and represent the
proportion of the dataset to include in the test split. If ``int``,
represents the absolute number of test samples. If ``None``,
it will be set to ``0.1``.
If cv is not ``"split"``, ``test_size`` is ignored.
By default ``None``.
n_jobs: Optional[int]
Number of jobs for parallel processing using joblib
via the "loky" backend.
If ``-1`` all CPUs are used.
If ``1`` is given, no parallel computing code is used at all,
which is useful for debugging.
For ``n_jobs`` below ``-1``, ``(n_cpus + 1 + n_jobs)`` are used.
``None`` is a marker for `unset` that will be interpreted as
``n_jobs=1`` (sequential execution).
By default ``None``.
agg_function: Optional[str]
Determines how to aggregate predictions from perturbed models, both at
training and prediction time.
If ``None``, it is ignored except if ``cv`` class is ``Subsample``,
in which case an error is raised.
If ``"mean"`` or ``"median"``, returns the mean or median of the
predictions computed from the out-of-folds models.
Note: if you plan to set the ``ensemble`` argument to ``True`` in the
``predict`` method, you have to specify an aggregation function.
Otherwise an error would be raised.
The Jackknife+ interval can be interpreted as an interval around the
median prediction: the median prediction is guaranteed to lie inside
the interval, unlike the single estimator predictions.
When the cross-validation strategy is ``Subsample`` (i.e. for the
Jackknife+-after-Bootstrap method), this function is also used to
aggregate the training set in-sample predictions.
If ``cv`` is ``"prefit"`` or ``"split"``, ``agg_function`` is ignored.
By default ``"mean"``.
verbose: int
The verbosity level, used with joblib for multiprocessing.
The frequency of the messages increases with the verbosity level.
If it is more than ``10``, all iterations are reported.
Above ``50``, the output is sent to stdout.
By default ``0``.
random_state: Optional[Union[int, RandomState]]
Pseudo random number generator state used for random sampling.
Pass an int for reproducible output across multiple function calls.
By default ``None``.
Attributes
----------
single_estimator_: sklearn.RegressorMixin
Estimator fitted on the whole training set.
estimators_: list
List of out-of-folds estimators.
k_: ArrayLike
- Array of nans, of shape (len(y), 1) if ``cv`` is ``"prefit"``
(defined but not used)
- Dummy array of folds containing each training sample, otherwise.
Of shape (n_samples_train, cv.get_n_splits(X_train, y_train)).
"""
no_agg_cv_ = ["prefit", "split"]
no_agg_methods_ = ["naive", "base"]
fit_attributes = [
"single_estimator_",
"estimators_",
"k_",
]

@staticmethod
def _fit_oof_estimator(
estimator: RegressorMixin,
X: ArrayLike,
y: ArrayLike,
train_index: ArrayLike,
sample_weight: Optional[ArrayLike] = None,
) -> RegressorMixin:
"""
Fit a single out-of-fold model on a given training set.
Parameters
----------
estimator: RegressorMixin
Estimator to train.
X: ArrayLike of shape (n_samples, n_features)
Input data.
y: ArrayLike of shape (n_samples,)
Input labels.
train_index: ArrayLike of shape (n_samples_train)
Training data indices.
sample_weight: Optional[ArrayLike] of shape (n_samples,)
Sample weights. If None, then samples are equally weighted.
By default ``None``.
Returns
-------
RegressorMixin
Fitted estimator.
"""

@staticmethod
def _predict_oof_estimator(
estimator: RegressorMixin,
X: ArrayLike,
val_index: ArrayLike,
):
"""
Perform predictions on a single out-of-fold model on a validation set.
Parameters
----------
estimator: RegressorMixin
Fitted estimator used to make the predictions.
X: ArrayLike of shape (n_samples, n_features)
Input data.
val_index: ArrayLike of shape (n_samples_val)
Validation data indices.
Returns
-------
Tuple[NDArray, ArrayLike]
Predictions of the estimator on the samples of X selected by
val_index, and the corresponding validation data indices.
"""

def _aggregate_with_mask(
self,
x: NDArray,
k: NDArray
) -> NDArray:
"""
Take the array of predictions, made by the refitted estimators,
on the testing set, and the 1-or-nan array indicating for each training
sample which one to integrate, and aggregate to produce phi-{t}(x_t)
for each training sample x_t.
Parameters
----------
x: ArrayLike of shape (n_samples_test, n_estimators)
Array of predictions, made by the refitted estimators,
for each sample of the testing set.
k: ArrayLike of shape (n_samples_training, n_estimators)
1-or-nan array: indicates whether to integrate the prediction
of a given estimator into the aggregation, for each training
sample.
Returns
-------
ArrayLike of shape (n_samples_test,)
Array of aggregated predictions for each testing sample.
"""

def _pred_multi(self, X: ArrayLike) -> NDArray:
"""
Return a prediction per train sample for each test sample, by
aggregation with matrix ``k_``.
Parameters
----------
X: ArrayLike of shape (n_samples_test, n_features)
Input data
Returns
-------
NDArray of shape (n_samples_test, n_samples_train)
"""

def predict_calib(self, X: ArrayLike) -> NDArray:
"""
Perform predictions on X, the calibration set. This method is
called in the ConformityScore class to compute the conformity scores.
Parameters
----------
X: ArrayLike of shape (n_samples_test, n_features)
Input data
Returns
-------
NDArray of shape (n_samples_test, 1)
The predictions.
"""

def fit(
self,
Expand Down

0 comments on commit 1eadfa5

Please sign in to comment.