FIX: estimator interface

scikit-learn-contrib · Jul 10, 2023 · 1eadfa5 · 1eadfa5
1 parent 5cc17fe
commit 1eadfa5
Show file tree

Hide file tree

Showing 2 changed files with 3 additions and 264 deletions.
diff --git a/mapie/estimator/estimator.py b/mapie/estimator/estimator.py
@@ -205,13 +205,8 @@ def _fit_oof_estimator(
 
  Returns
  -------
- Tuple[RegressorMixin, NDArray, ArrayLike]
-
- - [0]: RegressorMixin, fitted estimator
- - [1]: NDArray of shape (n_samples_val,),
- estimator predictions on the validation fold.
- - [2]: ArrayLike of shape (n_samples_val,),
- validation data indices.
+ RegressorMixin
+ Fitted estimator.
  """
  X_train = _safe_indexing(X, train_index)
  y_train = _safe_indexing(y, train_index)
@@ -229,7 +224,7 @@ def _predict_oof_estimator(
  estimator: RegressorMixin,
  X: ArrayLike,
  val_index: ArrayLike,
- ):
+ ) -> Tuple[NDArray, ArrayLike]:
  """
  Perform predictions on a single out-of-fold model on a validation set.
 

diff --git a/mapie/estimator/interface.py b/mapie/estimator/interface.py
@@ -12,263 +12,7 @@ class EnsembleEstimator(RegressorMixin):
  This class implements methods to handle the training and usage of the
  estimator. This estimator can be unique or composed of cross-validated
  estimators.
-
- Parameters
- ----------
- estimator: Optional[RegressorMixin]
- Any regressor with scikit-learn API
- (i.e. with ``fit`` and ``predict`` methods).
- If ``None``, estimator defaults to a ``LinearRegression`` instance.
-
- By default ``None``.
-
- method: str
- Method to choose for prediction interval estimates.
- Choose among:
-
- - ``"naive"``, based on training set conformity scores,
- - ``"base"``, based on validation sets conformity scores,
- - ``"plus"``, based on validation conformity scores and
- testing predictions,
- - ``"minmax"``, based on validation conformity scores and
- testing predictions (min/max among cross-validation clones).
-
- By default ``"plus"``.
-
- cv: Optional[Union[int, str, BaseCrossValidator]]
- The cross-validation strategy for computing conformity scores.
- It directly drives the distinction between jackknife and cv variants.
- Choose among:
-
- - ``None``, to use the default 5-fold cross-validation
- - integer, to specify the number of folds.
- If equal to ``-1``, equivalent to
- ``sklearn.model_selection.LeaveOneOut()``.
- - CV splitter: any ``sklearn.model_selection.BaseCrossValidator``
- Main variants are:
- - ``sklearn.model_selection.LeaveOneOut`` (jackknife),
- - ``sklearn.model_selection.KFold`` (cross-validation),
- - ``subsample.Subsample`` object (bootstrap).
- - ``"split"``, does not involve cross-validation but a division
- of the data into training and calibration subsets. The splitter
- used is the following: ``sklearn.model_selection.ShuffleSplit``.
- - ``"prefit"``, assumes that ``estimator`` has been fitted already,
- and the ``method`` parameter is ignored.
- All data provided in the ``fit`` method is then used
- for computing conformity scores only.
- At prediction time, quantiles of these conformity scores are used
- to provide a prediction interval with fixed width.
- The user has to take care manually that data for model fitting and
- conformity scores estimate are disjoint.
-
- By default ``None``.
-
- test_size: Optional[Union[int, float]]
- If ``float``, should be between ``0.0`` and ``1.0`` and represent the
- proportion of the dataset to include in the test split. If ``int``,
- represents the absolute number of test samples. If ``None``,
- it will be set to ``0.1``.
-
- If cv is not ``"split"``, ``test_size`` is ignored.
-
- By default ``None``.
-
- n_jobs: Optional[int]
- Number of jobs for parallel processing using joblib
- via the "loky" backend.
- If ``-1`` all CPUs are used.
- If ``1`` is given, no parallel computing code is used at all,
- which is useful for debugging.
- For ``n_jobs`` below ``-1``, ``(n_cpus + 1 + n_jobs)`` are used.
- ``None`` is a marker for `unset` that will be interpreted as
- ``n_jobs=1`` (sequential execution).
-
- By default ``None``.
-
- agg_function: Optional[str]
- Determines how to aggregate predictions from perturbed models, both at
- training and prediction time.
-
- If ``None``, it is ignored except if ``cv`` class is ``Subsample``,
- in which case an error is raised.
- If ``"mean"`` or ``"median"``, returns the mean or median of the
- predictions computed from the out-of-folds models.
- Note: if you plan to set the ``ensemble`` argument to ``True`` in the
- ``predict`` method, you have to specify an aggregation function.
- Otherwise an error would be raised.
-
- The Jackknife+ interval can be interpreted as an interval around the
- median prediction, which is guaranteed to lie inside the interval,
- unlike the single estimator predictions.
-
- When the cross-validation strategy is ``Subsample`` (i.e. for the
- Jackknife+-after-Bootstrap method), this function is also used to
- aggregate the training set in-sample predictions.
-
- If ``cv`` is ``"prefit"`` or ``"split"``, ``agg_function`` is ignored.
-
- By default ``"mean"``.
-
- verbose: int
- The verbosity level, used with joblib for multiprocessing.
- The frequency of the messages increases with the verbosity level.
- If it is more than ``10``, all iterations are reported.
- Above ``50``, the output is sent to stdout.
-
- By default ``0``.
-
- random_state: Optional[Union[int, RandomState]]
- Pseudo random number generator state used for random sampling.
- Pass an int for reproducible output across multiple function calls.
-
- By default ``None``.
-
- Attributes
- ----------
- single_estimator_: sklearn.RegressorMixin
- Estimator fitted on the whole training set.
-
- estimators_: list
- List of out-of-folds estimators.
-
- k_: ArrayLike
- - Array of nans, of shape (len(y), 1) if ``cv`` is ``"prefit"``
- (defined but not used)
- - Dummy array of folds containing each training sample, otherwise.
- Of shape (n_samples_train, cv.get_n_splits(X_train, y_train)).
  """
- no_agg_cv_ = ["prefit", "split"]
- no_agg_methods_ = ["naive", "base"]
- fit_attributes = [
- "single_estimator_",
- "estimators_",
- "k_",
- ]
-
- @staticmethod
- def _fit_oof_estimator(
- estimator: RegressorMixin,
- X: ArrayLike,
- y: ArrayLike,
- train_index: ArrayLike,
- sample_weight: Optional[ArrayLike] = None,
- ) -> RegressorMixin:
- """
- Fit a single out-of-fold model on a given training set.
-
- Parameters
- ----------
- estimator: RegressorMixin
- Estimator to train.
-
- X: ArrayLike of shape (n_samples, n_features)
- Input data.
-
- y: ArrayLike of shape (n_samples,)
- Input labels.
-
- train_index: ArrayLike of shape (n_samples_train)
- Training data indices.
-
- sample_weight: Optional[ArrayLike] of shape (n_samples,)
- Sample weights. If None, then samples are equally weighted.
- By default ``None``.
-
- Returns
- -------
- Tuple[RegressorMixin, NDArray, ArrayLike]
-
- - [0]: RegressorMixin, fitted estimator
- - [1]: NDArray of shape (n_samples_val,),
- estimator predictions on the validation fold.
- - [2]: ArrayLike of shape (n_samples_val,),
- validation data indices.
- """
-
- @staticmethod
- def _predict_oof_estimator(
- estimator: RegressorMixin,
- X: ArrayLike,
- val_index: ArrayLike,
- ):
- """
- Perform predictions on a single out-of-fold model on a validation set.
-
- Parameters
- ----------
- estimator: RegressorMixin
- Fitted estimator used to make the predictions.
-
- X: ArrayLike of shape (n_samples, n_features)
- Input data.
-
- val_index: ArrayLike of shape (n_samples_val)
- Validation data indices.
-
- Returns
- -------
- Tuple[NDArray, ArrayLike]
- Predictions of estimator from val_index of X.
- """
-
- def _aggregate_with_mask(
- self,
- x: NDArray,
- k: NDArray
- ) -> NDArray:
- """
- Take the array of predictions, made by the refitted estimators,
- on the testing set, and the 1-or-nan array indicating for each training
- sample which one to integrate, and aggregate to produce phi-{t}(x_t)
- for each training sample x_t.
-
- Parameters
- ----------
- x: ArrayLike of shape (n_samples_test, n_estimators)
- Array of predictions, made by the refitted estimators,
- for each sample of the testing set.
-
- k: ArrayLike of shape (n_samples_training, n_estimators)
- 1-or-nan array: indicates whether to integrate the prediction
- of a given estimator into the aggregation, for each training
- sample.
-
- Returns
- -------
- ArrayLike of shape (n_samples_test,)
- Array of aggregated predictions for each testing sample.
- """
-
- def _pred_multi(self, X: ArrayLike) -> NDArray:
- """
- Return a prediction per train sample for each test sample, by
- aggregation with matrix ``k_``.
-
- Parameters
- ----------
- X: ArrayLike of shape (n_samples_test, n_features)
- Input data
-
- Returns
- -------
- NDArray of shape (n_samples_test, n_samples_train)
- """
-
- def predict_calib(self, X: ArrayLike) -> NDArray:
- """
- Perform predictions on X : the calibration set. This method is
- called in the ConformityScore class to compute the conformity scores.
-
- Parameters
- ----------
- X: ArrayLike of shape (n_samples_test, n_features)
- Input data
-
- Returns
- -------
- NDArray of shape (n_samples_test, 1)
- The predictions.
- """
 
  def fit(
  self,