From 1eadfa56283557eb6106232ad7eb5e84e6b5600f Mon Sep 17 00:00:00 2001 From: Candice Moyet Date: Mon, 10 Jul 2023 10:41:38 +0200 Subject: [PATCH] FIX: estimator interface --- mapie/estimator/estimator.py | 11 +- mapie/estimator/interface.py | 256 ----------------------------------- 2 files changed, 3 insertions(+), 264 deletions(-) diff --git a/mapie/estimator/estimator.py b/mapie/estimator/estimator.py index 167bd2a63..ed05b850d 100644 --- a/mapie/estimator/estimator.py +++ b/mapie/estimator/estimator.py @@ -205,13 +205,8 @@ def _fit_oof_estimator( Returns ------- - Tuple[RegressorMixin, NDArray, ArrayLike] - - - [0]: RegressorMixin, fitted estimator - - [1]: NDArray of shape (n_samples_val,), - estimator predictions on the validation fold. - - [2]: ArrayLike of shape (n_samples_val,), - validation data indices. + RegressorMixin + Fitted estimator. """ X_train = _safe_indexing(X, train_index) y_train = _safe_indexing(y, train_index) @@ -229,7 +224,7 @@ def _predict_oof_estimator( estimator: RegressorMixin, X: ArrayLike, val_index: ArrayLike, - ): + ) -> Tuple[NDArray, ArrayLike]: """ Perform predictions on a single out-of-fold model on a validation set. diff --git a/mapie/estimator/interface.py b/mapie/estimator/interface.py index 25435fe6a..dafbb382d 100644 --- a/mapie/estimator/interface.py +++ b/mapie/estimator/interface.py @@ -12,263 +12,7 @@ class EnsembleEstimator(RegressorMixin): This class implements methods to handle the training and usage of the estimator. This estimator can be unique or composed by cross validated estimators. - - Parameters - ---------- - estimator: Optional[RegressorMixin] - Any regressor with scikit-learn API - (i.e. with ``fit`` and ``predict`` methods). - If ``None``, estimator defaults to a ``LinearRegression`` instance. - - By default ``None``. - - method: str - Method to choose for prediction interval estimates. - Choose among: - - - ``"naive"``, based on training set conformity scores, - - ``"base"``, based on validation sets conformity scores, - - ``"plus"``, based on validation conformity scores and - testing predictions, - - ``"minmax"``, based on validation conformity scores and - testing predictions (min/max among cross-validation clones). - - By default ``"plus"``. - - cv: Optional[Union[int, str, BaseCrossValidator]] - The cross-validation strategy for computing conformity scores. - It directly drives the distinction between jackknife and cv variants. - Choose among: - - - ``None``, to use the default 5-fold cross-validation - - integer, to specify the number of folds. - If equal to ``-1``, equivalent to - ``sklearn.model_selection.LeaveOneOut()``. - - CV splitter: any ``sklearn.model_selection.BaseCrossValidator`` - Main variants are: - - ``sklearn.model_selection.LeaveOneOut`` (jackknife), - - ``sklearn.model_selection.KFold`` (cross-validation), - - ``subsample.Subsample`` object (bootstrap). - - ``"split"``, does not involve cross-validation but a division - of the data into training and calibration subsets. The splitter - used is the following: ``sklearn.model_selection.ShuffleSplit``. - - ``"prefit"``, assumes that ``estimator`` has been fitted already, - and the ``method`` parameter is ignored. - All data provided in the ``fit`` method is then used - for computing conformity scores only. - At prediction time, quantiles of these conformity scores are used - to provide a prediction interval with fixed width. - The user has to take care manually that data for model fitting and - conformity scores estimate are disjoint. - - By default ``None``. - - test_size: Optional[Union[int, float]] - If ``float``, should be between ``0.0`` and ``1.0`` and represent the - proportion of the dataset to include in the test split. If ``int``, - represents the absolute number of test samples. If ``None``, - it will be set to ``0.1``. - - If cv is not ``"split"``, ``test_size`` is ignored. - - By default ``None``. - - n_jobs: Optional[int] - Number of jobs for parallel processing using joblib - via the "locky" backend. - If ``-1`` all CPUs are used. - If ``1`` is given, no parallel computing code is used at all, - which is useful for debugging. - For ``n_jobs`` below ``-1``, ``(n_cpus + 1 - n_jobs)`` are used. - ``None`` is a marker for `unset` that will be interpreted as - ``n_jobs=1`` (sequential execution). - - By default ``None``. - - agg_function: Optional[str] - Determines how to aggregate predictions from perturbed models, both at - training and prediction time. - - If ``None``, it is ignored except if ``cv`` class is ``Subsample``, - in which case an error is raised. - If ``"mean"`` or ``"median"``, returns the mean or median of the - predictions computed from the out-of-folds models. - Note: if you plan to set the ``ensemble`` argument to ``True`` in the - ``predict`` method, you have to specify an aggregation function. - Otherwise an error would be raised. - - The Jackknife+ interval can be interpreted as an interval around the - median prediction, and is guaranteed to lie inside the interval, - unlike the single estimator predictions. - - When the cross-validation strategy is ``Subsample`` (i.e. for the - Jackknife+-after-Bootstrap method), this function is also used to - aggregate the training set in-sample predictions. - - If ``cv`` is ``"prefit"`` or ``"split"``, ``agg_function`` is ignored. - - By default ``"mean"``. - - verbose: int - The verbosity level, used with joblib for multiprocessing. - The frequency of the messages increases with the verbosity level. - If it more than ``10``, all iterations are reported. - Above ``50``, the output is sent to stdout. - - By default ``0``. - - random_state: Optional[Union[int, RandomState]] - Pseudo random number generator state used for random sampling. - Pass an int for reproducible output across multiple function calls. - - By default ``None``. - - Attributes - ---------- - single_estimator_: sklearn.RegressorMixin - Estimator fitted on the whole training set. - - estimators_: list - List of out-of-folds estimators. - - k_: ArrayLike - - Array of nans, of shape (len(y), 1) if ``cv`` is ``"prefit"`` - (defined but not used) - - Dummy array of folds containing each training sample, otherwise. - Of shape (n_samples_train, cv.get_n_splits(X_train, y_train)). """ - no_agg_cv_ = ["prefit", "split"] - no_agg_methods_ = ["naive", "base"] - fit_attributes = [ - "single_estimator_", - "estimators_", - "k_", - ] - - @staticmethod - def _fit_oof_estimator( - estimator: RegressorMixin, - X: ArrayLike, - y: ArrayLike, - train_index: ArrayLike, - sample_weight: Optional[ArrayLike] = None, - ) -> RegressorMixin: - """ - Fit a single out-of-fold model on a given training set. - - Parameters - ---------- - estimator: RegressorMixin - Estimator to train. - - X: ArrayLike of shape (n_samples, n_features) - Input data. - - y: ArrayLike of shape (n_samples,) - Input labels. - - train_index: ArrayLike of shape (n_samples_train) - Training data indices. - - sample_weight: Optional[ArrayLike] of shape (n_samples,) - Sample weights. If None, then samples are equally weighted. - By default ``None``. - - Returns - ------- - Tuple[RegressorMixin, NDArray, ArrayLike] - - - [0]: RegressorMixin, fitted estimator - - [1]: NDArray of shape (n_samples_val,), - estimator predictions on the validation fold. - - [2]: ArrayLike of shape (n_samples_val,), - validation data indices. - """ - - @staticmethod - def _predict_oof_estimator( - estimator: RegressorMixin, - X: ArrayLike, - val_index: ArrayLike, - ): - """ - Perform predictions on a single out-of-fold model on a validation set. - - Parameters - ---------- - estimator: RegressorMixin - Estimator to train. - - X: ArrayLike of shape (n_samples, n_features) - Input data. - - val_index: ArrayLike of shape (n_samples_val) - Validation data indices. - - Returns - ------- - Tuple[NDArray, ArrayLike] - Predictions of estimator from val_index of X. - """ - - def _aggregate_with_mask( - self, - x: NDArray, - k: NDArray - ) -> NDArray: - """ - Take the array of predictions, made by the refitted estimators, - on the testing set, and the 1-or-nan array indicating for each training - sample which one to integrate, and aggregate to produce phi-{t}(x_t) - for each training sample x_t. - - Parameters - ---------- - x: ArrayLike of shape (n_samples_test, n_estimators) - Array of predictions, made by the refitted estimators, - for each sample of the testing set. - - k: ArrayLike of shape (n_samples_training, n_estimators) - 1-or-nan array: indicates whether to integrate the prediction - of a given estimator into the aggregation, for each training - sample. - - Returns - ------- - ArrayLike of shape (n_samples_test,) - Array of aggregated predictions for each testing sample. - """ - - def _pred_multi(self, X: ArrayLike) -> NDArray: - """ - Return a prediction per train sample for each test sample, by - aggregation with matrix ``k_``. - - Parameters - ---------- - X: ArrayLike of shape (n_samples_test, n_features) - Input data - - Returns - ------- - NDArray of shape (n_samples_test, n_samples_train) - """ - - def predict_calib(self, X: ArrayLike) -> NDArray: - """ - Perform predictions on X : the calibration set. This method is - called in the ConformityScore class to compute the conformity scores. - - Parameters - ---------- - X: ArrayLike of shape (n_samples_test, n_features) - Input data - - Returns - ------- - NDArray of shape (n_samples_test, 1) - The predictions. - """ def fit( self,