ENH: add crf for GP

scikit-learn-contrib · Jul 27, 2023 · 68e0feb · 68e0feb
1 parent 0f3e7f9
commit 68e0feb
Showing 1 changed file with 257 additions and 0 deletions.
diff --git a/mapie/conformity_scores/residual_conformity_scores.py b/mapie/conformity_scores/residual_conformity_scores.py
@@ -441,3 +441,260 @@ def get_estimation_distribution(
  )
  else:
  return np.add(y_pred, np.multiply(conformity_scores, r_pred))
+
+
+class GPConformityScore(ConformityScore):
+ """
+ ConformalResidualFittingScore (CRF) score.
+
+ The signed conformity score = (|y - y_pred|) / r_pred. r_pred being the
+ predicted residual (|y - y_pred|) of the base estimator.
+ It is calculated by a model that learns to predict these residuals.
+ The learning is done with the log of the residual and we use the
+ exponential of the prediction to avoid negative values.
+
+ The conformity score is symmetrical and allows the calculation of adaptive
+ prediction intervals (taking X into account). It is possible to use it
+ only with split and prefit methods (not with cross methods).
+
+ Warning : if the estimator provided is not fitted a subset of the
+ calibration data will be used to fit the model (20% by default).
+
+ Parameters
+ ----------
+ residual_estimator: Optional[RegressorMixin]
+ The model that learns to predict the residuals of the base estimator.
+ It can be any regressor with scikit-learn API (i.e. with ``fit``
+ and ``predict`` methods).
+ If ``None``, estimator defaults to a ``LinearRegression`` instance.
+
+ prefit: bool
+ Specify if the ``residual_estimator`` is already fitted or not.
+ By default ``False``.
+
+ split_size: Optional[Union[int, float]]
+ The proportion of data that is used to fit the ``residual_estimator``.
+ By default it is the default value of
+ ``sklearn.model_selection.train_test_split`` ie 0.2.
+
+ random_state: Optional[Union[int, np.random.RandomState]]
+ Pseudo random number used for random sampling.
+ Pass an int for reproducible output across multiple function calls.
+ By default ``None``.
+ """
+
+ def __init__(
+ self,
+ residual_estimator: Optional[RegressorMixin] = None,
+ prefit: bool = False,
+ split_size: Optional[Union[int, float]] = None,
+ random_state: Optional[Union[int, np.random.RandomState]] = None,
+ sym: bool = True,
+ consistency_check: bool = False
+ ) -> None:
+ super().__init__(sym=sym, consistency_check=consistency_check)
+ self.prefit = prefit
+ self.residual_estimator = residual_estimator
+ self.split_size = split_size
+ self.random_state = random_state
+
+ def _check_estimator(
+ self, estimator: Optional[RegressorMixin] = None
+ ) -> RegressorMixin:
+ """
+ Check if estimator is ``None``,
+ and returns a ``LinearRegression`` instance if necessary.
+ If the ``prefit`` attribute is ``True``,
+ check if estimator is indeed already fitted.
+
+ Parameters
+ ----------
+ estimator: Optional[RegressorMixin]
+ Estimator to check, by default ``None``.
+
+ Returns
+ -------
+ RegressorMixin
+ The estimator itself or a default ``LinearRegression`` instance.
+
+ Raises
+ ------
+ ValueError
+ If the estimator is not ``None``
+ and has no ``fit`` nor ``predict`` methods.
+
+ NotFittedError
+ If the estimator is not fitted
+ and ``prefit`` attribute is ``True``.
+ """
+ if estimator is None:
+ return LinearRegression()
+ else:
+ if not (hasattr(estimator, "fit") and
+ hasattr(estimator, "predict")):
+ raise ValueError(
+ "Invalid estimator. "
+ "Please provide a regressor with fit and predict methods."
+ )
+ if self.prefit:
+ if isinstance(estimator, Pipeline):
+ check_is_fitted(estimator[-1])
+ else:
+ check_is_fitted(estimator)
+ return estimator
+
+ def _check_parameters(
+ self,
+ X: ArrayLike,
+ y: ArrayLike,
+ y_pred: ArrayLike
+ ) -> Tuple[NDArray, NDArray, NDArray, RegressorMixin,
+ Union[int, np.random.RandomState]]:
+ """
+ Checks all the parameters of the class. Raises an error if the
+ parameter are not well defined.
+
+ Parameters
+ ----------
+ X: ArrayLike
+ Observed values.
+
+ y: ArrayLike
+ Target values.
+
+ y_pred: ArrayLike
+ Predicted targets.
+
+ Returns
+ -------
+ Tuple[NDArray, NDArray, NDArray, RegressorMixin]
+ Well initiated and typed :
+ - X
+ - y
+ - y_pred
+ - residual_estimator
+ - random_state
+ """
+ residual_estimator = self._check_estimator(
+ self.residual_estimator
+ )
+ random_state = check_random_state(self.random_state)
+ X, y, y_pred = indexable(X, y, y_pred)
+ X = np.array(X)
+ y = np.array(y)
+ y_pred = np.array(y_pred)
+ return X, y, y_pred, residual_estimator, random_state
+
+ def _fit_residual_estimator(
+ self,
+ residual_estimator_: RegressorMixin,
+ X: NDArray,
+ y: NDArray,
+ y_pred: NDArray,
+ ) -> Tuple[NDArray, NDArray]:
+ """
+ Fit the residual estimator and returns the indexes used for the
+ training of the base estimator and those needed for the calibration.
+
+ Parameters
+ ----------
+ X: NDArray
+ All the observed values used in the general fit.
+
+ y: NDArray
+ All the observed targets used in the general fit.
+
+ y_pred: NDArray
+ Predicted targets.
+
+ Returns
+ -------
+ RegressorMixin
+ Fitted residual estimator
+ """
+ residuals = np.abs(np.subtract(y, y_pred))
+ targets = np.log(np.maximum(
+ residuals,
+ np.full(residuals.shape, self.eps)
+ ))
+
+ residual_estimator_ = residual_estimator_.fit(X, targets)
+
+ return residual_estimator_
+
+ def get_signed_conformity_scores(
+ self,
+ X: ArrayLike,
+ y: ArrayLike,
+ y_pred: ArrayLike
+ ) -> NDArray:
+ """
+ Computes the signed conformity score = (y - y_pred) / r_pred.
+ r_pred being the predicted residual (y - y_pred) of the estimator.
+ It is calculated by a model (``residual_estimator_``) that learns
+ to predict this residual.
+
+ The learning is done with the log of the residual and later we
+ use the exponential of the prediction to avoid negative values.
+ """
+ (X, y, y_pred,
+ self.residual_estimator_,
+ random_state) = self._check_parameters(X, y, y_pred)
+
+ full_indexes = np.argwhere(
+ np.logical_not(np.isnan(y_pred))
+ ).reshape((-1,))
+
+ if not self.prefit:
+ cal_indexes, res_indexes = train_test_split(
+ full_indexes,
+ test_size=self.split_size,
+ random_state=random_state,
+ )
+ self.residual_estimator_ = self._fit_residual_estimator(
+ clone(self.residual_estimator_),
+ X[res_indexes], y[res_indexes], y_pred[res_indexes]
+ )
+ else:
+ cal_indexes = full_indexes
+
+ _, std_estim = self.residual_estimator.predict(X[cal_indexes], return_std=True)
+ residuals_pred = np.maximum(
+ std_estim,
+ self.eps
+ )
+ signed_conformity_scores = np.divide(
+ np.abs(np.subtract(y[cal_indexes], y_pred[cal_indexes])),
+ residuals_pred
+ )
+
+ # reconstruct array with nan and conformity scores
+ complete_signed_cs = np.full(
+ y_pred.shape, fill_value=np.nan, dtype=float
+ )
+ complete_signed_cs[cal_indexes] = signed_conformity_scores
+
+ return complete_signed_cs
+
+ def get_estimation_distribution(
+ self,
+ X: ArrayLike,
+ y_pred: ArrayLike,
+ conformity_scores: ArrayLike
+ ) -> NDArray:
+ """
+ Compute samples of the estimation distribution from the predicted
+ values and the conformity scores, from the following formula:
+ ``y_pred + conformity_scores * r_pred``.
+
+ The learning has been done with the log of the residual so we use the
+ exponential of the prediction to avoid negative values.
+
+ ``conformity_scores`` can be either the conformity scores or
+ the quantile of the conformity scores.
+ """
+ # import pdb; pdb.set_trace()
+ _, r_pred = self.residual_estimator_.predict(X, return_std=True)
+ # import pdb; pdb.set_trace()
+ print(conformity_scores)
+ return np.add(y_pred, np.multiply(conformity_scores, r_pred.reshape(-1, 1)))