Skip to content

Commit

Permalink
ENH: add crf for GP
Browse files Browse the repository at this point in the history
  • Loading branch information
vincentblot28 committed Jul 27, 2023
1 parent 0f3e7f9 commit 68e0feb
Showing 1 changed file with 257 additions and 0 deletions.
257 changes: 257 additions & 0 deletions mapie/conformity_scores/residual_conformity_scores.py
Original file line number Diff line number Diff line change
Expand Up @@ -441,3 +441,260 @@ def get_estimation_distribution(
)
else:
return np.add(y_pred, np.multiply(conformity_scores, r_pred))


class GPConformityScore(ConformityScore):
    """
    Gaussian-process (GP) based conformity score.

    The signed conformity score is ``|y - y_pred| / r_pred``, where
    ``r_pred`` is the predictive standard deviation returned by the
    residual estimator via ``predict(X, return_std=True)``.

    The residual estimator itself is fitted on the log of the absolute
    residuals of the base estimator, ``log(max(|y - y_pred|, eps))``,
    which keeps the regression target well-defined for (near-)zero
    residuals.

    The conformity score is symmetrical and allows the calculation of
    adaptive prediction intervals (taking X into account). It is
    possible to use it only with split and prefit methods (not with
    cross methods).

    Warning: if the estimator provided is not fitted, a subset of the
    calibration data will be used to fit the model (20% by default).

    Parameters
    ----------
    residual_estimator: Optional[RegressorMixin]
        The model that learns to predict the residuals of the base
        estimator. It must follow the scikit-learn API (``fit`` and
        ``predict``) and its ``predict`` must support
        ``return_std=True`` (e.g. a ``GaussianProcessRegressor``).
        If ``None``, estimator defaults to a ``LinearRegression``
        instance.
        NOTE(review): ``LinearRegression.predict`` does not accept
        ``return_std`` — the default estimator will fail at prediction
        time; confirm whether a GP-compatible default is intended.
    prefit: bool
        Specify if the ``residual_estimator`` is already fitted or not.
        By default ``False``.
    split_size: Optional[Union[int, float]]
        The proportion of data that is used to fit the
        ``residual_estimator``. By default it is the default value of
        ``sklearn.model_selection.train_test_split``, i.e. 0.2.
    random_state: Optional[Union[int, np.random.RandomState]]
        Pseudo random number used for random sampling.
        Pass an int for reproducible output across multiple function
        calls. By default ``None``.
    """

    def __init__(
        self,
        residual_estimator: Optional[RegressorMixin] = None,
        prefit: bool = False,
        split_size: Optional[Union[int, float]] = None,
        random_state: Optional[Union[int, np.random.RandomState]] = None,
        sym: bool = True,
        consistency_check: bool = False
    ) -> None:
        super().__init__(sym=sym, consistency_check=consistency_check)
        self.prefit = prefit
        self.residual_estimator = residual_estimator
        self.split_size = split_size
        self.random_state = random_state

    def _check_estimator(
        self, estimator: Optional[RegressorMixin] = None
    ) -> RegressorMixin:
        """
        Check if estimator is ``None``, and return a
        ``LinearRegression`` instance if necessary.
        If the ``prefit`` attribute is ``True``, check that the
        estimator is indeed already fitted.

        Parameters
        ----------
        estimator: Optional[RegressorMixin]
            Estimator to check, by default ``None``.

        Returns
        -------
        RegressorMixin
            The estimator itself or a default ``LinearRegression``
            instance.

        Raises
        ------
        ValueError
            If the estimator is not ``None``
            and has no ``fit`` nor ``predict`` methods.
        NotFittedError
            If the estimator is not fitted
            and ``prefit`` attribute is ``True``.
        """
        if estimator is None:
            return LinearRegression()
        if not (hasattr(estimator, "fit") and
                hasattr(estimator, "predict")):
            raise ValueError(
                "Invalid estimator. "
                "Please provide a regressor with fit and predict methods."
            )
        if self.prefit:
            # For pipelines, fittedness is checked on the final step only.
            if isinstance(estimator, Pipeline):
                check_is_fitted(estimator[-1])
            else:
                check_is_fitted(estimator)
        return estimator

    def _check_parameters(
        self,
        X: ArrayLike,
        y: ArrayLike,
        y_pred: ArrayLike
    ) -> Tuple[NDArray, NDArray, NDArray, RegressorMixin,
               Union[int, np.random.RandomState]]:
        """
        Check all the parameters of the class. Raises an error if the
        parameters are not well defined.

        Parameters
        ----------
        X: ArrayLike
            Observed values.
        y: ArrayLike
            Target values.
        y_pred: ArrayLike
            Predicted targets.

        Returns
        -------
        Tuple[NDArray, NDArray, NDArray, RegressorMixin,
              Union[int, np.random.RandomState]]
            Well initiated and typed:
            - X
            - y
            - y_pred
            - residual_estimator
            - random_state
        """
        residual_estimator = self._check_estimator(
            self.residual_estimator
        )
        random_state = check_random_state(self.random_state)
        X, y, y_pred = indexable(X, y, y_pred)
        X = np.array(X)
        y = np.array(y)
        y_pred = np.array(y_pred)
        return X, y, y_pred, residual_estimator, random_state

    def _fit_residual_estimator(
        self,
        residual_estimator_: RegressorMixin,
        X: NDArray,
        y: NDArray,
        y_pred: NDArray,
    ) -> RegressorMixin:
        """
        Fit the residual estimator on the log of the absolute residuals
        of the base estimator.

        Parameters
        ----------
        residual_estimator_: RegressorMixin
            The (unfitted) residual estimator to fit.
        X: NDArray
            All the observed values used in the general fit.
        y: NDArray
            All the observed targets used in the general fit.
        y_pred: NDArray
            Predicted targets.

        Returns
        -------
        RegressorMixin
            Fitted residual estimator.
        """
        residuals = np.abs(np.subtract(y, y_pred))
        # Clip residuals at eps before taking the log so that exact
        # zeros do not produce -inf targets.
        # NOTE(review): ``self.eps`` is not defined in this class —
        # presumably inherited from ``ConformityScore``; confirm.
        targets = np.log(np.maximum(
            residuals,
            np.full(residuals.shape, self.eps)
        ))

        residual_estimator_ = residual_estimator_.fit(X, targets)

        return residual_estimator_

    def get_signed_conformity_scores(
        self,
        X: ArrayLike,
        y: ArrayLike,
        y_pred: ArrayLike
    ) -> NDArray:
        """
        Compute the signed conformity score = |y - y_pred| / r_pred,
        where r_pred is the predictive standard deviation returned by
        the fitted residual estimator (``residual_estimator_``) via
        ``predict(X, return_std=True)``.

        Entries of ``y_pred`` that are NaN (not predicted) keep NaN as
        their conformity score.

        Parameters
        ----------
        X: ArrayLike
            Observed values.
        y: ArrayLike
            Target values.
        y_pred: ArrayLike
            Predicted targets.

        Returns
        -------
        NDArray
            Signed conformity scores, with NaN where ``y_pred`` is NaN.
        """
        (X, y, y_pred,
         self.residual_estimator_,
         random_state) = self._check_parameters(X, y, y_pred)

        # Only rows actually predicted by the base estimator are usable.
        full_indexes = np.argwhere(
            np.logical_not(np.isnan(y_pred))
        ).reshape((-1,))

        if not self.prefit:
            # Split the usable rows: one part fits the residual
            # estimator, the other is kept for calibration.
            cal_indexes, res_indexes = train_test_split(
                full_indexes,
                test_size=self.split_size,
                random_state=random_state,
            )
            self.residual_estimator_ = self._fit_residual_estimator(
                clone(self.residual_estimator_),
                X[res_indexes], y[res_indexes], y_pred[res_indexes]
            )
        else:
            cal_indexes = full_indexes

        # Use the checked/fitted estimator (``residual_estimator_``),
        # not the raw constructor argument, which is never fitted in
        # the non-prefit branch (fitting is done on a clone).
        _, std_estim = self.residual_estimator_.predict(
            X[cal_indexes], return_std=True
        )
        # Floor the predicted std at eps to avoid division by zero.
        residuals_pred = np.maximum(
            std_estim,
            self.eps
        )
        signed_conformity_scores = np.divide(
            np.abs(np.subtract(y[cal_indexes], y_pred[cal_indexes])),
            residuals_pred
        )

        # Reconstruct a full-size array: NaN for rows not calibrated,
        # conformity scores elsewhere.
        complete_signed_cs = np.full(
            y_pred.shape, fill_value=np.nan, dtype=float
        )
        complete_signed_cs[cal_indexes] = signed_conformity_scores

        return complete_signed_cs

    def get_estimation_distribution(
        self,
        X: ArrayLike,
        y_pred: ArrayLike,
        conformity_scores: ArrayLike
    ) -> NDArray:
        """
        Compute samples of the estimation distribution from the
        predicted values and the conformity scores, from the following
        formula: ``y_pred + conformity_scores * r_pred``, where
        ``r_pred`` is the predictive standard deviation of the fitted
        residual estimator.

        ``conformity_scores`` can be either the conformity scores or
        the quantile of the conformity scores.

        Parameters
        ----------
        X: ArrayLike
            Observed values.
        y_pred: ArrayLike
            Predicted targets.
        conformity_scores: ArrayLike
            Conformity scores (or their quantile).

        Returns
        -------
        NDArray
            Samples of the estimation distribution.
        """
        _, r_pred = self.residual_estimator_.predict(X, return_std=True)
        # Reshape to a column so the scaling broadcasts over the
        # conformity-score columns.
        return np.add(
            y_pred, np.multiply(conformity_scores, r_pred.reshape(-1, 1))
        )

0 comments on commit 68e0feb

Please sign in to comment.