From 867a386ed740d5b9b973020bda11728ccc540183 Mon Sep 17 00:00:00 2001 From: Candice Moyet Date: Mon, 24 Jul 2023 17:49:29 +0200 Subject: [PATCH] ENH: beautify handling of nan in crf score --- .../residual_conformity_scores.py | 65 ++++++------------- mapie/tests/test_conformity_scores.py | 6 +- 2 files changed, 23 insertions(+), 48 deletions(-) diff --git a/mapie/conformity_scores/residual_conformity_scores.py b/mapie/conformity_scores/residual_conformity_scores.py index 679416df..ed22d0a9 100644 --- a/mapie/conformity_scores/residual_conformity_scores.py +++ b/mapie/conformity_scores/residual_conformity_scores.py @@ -158,7 +158,7 @@ class ConformalResidualFittingScore(ConformityScore): only with split and prefit methods (not with cross methods). Warning : if the estimator provided is not fitted a subset of the - calibration data will be used to fit the model (50% by default). + calibration data will be used to fit the model (20% by default). Parameters ---------- @@ -291,8 +291,6 @@ def _fit_residual_estimator( X: NDArray, y: NDArray, y_pred: NDArray, - full_indexes: NDArray, - random_state: Optional[Union[int, np.random.RandomState]] ) -> Tuple[NDArray, NDArray]: """ Fit the residual estimator and returns the indexes used for the @@ -309,47 +307,20 @@ def _fit_residual_estimator( y_pred: NDArray Predicted targets. - full_indexes: NDArray - Indexes used for the training of the estimator and the calibration. - - random_state: Optional[Union[int, np.random.RandomState]] - Random state. Returns ------- - Tuple[NDArray, NDArray] - - indexes needed for the calibration. - - indexes used for the training of the base estimator. + RegressorMixin + Fitted residual estimator """ - (X_res_indexes, - X_cal_indexes, - y_res_indexes, - y_cal_indexes) = train_test_split( - full_indexes, - full_indexes, - test_size=self.split_size, - random_state=random_state, - ) - - residuals = np.abs(np.subtract( - y[y_res_indexes], - y_pred[y_res_indexes] - )) - residual_estimator_targets = np.log(np.maximum( + residuals = np.abs(np.subtract(y, y_pred)) + targets = np.log(np.maximum( residuals, np.full(residuals.shape, self.eps) )) - residual_estimator_ = residual_estimator_.fit( - X[X_res_indexes], - residual_estimator_targets - ) + residual_estimator_ = residual_estimator_.fit(X, targets) - cal_index = X_cal_indexes - train_index = list(set(np.arange(y_pred.shape[0])) - set(cal_index)) - - self.residual_estimator_ = residual_estimator_ - - return cal_index, np.array(train_index) + return residual_estimator_ def get_signed_conformity_scores( self, @@ -375,28 +346,34 @@ def get_signed_conformity_scores( ).reshape((-1,)) if not self.prefit: - cal_indexes, train_indexes = self._fit_residual_estimator( - clone(self.residual_estimator_), X, y, y_pred, full_indexes, - random_state + cal_indexes, res_indexes = train_test_split( + full_indexes, + test_size=self.split_size, + random_state=random_state, + ) + self.residual_estimator_ = self._fit_residual_estimator( + clone(self.residual_estimator_), + X[res_indexes], y[res_indexes], y_pred[res_indexes] ) else: cal_indexes = full_indexes - train_indexes = np.argwhere(np.isnan(y_pred)).reshape((-1,)) residuals_pred = np.maximum( np.exp(self.residual_estimator_.predict(X[cal_indexes])), self.eps ) signed_conformity_scores = np.divide( - np.subtract(y[cal_indexes], y_pred[cal_indexes]), + np.abs(np.subtract(y[cal_indexes], y_pred[cal_indexes])), residuals_pred ) # reconstruct array with nan and conformity scores - complete_signed_cs = np.zeros_like(y_pred, dtype=float) + complete_signed_cs = np.full( + y_pred.shape, fill_value=np.nan, dtype=float + ) complete_signed_cs[cal_indexes] = signed_conformity_scores - complete_signed_cs[train_indexes] = np.nan - return signed_conformity_scores + + return complete_signed_cs def get_estimation_distribution( self, diff --git a/mapie/tests/test_conformity_scores.py b/mapie/tests/test_conformity_scores.py index 3d1e6704..e8a819e9 100644 --- a/mapie/tests/test_conformity_scores.py +++ b/mapie/tests/test_conformity_scores.py @@ -2,7 +2,6 @@ import pytest from sklearn.linear_model import LinearRegression -from sklearn.model_selection import train_test_split from sklearn.pipeline import Pipeline from sklearn.preprocessing import PolynomialFeatures @@ -254,7 +253,7 @@ def test_crf_conformity_score_get_conformity_scores(y_pred: NDArray) -> None: X_toy, y_toy, y_pred ) expected_signed_conf_scores = np.array( - [0.38167789, 0.] + [np.nan, np.nan, 1.e+08, 1.e+08, 0.e+00, 3.e+08] ) np.testing.assert_allclose(conf_scores, expected_signed_conf_scores) @@ -276,8 +275,7 @@ def test_crf_score_prefit_with_default_params() -> None: conf_scores = crf_conf_score.get_conformity_scores( X_toy, y_toy, y_pred_list ) - _, X, _, y = train_test_split(X_toy, y_toy, test_size=0.2) - crf_conf_score.get_estimation_distribution(X, y, conf_scores) + crf_conf_score.get_estimation_distribution(X_toy, y_toy, conf_scores) def test_invalid_estimator() -> None: