From 867a386ed740d5b9b973020bda11728ccc540183 Mon Sep 17 00:00:00 2001
From: Candice Moyet <cmoyet@quantmetry.com>
Date: Mon, 24 Jul 2023 17:49:29 +0200
Subject: [PATCH] ENH: simplify NaN handling in CRF conformity score

---
 .../residual_conformity_scores.py             | 65 ++++++-------------
 mapie/tests/test_conformity_scores.py         |  6 +-
 2 files changed, 23 insertions(+), 48 deletions(-)

diff --git a/mapie/conformity_scores/residual_conformity_scores.py b/mapie/conformity_scores/residual_conformity_scores.py
index 679416df..ed22d0a9 100644
--- a/mapie/conformity_scores/residual_conformity_scores.py
+++ b/mapie/conformity_scores/residual_conformity_scores.py
@@ -158,7 +158,7 @@ class ConformalResidualFittingScore(ConformityScore):
     only with split and prefit methods (not with cross methods).
 
     Warning : if the estimator provided is not fitted a subset of the
-    calibration data will be used to fit the model (50% by default).
+    calibration data will be used to fit the model (20% by default).
 
     Parameters
     ----------
@@ -291,8 +291,6 @@ def _fit_residual_estimator(
         X: NDArray,
         y: NDArray,
         y_pred: NDArray,
-        full_indexes: NDArray,
-        random_state: Optional[Union[int, np.random.RandomState]]
     ) -> Tuple[NDArray, NDArray]:
         """
         Fit the residual estimator and returns the indexes used for the
@@ -309,47 +307,20 @@ def _fit_residual_estimator(
         y_pred: NDArray
             Predicted targets.
 
-        full_indexes: NDArray
-            Indexes used for the training of the estimator and the calibration.
-
-        random_state: Optional[Union[int, np.random.RandomState]]
-            Random state.
         Returns
         -------
-        Tuple[NDArray, NDArray]
-            - indexes needed for the calibration.
-            - indexes used for the training of the base estimator.
+        RegressorMixin
+            The residual estimator, fitted on log(max(|y - y_pred|, eps)).
         """
-        (X_res_indexes,
-         X_cal_indexes,
-         y_res_indexes,
-         y_cal_indexes) = train_test_split(
-            full_indexes,
-            full_indexes,
-            test_size=self.split_size,
-            random_state=random_state,
-        )
-
-        residuals = np.abs(np.subtract(
-            y[y_res_indexes],
-            y_pred[y_res_indexes]
-        ))
-        residual_estimator_targets = np.log(np.maximum(
+        residuals = np.abs(np.subtract(y, y_pred))
+        targets = np.log(np.maximum(
             residuals,
             np.full(residuals.shape, self.eps)
         ))
 
-        residual_estimator_ = residual_estimator_.fit(
-            X[X_res_indexes],
-            residual_estimator_targets
-        )
+        residual_estimator_ = residual_estimator_.fit(X, targets)
 
-        cal_index = X_cal_indexes
-        train_index = list(set(np.arange(y_pred.shape[0])) - set(cal_index))
-
-        self.residual_estimator_ = residual_estimator_
-
-        return cal_index, np.array(train_index)
+        return residual_estimator_
 
     def get_signed_conformity_scores(
         self,
@@ -375,28 +346,34 @@ def get_signed_conformity_scores(
         ).reshape((-1,))
 
         if not self.prefit:
-            cal_indexes, train_indexes = self._fit_residual_estimator(
-                clone(self.residual_estimator_), X, y, y_pred, full_indexes,
-                random_state
+            cal_indexes, res_indexes = train_test_split(
+                full_indexes,
+                test_size=self.split_size,
+                random_state=random_state,
+            )
+            self.residual_estimator_ = self._fit_residual_estimator(
+                clone(self.residual_estimator_),
+                X[res_indexes], y[res_indexes], y_pred[res_indexes]
             )
         else:
             cal_indexes = full_indexes
-            train_indexes = np.argwhere(np.isnan(y_pred)).reshape((-1,))
 
         residuals_pred = np.maximum(
             np.exp(self.residual_estimator_.predict(X[cal_indexes])),
             self.eps
         )
         signed_conformity_scores = np.divide(
-            np.subtract(y[cal_indexes], y_pred[cal_indexes]),
+            np.abs(np.subtract(y[cal_indexes], y_pred[cal_indexes])),
             residuals_pred
         )
 
         # reconstruct array with nan and conformity scores
-        complete_signed_cs = np.zeros_like(y_pred, dtype=float)
+        complete_signed_cs = np.full(
+            y_pred.shape, fill_value=np.nan, dtype=float
+        )
         complete_signed_cs[cal_indexes] = signed_conformity_scores
-        complete_signed_cs[train_indexes] = np.nan
-        return signed_conformity_scores
+
+        return complete_signed_cs
 
     def get_estimation_distribution(
         self,
diff --git a/mapie/tests/test_conformity_scores.py b/mapie/tests/test_conformity_scores.py
index 3d1e6704..e8a819e9 100644
--- a/mapie/tests/test_conformity_scores.py
+++ b/mapie/tests/test_conformity_scores.py
@@ -2,7 +2,6 @@
 import pytest
 
 from sklearn.linear_model import LinearRegression
-from sklearn.model_selection import train_test_split
 from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import PolynomialFeatures
 
@@ -254,7 +253,7 @@ def test_crf_conformity_score_get_conformity_scores(y_pred: NDArray) -> None:
         X_toy, y_toy, y_pred
     )
     expected_signed_conf_scores = np.array(
-        [0.38167789, 0.]
+        [np.nan, np.nan, 1.e+08, 1.e+08, 0.e+00, 3.e+08]
     )
     np.testing.assert_allclose(conf_scores, expected_signed_conf_scores)
 
@@ -276,8 +275,7 @@ def test_crf_score_prefit_with_default_params() -> None:
     conf_scores = crf_conf_score.get_conformity_scores(
         X_toy, y_toy, y_pred_list
     )
-    _, X, _, y = train_test_split(X_toy, y_toy, test_size=0.2)
-    crf_conf_score.get_estimation_distribution(X, y, conf_scores)
+    crf_conf_score.get_estimation_distribution(X_toy, y_toy, conf_scores)
 
 
 def test_invalid_estimator() -> None: