Skip to content
This repository has been archived by the owner on May 1, 2023. It is now read-only.

Commit

Permalink
fix: handle nas in bins (#418)
Browse files Browse the repository at this point in the history
- [ ] I have battle-tested on Overtaci (RMAPPS1279)
- [ ] I have assigned ranges (e.g. `>=0.1, <0.2`) to all new
dependencies (allows dependabot to keep dependency ranges wide for
better compatibility)
- [ ] At least one of the commits is prefixed with either "fix:" or
"feat:"

## Notes for reviewers
Reviewers can skip X, but should pay attention to Y.
  • Loading branch information
MartinBernstorff authored Mar 10, 2023
2 parents bca51d5 + 98df699 commit ee4ab7f
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 33 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -283,6 +283,7 @@ def create_performance_by_time_from_event_df(
bins: Sequence[float],
bin_continuous_input: Optional[bool] = True,
drop_na_events: Optional[bool] = True,
min_n_in_bin: int = 5,
) -> pd.DataFrame:
"""Create dataframe for plotting performance metric from time to or from
some event (e.g. time of diagnosis, time from first visit).
Expand All @@ -298,6 +299,7 @@ def create_performance_by_time_from_event_df(
bins (Iterable[float]): Bins to group by.
bin_continuous_input (bool, optional): Whether to bin input. Defaults to True.
drop_na_events (bool, optional): Whether to drop rows where the event is NA. Defaults to True.
min_n_in_bin (int, optional): Minimum number of rows in a bin to include in output. Defaults to 5.
Returns:
pd.DataFrame: Dataframe ready for plotting
Expand Down Expand Up @@ -343,6 +345,7 @@ def create_performance_by_time_from_event_df(
df["days_from_event_binned"], df["n_in_bin"] = bin_continuous_data(
df["days_from_event"],
bins=bins,
min_n_in_bin=min_n_in_bin,
)
else:
df["days_from_event_binned"] = round_floats_to_edge(
Expand Down Expand Up @@ -460,6 +463,7 @@ def plot_metric_by_time_until_diagnosis(
direction="event-prediction",
bins=bins,
bin_continuous_input=bin_continuous_input,
min_n_in_bin=0,
drop_na_events=True,
metric_fn=metric_fn,
)
Expand Down
40 changes: 18 additions & 22 deletions src/psycop_model_training/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
utilities.
"""
import math
import sys
import tempfile
from collections.abc import Iterable, MutableMapping, Sequence
Expand Down Expand Up @@ -141,35 +142,21 @@ def bin_continuous_data(
use_min_as_label (bool, optional): If True, the minimum value in the bin is used as the label. If False, the maximum value is used. Defaults to False.
Returns:
Two ungrouped series, i.e. a row for each observation in the original dataset, each containing:
pd.Series: Binned categories for values in data
pd.Series: Number of samples in binned category
Example:
>>> ages = pd.Series([15, 18, 20, 30, 32, 40, 50, 60, 61])
>>> age_bins = [0, 18, 30, 50, 110]
>>> bin_continuous_data(ages, age_bins)
0 0-18
1 0-18
2 19-30
3 19-30
4 31-50
5 31-50
6 31-50
7 51+
8 51+
"""
labels = []

if not isinstance(bins, list):
bins = list(bins)

# Handle if series is only NaNs
if series.isna().all():
return pd.Series(np.nan), pd.Series(np.nan)

# Append maximum value from series to bins set upper cut-off if larger than maximum bins value
if int(series.max()) > max(bins):
bins.append(int(series.max()))
if not series.isna().all() and series.max() > max(bins):
# Round max value up
max_value_rounded = math.ceil(series.max())
bins.append(max_value_rounded)

# Create bin labels
for i, bin_v in enumerate(bins):
Expand Down Expand Up @@ -201,9 +188,18 @@ def bin_continuous_data(
},
)

df["n_in_bin"] = df.groupby("bin").transform("size")
# Drop all rows where bin is NaN
df = df.dropna()

# Add a column with counts for the bin each row belongs to
df["n_in_bin"] = df.groupby("bin")["bin"].transform("count").reset_index(drop=True)

df = df.mask(df["n_in_bin"] < min_n_in_bin)
# Mask n_in_bin if less than min_n_in_bin
df["n_in_bin"] = (
df["n_in_bin"]
.mask(df["n_in_bin"] < min_n_in_bin, np.nan)
.reset_index(drop=True)
)

return df["bin"], df["n_in_bin"]

Expand Down
25 changes: 14 additions & 11 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,27 +99,30 @@ def test_bin_contiuous_data():
one_to_five = pd.Series([1, 2, 3, 4, 5])

# One bin, more than 5
one_bin_more_than_five, _ = bin_continuous_data(
bins, samples_in_bins = bin_continuous_data(
series=one_to_five,
bins=[0, 5],
)
assert len(one_bin_more_than_five.unique()) == 1
assert one_bin_more_than_five.isna().sum() == 0
# Check that all values equal 1+
assert bins.unique() == "1+"
assert samples_in_bins.unique() == 5

# One bin, less than 5
one_to_four = pd.Series([1, 2, 3, 4])
one_bin_less_than_five, _ = bin_continuous_data(series=one_to_four, bins=[0, 5])
assert one_bin_less_than_five.isna().sum() == 4
bins, samples_in_bins = bin_continuous_data(series=one_to_four, bins=[0, 5])
assert bins.unique() == "1+"
assert samples_in_bins.isna().all()

# Two bins, less than 5
two_bins_less_than_five, _ = bin_continuous_data(series=one_to_four, bins=[0, 2, 5])
assert two_bins_less_than_five.isna().sum() == 4
bins, samples_in_bins = bin_continuous_data(series=one_to_four, bins=[0, 2, 5])
assert (bins.unique() == ["0-2", "3+"]).all()
assert samples_in_bins.isna().all()

# Two bins, more than 5
one_to_ten = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
two_bins_more_than_five, _ = bin_continuous_data(series=one_to_ten, bins=[0, 5, 11])
assert len(two_bins_more_than_five.unique()) == 2
assert two_bins_more_than_five.isna().sum() == 0
one_to_ten = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 10])
bins, n_in_bins = bin_continuous_data(series=one_to_ten, bins=[0, 5, 11])
assert (bins.unique() == ["0-5", "6+"]).all()
assert bins.isna().sum() == 0

# Series is only NaNs
nans = pd.Series([np.nan, np.nan, np.nan])
Expand Down

0 comments on commit ee4ab7f

Please sign in to comment.