diff --git a/src/psycop_model_training/model_eval/base_artifacts/plots/performance_over_time.py b/src/psycop_model_training/model_eval/base_artifacts/plots/performance_over_time.py index 10847472..823777bf 100644 --- a/src/psycop_model_training/model_eval/base_artifacts/plots/performance_over_time.py +++ b/src/psycop_model_training/model_eval/base_artifacts/plots/performance_over_time.py @@ -283,6 +283,7 @@ def create_performance_by_time_from_event_df( bins: Sequence[float], bin_continuous_input: Optional[bool] = True, drop_na_events: Optional[bool] = True, + min_n_in_bin: int = 5, ) -> pd.DataFrame: """Create dataframe for plotting performance metric from time to or from some event (e.g. time of diagnosis, time from first visit). @@ -298,6 +299,7 @@ def create_performance_by_time_from_event_df( bins (Iterable[float]): Bins to group by. bin_continuous_input (bool, optional): Whether to bin input. Defaults to True. drop_na_events (bool, optional): Whether to drop rows where the event is NA. Defaults to True. + min_n_in_bin (int, optional): Minimum number of rows in a bin to include in output. Defaults to 5. Returns: pd.DataFrame: Dataframe ready for plotting @@ -343,6 +345,7 @@ def create_performance_by_time_from_event_df( df["days_from_event_binned"], df["n_in_bin"] = bin_continuous_data( df["days_from_event"], bins=bins, + min_n_in_bin=min_n_in_bin, ) else: df["days_from_event_binned"] = round_floats_to_edge( @@ -460,6 +463,7 @@ def plot_metric_by_time_until_diagnosis( direction="event-prediction", bins=bins, bin_continuous_input=bin_continuous_input, + min_n_in_bin=0, drop_na_events=True, metric_fn=metric_fn, ) diff --git a/src/psycop_model_training/utils/utils.py b/src/psycop_model_training/utils/utils.py index 9af8128b..4ae44817 100644 --- a/src/psycop_model_training/utils/utils.py +++ b/src/psycop_model_training/utils/utils.py @@ -2,6 +2,7 @@ utilities. 
""" +import math import sys import tempfile from collections.abc import Iterable, MutableMapping, Sequence @@ -141,35 +142,21 @@ def bin_continuous_data( use_min_as_label (bool, optional): If True, the minimum value in the bin is used as the label. If False, the maximum value is used. Defaults to False. Returns: + Two ungrouped series, e.g. a row for each observation in the original dataset, each containing: + pd.Series: Binned categories for values in data pd.Series: Number of samples in binned category - - Example: - >>> ages = pd.Series([15, 18, 20, 30, 32, 40, 50, 60, 61]) - >>> age_bins = [0, 18, 30, 50, 110] - >>> bin_Age(ages, age_bins) - 0 0-18 - 1 0-18 - 2 19-30 - 3 19-30 - 4 31-50 - 5 31-50 - 6 31-50 - 7 51+ - 8 51+ """ labels = [] if not isinstance(bins, list): bins = list(bins) - # Handle if series is only NaNs - if series.isna().all(): - return pd.Series(np.nan), pd.Series(np.nan) - # Append maximum value from series to bins set upper cut-off if larger than maximum bins value - if int(series.max()) > max(bins): - bins.append(int(series.max())) + if not series.isna().all() and series.max() > max(bins): + # Round max value up + max_value_rounded = math.ceil(series.max()) + bins.append(max_value_rounded) # Create bin labels for i, bin_v in enumerate(bins): @@ -201,9 +188,18 @@ def bin_continuous_data( }, ) - df["n_in_bin"] = df.groupby("bin").transform("size") + # Drop all rows where bin is NaN + df = df.dropna() + + # Add a column with counts for the bin each row belongs to + df["n_in_bin"] = df.groupby("bin")["bin"].transform("count").reset_index(drop=True) - df = df.mask(df["n_in_bin"] < min_n_in_bin) + # Mask n_in_bin if less than min_n_in_bin + df["n_in_bin"] = ( + df["n_in_bin"] + .mask(df["n_in_bin"] < min_n_in_bin, np.nan) + .reset_index(drop=True) + ) return df["bin"], df["n_in_bin"] diff --git a/tests/test_utils.py b/tests/test_utils.py index 91f7a44d..3a820505 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -99,27 +99,30 @@ def 
test_bin_contiuous_data(): one_to_five = pd.Series([1, 2, 3, 4, 5]) # One bin, more than 5 - one_bin_more_than_five, _ = bin_continuous_data( + bins, samples_in_bins = bin_continuous_data( series=one_to_five, bins=[0, 5], ) - assert len(one_bin_more_than_five.unique()) == 1 - assert one_bin_more_than_five.isna().sum() == 0 + # Check that all values equal 1+ + assert bins.unique() == "1+" + assert samples_in_bins.unique() == 5 # One bin, less than 5 one_to_four = pd.Series([1, 2, 3, 4]) - one_bin_less_than_five, _ = bin_continuous_data(series=one_to_four, bins=[0, 5]) - assert one_bin_less_than_five.isna().sum() == 4 + bins, samples_in_bins = bin_continuous_data(series=one_to_four, bins=[0, 5]) + assert bins.unique() == "1+" + assert samples_in_bins.isna().all() # Two bins, less than 5 - two_bins_less_than_five, _ = bin_continuous_data(series=one_to_four, bins=[0, 2, 5]) - assert two_bins_less_than_five.isna().sum() == 4 + bins, samples_in_bins = bin_continuous_data(series=one_to_four, bins=[0, 2, 5]) + assert (bins.unique() == ["0-2", "3+"]).all() + assert samples_in_bins.isna().all() # Two bins, more than 5 - one_to_ten = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) - two_bins_more_than_five, _ = bin_continuous_data(series=one_to_ten, bins=[0, 5, 11]) - assert len(two_bins_more_than_five.unique()) == 2 - assert two_bins_more_than_five.isna().sum() == 0 + one_to_ten = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 10]) + bins, n_in_bins = bin_continuous_data(series=one_to_ten, bins=[0, 5, 11]) + assert (bins.unique() == ["0-5", "6+"]).all() + assert bins.isna().sum() == 0 # Series is only NaNs nans = pd.Series([np.nan, np.nan, np.nan])