Skip to content
This repository has been archived by the owner on May 1, 2023. It is now read-only.

Commit

Permalink
fix: handle nas in bins (#418)
Browse files Browse the repository at this point in the history
- [ ] I have battle-tested on Overtaci (RMAPPS1279)
- [ ] I have assigned ranges (e.g. `>=0.1, <0.2`) to all new
dependencies (allows dependabot to keep dependency ranges wide for
better compatibility)
- [ ] At least one of the commits is prefixed with either "fix:" or
"feat:"

## Notes for reviewers
Reviewers can skip X, but should pay attention to Y.
  • Loading branch information
MartinBernstorff authored Mar 10, 2023
2 parents bca51d5 + 98df699 commit ee4ab7f
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 33 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -283,6 +283,7 @@ def create_performance_by_time_from_event_df(
bins: Sequence[float],
bin_continuous_input: Optional[bool] = True,
drop_na_events: Optional[bool] = True,
min_n_in_bin: int = 5,
) -> pd.DataFrame:
"""Create dataframe for plotting performance metric from time to or from
some event (e.g. time of diagnosis, time from first visit).
Expand All @@ -298,6 +299,7 @@ def create_performance_by_time_from_event_df(
bins (Iterable[float]): Bins to group by.
bin_continuous_input (bool, optional): Whether to bin input. Defaults to True.
drop_na_events (bool, optional): Whether to drop rows where the event is NA. Defaults to True.
min_n_in_bin (int, optional): Minimum number of rows in a bin to include in output. Defaults to 5.
Returns:
pd.DataFrame: Dataframe ready for plotting
Expand Down Expand Up @@ -343,6 +345,7 @@ def create_performance_by_time_from_event_df(
df["days_from_event_binned"], df["n_in_bin"] = bin_continuous_data(
df["days_from_event"],
bins=bins,
min_n_in_bin=min_n_in_bin,
)
else:
df["days_from_event_binned"] = round_floats_to_edge(
Expand Down Expand Up @@ -460,6 +463,7 @@ def plot_metric_by_time_until_diagnosis(
direction="event-prediction",
bins=bins,
bin_continuous_input=bin_continuous_input,
min_n_in_bin=0,
drop_na_events=True,
metric_fn=metric_fn,
)
Expand Down
40 changes: 18 additions & 22 deletions src/psycop_model_training/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
utilities.
"""
import math
import sys
import tempfile
from collections.abc import Iterable, MutableMapping, Sequence
Expand Down Expand Up @@ -141,35 +142,21 @@ def bin_continuous_data(
use_min_as_label (bool, optional): If True, the minimum value in the bin is used as the label. If False, the maximum value is used. Defaults to False.
Returns:
Two ungrouped series, i.e. a row for each observation in the original dataset, each containing:
pd.Series: Binned categories for values in data
pd.Series: Number of samples in binned category
Example:
>>> ages = pd.Series([15, 18, 20, 30, 32, 40, 50, 60, 61])
>>> age_bins = [0, 18, 30, 50, 110]
>>> bin_continuous_data(ages, age_bins)
0 0-18
1 0-18
2 19-30
3 19-30
4 31-50
5 31-50
6 31-50
7 51+
8 51+
"""
labels = []

if not isinstance(bins, list):
bins = list(bins)

# Handle if series is only NaNs
if series.isna().all():
return pd.Series(np.nan), pd.Series(np.nan)

# Append maximum value from series to bins set upper cut-off if larger than maximum bins value
if int(series.max()) > max(bins):
bins.append(int(series.max()))
if not series.isna().all() and series.max() > max(bins):
# Round max value up
max_value_rounded = math.ceil(series.max())
bins.append(max_value_rounded)

# Create bin labels
for i, bin_v in enumerate(bins):
Expand Down Expand Up @@ -201,9 +188,18 @@ def bin_continuous_data(
},
)

df["n_in_bin"] = df.groupby("bin").transform("size")
# Drop all rows where bin is NaN
df = df.dropna()

# Add a column with counts for the bin each row belongs to
df["n_in_bin"] = df.groupby("bin")["bin"].transform("count").reset_index(drop=True)

df = df.mask(df["n_in_bin"] < min_n_in_bin)
# Mask n_in_bin if less than min_n_in_bin
df["n_in_bin"] = (
df["n_in_bin"]
.mask(df["n_in_bin"] < min_n_in_bin, np.nan)
.reset_index(drop=True)
)

return df["bin"], df["n_in_bin"]

Expand Down
25 changes: 14 additions & 11 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,27 +99,30 @@ def test_bin_contiuous_data():
one_to_five = pd.Series([1, 2, 3, 4, 5])

# One bin, more than 5
one_bin_more_than_five, _ = bin_continuous_data(
bins, samples_in_bins = bin_continuous_data(
series=one_to_five,
bins=[0, 5],
)
assert len(one_bin_more_than_five.unique()) == 1
assert one_bin_more_than_five.isna().sum() == 0
# Check that all values equal 1+
assert bins.unique() == "1+"
assert samples_in_bins.unique() == 5

# One bin, less than 5
one_to_four = pd.Series([1, 2, 3, 4])
one_bin_less_than_five, _ = bin_continuous_data(series=one_to_four, bins=[0, 5])
assert one_bin_less_than_five.isna().sum() == 4
bins, samples_in_bins = bin_continuous_data(series=one_to_four, bins=[0, 5])
assert bins.unique() == "1+"
assert samples_in_bins.isna().all()

# Two bins, less than 5
two_bins_less_than_five, _ = bin_continuous_data(series=one_to_four, bins=[0, 2, 5])
assert two_bins_less_than_five.isna().sum() == 4
bins, samples_in_bins = bin_continuous_data(series=one_to_four, bins=[0, 2, 5])
assert (bins.unique() == ["0-2", "3+"]).all()
assert samples_in_bins.isna().all()

# Two bins, more than 5
one_to_ten = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
two_bins_more_than_five, _ = bin_continuous_data(series=one_to_ten, bins=[0, 5, 11])
assert len(two_bins_more_than_five.unique()) == 2
assert two_bins_more_than_five.isna().sum() == 0
one_to_ten = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 10])
bins, n_in_bins = bin_continuous_data(series=one_to_ten, bins=[0, 5, 11])
assert (bins.unique() == ["0-5", "6+"]).all()
assert bins.isna().sum() == 0

# Series is only NaNs
nans = pd.Series([np.nan, np.nan, np.nan])
Expand Down

0 comments on commit ee4ab7f

Please sign in to comment.