Skip to content

Commit

Permalink
refactoring and extracting to new lower level function
Browse files Browse the repository at this point in the history
  • Loading branch information
Jday7879 committed Jul 16, 2024
1 parent 99564c6 commit 96f8255
Showing 1 changed file with 69 additions and 13 deletions.
82 changes: 69 additions & 13 deletions mbs_results/calculate_imputation_link.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,15 +46,6 @@ def calculate_imputation_link(

df_intermediate = df.copy()

# Need this for edge case when denominator is zero
# But is from a valid zero return
df_intermediate["true_zeros_column"] = df_intermediate[predictive_variable] == 0
grouped_true_zeros = df_intermediate.groupby([strata, period])[
"true_zeros_column"
].transform("sum")
grouped_true_zeros = grouped_true_zeros != 0
df_intermediate.drop(columns=["true_zeros_column"])

df_intermediate[target] = df_intermediate[target] * df_intermediate[match_col]

df_intermediate[predictive_variable] = (
Expand All @@ -67,11 +58,76 @@ def calculate_imputation_link(
predictive_variable
].transform("sum")

df["denominator"] = denominator
denominator_col = "denominator"

df[denominator_col] = denominator
denominator.replace(0, np.nan, inplace=True) # cover division with 0

df[link_col] = numerator / denominator

df = calculate_default_imputation_links(
df, period, strata, match_col, predictive_variable, link_col, denominator_col
)

return df


def calculate_default_imputation_links(
df: pd.DataFrame,
period: str,
strata: str,
match_col: str,
predictive_variable: str,
link_col: str,
denominator: str,
) -> pd.DataFrame:
"""
Calculates default values and uses them to replace links when either
the denominator is zero or the link cannot be calculated.
In both cases the link is replaced with 1.
Matched pair counts are replaced with either 0 (denominator zero)
or null (link cannot be calculated).
Parameters
----------
df : pd.DataFrame
Original dataframe.
period : str
Column name containing time period.
strata : str
Column name containing strata information (sic).
match_col : str
Column name of the matched pair links, this column should be bool.
predictive_variable : str
Column name of the predicted target variable.
link_col : str
Name to use for the new column containing imputation link
denominator : str
Name used for the column containing the denominators of the link
calculation
Returns
-------
df : pd.DataFrame
A pandas DataFrame with default values overwriting values in
imputation link columns.
"""

df_intermediate = df.copy()

# Need this for the edge case when the denominator is zero
# but comes from a valid zero return, i.e. a returned zero and
# not a zero filled in from an empty or null value
df_intermediate["return_in_predictive_is_zero"] = (
df_intermediate[predictive_variable] == 0
)
grouped_true_zeros = df_intermediate.groupby([strata, period])[
"return_in_predictive_is_zero"
].transform("sum")
grouped_true_zeros = grouped_true_zeros != 0
df_intermediate.drop(columns=["return_in_predictive_is_zero"])

# Re-adding the count matches column, as this is needed for the default cases.
# This count is just the filtered target counts if issues come up.
# (If there is a filter applied to this data)
Expand All @@ -83,9 +139,9 @@ def calculate_imputation_link(
)
# Creating two logical masks for cases when denominator is zero and
# link cannot be calculated
mask_denominator_zero = df["denominator"] == 0
mask_denominator_zero = df[denominator] == 0
mask_cannot_calculate = ((df[link_col].isna()) | (np.isinf(df[link_col]))) & (
df["denominator"] != 0
df[denominator] != 0
)
# Default link is always 1:
df.loc[(mask_cannot_calculate | mask_denominator_zero), link_col] = 1
Expand All @@ -99,5 +155,5 @@ def calculate_imputation_link(
df["default_link_" + match_col] = np.where(
(mask_cannot_calculate | mask_denominator_zero), True, False
)
df.drop(columns=["denominator"], inplace=True)
df.drop(columns=[denominator], inplace=True)
return df

0 comments on commit 96f8255

Please sign in to comment.