Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

479 o weight constraints #70

Merged
merged 13 commits into from
Aug 9, 2024
221 changes: 212 additions & 9 deletions mbs_results/constrains.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,6 @@ def sum_sub_df(df: pd.DataFrame, derive_from: List[int]) -> pd.DataFrame:
A dataframe with sums, constain marker, and columns from index which the
sum was based on.
"""

sums = sum(
[df.loc[question_no] for question_no in derive_from if question_no in df.index]
)
Expand Down Expand Up @@ -136,12 +135,7 @@ def constrain(
Original dataframe with constrains.
"""

derive_map = {
13: {"derive": 40, "from": [46, 47]},
14: {"derive": 40, "from": [42, 43]},
15: {"derive": 46, "from": [40]},
16: {"derive": 42, "from": [40]},
}
derive_map = create_derive_map(df, spp_form_id)

# pre_derive_df has dimenesions as index, columns the values to be used when derived
pre_derive_df = df.set_index(
Expand All @@ -160,11 +154,220 @@ def constrain(

df.set_index([question_no, period, reference], inplace=True)

replace_values_index_based(df, target_imputed, 49, ">", 40)
replace_values_index_based(df, target_imputed, 90, ">=", 40)
if 49 in df[question_no].unique():
replace_values_index_based(df, target_imputed, 49, ">", 40)
elif 90 in df[question_no].unique():
replace_values_index_based(df, target_imputed, 90, ">=", 40)

df.reset_index(inplace=True)

final_constrained = pd.concat([df, derived_values])

return final_constrained


def derive_questions(
    df: pd.DataFrame,
    period: str,
    reference: str,
    target: str,
    question_no: str,
    spp_form_id: str,
) -> pd.DataFrame:
    """
    Derive question values from a single target column and append them.

    Same functionality as the constraints step, but works on one target
    column instead of two; the two could be combined in a later refactor.
    Used to recompute derived questions post winsorisation.
    ASSUMES DEFAULT O-WEIGHT IS 1.

    Notes
    -----
    `df` is re-indexed in place (``set_index`` / ``reset_index`` with
    ``inplace=True``), so the caller's dataframe ends up with the question,
    period and reference columns moved to the front. Any replacement made by
    `replace_values_index_based` is presumably applied to `df` in place as
    well, since its return value is not used here.

    Parameters
    ----------
    df : pd.DataFrame
        Original dataframe, can be with or without constrain_marker column.
    period : str
        Column name containing date information.
    reference : str
        Column name containing reference.
    target : str
        Column name containing target value.
    question_no : str
        Column name containing question number.
    spp_form_id : str
        Column name containing form id.

    Returns
    -------
    pd.DataFrame
        Rows of `df` followed by the derived rows, with a fresh integer
        index. When no form type in `df` has a derivation rule, only the
        rows of `df` are returned.
    """
    derive_map = create_derive_map(df, spp_form_id)

    # Index by (form, question, period, reference) so the questions of one
    # form type can be selected with .loc; missing target values are treated
    # as 0 when summed.
    pre_derive_df = df.set_index(
        [spp_form_id, question_no, period, reference], verify_integrity=False
    )
    pre_derive_df = pre_derive_df[[target]].fillna(value=0)

    # Label the derived rows using the caller-supplied column names rather
    # than the literal "question_no" / "spp_form_id", so the derived rows
    # line up with the columns of `df` whatever those columns are called.
    derived_frames = [
        sum_sub_df(pre_derive_df.loc[form_type], derives["from"]).assign(
            **{question_no: derives["derive"], spp_form_id: form_type}
        )
        for form_type, derives in derive_map.items()
    ]

    # A set gives a real "are both questions present" test; checking a list
    # against the ndarray returned by .unique() broadcasts an elementwise
    # comparison instead of testing membership.
    unique_q_numbers = set(df[question_no].unique())

    df.set_index([question_no, period, reference], inplace=True)

    # This would replace 49 with 40, but might have been winsorised
    # independently
    if {40, 49} <= unique_q_numbers:
        replace_values_index_based(df, target, 49, ">", 40)
    elif {40, 90} <= unique_q_numbers:
        replace_values_index_based(df, target, 90, ">=", 40)
    df.reset_index(inplace=True)

    # pd.concat raises on an empty list, which happens when none of the
    # known form types is present in `df`.
    if not derived_frames:
        return df.reset_index(drop=True)

    derived_values = pd.concat(derived_frames)
    return pd.concat([df, derived_values]).reset_index(drop=True)


def create_derive_map(df: pd.DataFrame, spp_form_id: str):
    """
    Build the derived-question mapping for the form types found in `df`.

    Starts from the full table of derivation rules and keeps only the
    entries whose form id occurs (non-null) in the form id column, so that
    later look-ups by form type do not fail on forms that are absent.

    Parameters
    ----------
    df : pd.DataFrame
        Original dataframe
    spp_form_id : str
        Column name containing form id.

    Returns
    -------
    dict
        Mapping of form id to ``{"derive": <question>, "from": [<questions>]}``
        restricted to the form ids present in the dataframe.
    """
    all_form_rules = {
        13: {"derive": 40, "from": [46, 47]},
        14: {"derive": 40, "from": [42, 43]},
        15: {"derive": 46, "from": [40]},
        16: {"derive": 42, "from": [40]},
    }
    observed_forms = df[spp_form_id].dropna().unique()

    return {
        form_id: rule
        for form_id, rule in all_form_rules.items()
        if form_id in observed_forms
    }


def calculate_derived_outlier_weights(
    df: pd.DataFrame,
    period: str,
    reference: str,
    target: str,
    question_no: str,
    spp_form_id: str,
    outlier_weight: str,
    winsorised_target: str,
) -> pd.DataFrame:
    """
    Calculate new outlier weights for derived questions post winsorisation.

    The derived questions are recomputed from the winsorised target and the
    outlier weight of each derived row is set to
    ``winsorised_target / target``. If `df` has no ``constrain_marker``
    column the derived questions are first computed from `target` via
    `derive_questions`; if the column already exists that step is skipped.

    Notes
    -----
    `df` is modified in place: a ``default_o_weight`` column is added and
    missing values in `winsorised_target` are filled from `target` before
    anything else happens.

    Parameters
    ----------
    df : pd.DataFrame
        Original dataframe, can be with or without constrain_marker column.
    period : str
        Column name containing date information.
    reference : str
        Column name containing reference.
    target : str
        Column name containing target value.
    question_no : str
        Column name containing question number.
    spp_form_id : str
        Column name containing form id.
    outlier_weight : str
        Column name containing outlier weights from winsorisation.
    winsorised_target : str
        Column name for winsorised target variable.

    Returns
    -------
    pd.DataFrame
        Dataframe with updated outlier weights calculated for derived
        questions, sorted by reference, period, question number and form
        id. Two boolean columns are added: ``default_o_weight`` (winsorised
        value was missing on input) and ``post_wins_marker`` (winsorised
        value was filled in from the post-winsorisation derivation).
    """
    # Remember which rows arrived without a winsorised value.
    default_o_weight_bool = df[winsorised_target].isna()
    df["default_o_weight"] = default_o_weight_bool

    # Assuming default value of o-weight is 1: a missing winsorised value is
    # taken to be the unwinsorised target value.
    df.loc[default_o_weight_bool, winsorised_target] = df.loc[
        default_o_weight_bool, target
    ]

    if "constrain_marker" not in df.columns:
        # Handling case where derived Q's not been
        # calculated pre-winsorisation
        df_pre_winsorised = derive_questions(
            df,
            period,
            reference,
            target,
            question_no,
            spp_form_id,
        )
    else:
        # Skipping calculating derived Q's
        # NOTE(review): the copy is taken after the in-place changes above,
        # so it already carries default_o_weight and the filled values.
        df_pre_winsorised = df.copy()

    # Derive the same questions again, this time from the winsorised values.
    post_win_derived = derive_questions(
        df,
        period,
        reference,
        winsorised_target,
        question_no,
        spp_form_id,
    )

    # Keep only rows that carry a constrain marker (presumably the derived
    # rows - confirm against sum_sub_df) and only the key columns plus the
    # winsorised value.
    post_win_derived = post_win_derived.loc[
        post_win_derived["constrain_marker"].notna()
    ]
    post_win_derived = post_win_derived[
        [period, reference, question_no, spp_form_id, winsorised_target]
    ]

    # Give both frames the same index so values can be matched row by row.
    df_pre_winsorised.set_index(
        [spp_form_id, question_no, period, reference], inplace=True
    )
    post_win_derived.set_index(
        [spp_form_id, question_no, period, reference], inplace=True
    )

    # Rows still lacking a winsorised value take it from the
    # post-winsorisation derivation; the mask is kept as post_wins_marker.
    # NOTE(review): the mask is built on df_pre_winsorised but also used to
    # index post_win_derived, so this relies on pandas aligning the boolean
    # Series to post_win_derived's index - confirm all keys are present.
    updated_o_weight_bool = df_pre_winsorised[winsorised_target].isna()
    df_pre_winsorised.loc[
        updated_o_weight_bool, winsorised_target
    ] = post_win_derived.loc[updated_o_weight_bool, winsorised_target]
    df_pre_winsorised["post_wins_marker"] = updated_o_weight_bool

    df_pre_winsorised.reset_index(inplace=True)
    # Outlier weight of a marked row is the ratio winsorised / original.
    # NOTE(review): no guard for a target of 0 - confirm this cannot occur
    # or decide what the weight should be in that case.
    df_pre_winsorised.loc[
        df_pre_winsorised["constrain_marker"].notna(), outlier_weight
    ] = (df_pre_winsorised[winsorised_target] / df_pre_winsorised[target])
    df_pre_winsorised.sort_values(
        by=[reference, period, question_no, spp_form_id], inplace=True
    )

    return df_pre_winsorised
6 changes: 6 additions & 0 deletions tests/data/winsorisation/derived-questions-winsor-missing.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
reference,period,aux,sampled,target_variable,new_target_variable,outlier_weight,question_no,spp_form_id,default_o_weight,constrain_marker,post_wins_marker
101,202401,10,0,12,12,,46,13,True,,False
101,202401,23,1,20,20,1,47,13,False,,False
101,202401,41,1,20,20,1,48,13,False,,False
101,202401,53,1,40,40,1,42,13,False,,False
101,202401,,,32,32,1,40,13,,"sum[46, 47]",True
24 changes: 24 additions & 0 deletions tests/data/winsorisation/derived-questions-winsor.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
reference,period,aux,sampled,target_variable,new_target_variable,outlier_weight,question_no,spp_form_id,default_o_weight,constrain_marker,post_wins_marker
101,202401,10,0,12,12,1,46,13,False,,False
101,202401,23,1,20,20,1,47,13,False,,False
101,202401,41,1,20,20,1,48,13,False,,False
101,202401,53,1,40,40,1,42,13,False,,False
101,202401,,,32,32,1,40,13,,"sum[46, 47]",True

102,202401,12,0,10,10,1,43,13,False,,False
102,202401,50,1,60,56,0.9,46,13,False,,False
102,202401,40,1,40,34,0.9,47,13,False,,False
102,202401,40,1,40,34,0.9,42,13,False,,False
102,202401,,,100,90,0.9,40,13,,"sum[46, 47]",True

103,202401,12,0,100,100,1,42,14,False,,False
103,202401,50,1,800,640,0.8,43,14,False,,False
103,202401,,,900,740,0.82222,40,14,,"sum[42, 43]",True

104,202402,2,0,400,360,0.9,42,14,False,,False
104,202402,3,1,800,640,0.8,43,14,False,,False
104,202402,,,1200,1000,0.83333,40,14,,"sum[42, 43]",True

105,202402,2,0,100,100,1,42,14,False,,False
105,202402,3,1,200,200,1,43,14,False,,False
105,202402,,,300,300,1,40,14,,"sum[42, 43]",True
83 changes: 82 additions & 1 deletion tests/test_constrains.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
import pandas as pd
from pandas.testing import assert_frame_equal

from mbs_results.constrains import replace_values_index_based, sum_sub_df
from mbs_results.constrains import (
calculate_derived_outlier_weights,
replace_values_index_based,
sum_sub_df,
)


class TestConstrains:
Expand Down Expand Up @@ -46,3 +50,80 @@ def test_sum_sub_df_46_47(self):
)

assert_frame_equal(actual_ouput, expected_output)

def test_calculate_derived_outlier_weights(self):
    """Derived q40 rows and their outlier weights are rebuilt from the inputs."""
    expected = pd.read_csv(
        "tests/data/winsorisation/derived-questions-winsor.csv",
        index_col=False,
    )
    for value_col in ("target_variable", "new_target_variable"):
        expected[value_col] = expected[value_col].astype(float)

    # The input is the expected data without the derived q40 rows and
    # without the columns the function is supposed to add.
    derived_rows = expected[expected["question_no"] == 40].index
    added_columns = ["default_o_weight", "constrain_marker", "post_wins_marker"]
    df_input = expected.drop(derived_rows)
    df_input.drop(columns=added_columns, inplace=True)

    actual = calculate_derived_outlier_weights(
        df_input,
        period="period",
        reference="reference",
        target="target_variable",
        question_no="question_no",
        spp_form_id="spp_form_id",
        outlier_weight="outlier_weight",
        winsorised_target="new_target_variable",
    )

    # Compare on a common column order and row order.
    sort_keys = ["reference", "period", "question_no", "spp_form_id"]
    actual = actual[expected.columns]
    actual = actual.sort_values(by=sort_keys).reset_index(drop=True)
    expected = expected.sort_values(by=sort_keys).reset_index(drop=True)

    assert_frame_equal(expected, actual)

def test_calculate_derived_outlier_weights_missing(self):
    """A missing winsorised value on input is defaulted before deriving q40."""
    expected = pd.read_csv(
        "tests/data/winsorisation/derived-questions-winsor-missing.csv",
        index_col=False,
    )
    for value_col in ("target_variable", "new_target_variable"):
        expected[value_col] = expected[value_col].astype(float)

    # The input is the expected data without the derived q40 rows and
    # without the columns the function is supposed to add.
    derived_rows = expected[expected["question_no"] == 40].index
    added_columns = ["default_o_weight", "constrain_marker", "post_wins_marker"]
    df_input = expected.drop(derived_rows)
    df_input.drop(columns=added_columns, inplace=True)

    # Blank out one winsorised value in the input only; the expected data
    # still holds the value the function should restore.
    row_to_blank = (
        (df_input["reference"] == 101)
        & (df_input["period"] == 202401)
        & (df_input["question_no"] == 46)
        & (df_input["spp_form_id"] == 13)
    )
    df_input.loc[row_to_blank, "new_target_variable"] = None

    actual = calculate_derived_outlier_weights(
        df_input,
        period="period",
        reference="reference",
        target="target_variable",
        question_no="question_no",
        spp_form_id="spp_form_id",
        outlier_weight="outlier_weight",
        winsorised_target="new_target_variable",
    )

    # Compare on a common column order and row order.
    sort_keys = ["reference", "period", "question_no", "spp_form_id"]
    actual = actual[expected.columns]
    actual = actual.sort_values(by=sort_keys).reset_index(drop=True)
    expected = expected.sort_values(by=sort_keys).reset_index(drop=True)

    assert_frame_equal(expected, actual)
Loading