ONSdigital · Jday7879 · Aug 9, 2024 · Aug 5, 2024 · Aug 7, 2024 · Aug 8, 2024
diff --git a/mbs_results/constrains.py b/mbs_results/constrains.py
@@ -82,7 +82,6 @@ def sum_sub_df(df: pd.DataFrame, derive_from: List[int]) -> pd.DataFrame:
         A dataframe with sums, constain marker, and columns from index which the
         sum was based on.
     """
-
     sums = sum(
         [df.loc[question_no] for question_no in derive_from if question_no in df.index]
     )
@@ -136,12 +135,7 @@ def constrain(
         Original dataframe with constrains.
     """
 
-    derive_map = {
-        13: {"derive": 40, "from": [46, 47]},
-        14: {"derive": 40, "from": [42, 43]},
-        15: {"derive": 46, "from": [40]},
-        16: {"derive": 42, "from": [40]},
-    }
+    derive_map = create_derive_map(df, spp_form_id)
 
     # pre_derive_df has dimenesions as index, columns the values to be used when derived
     pre_derive_df = df.set_index(
@@ -160,11 +154,220 @@ def constrain(
 
     df.set_index([question_no, period, reference], inplace=True)
 
-    replace_values_index_based(df, target_imputed, 49, ">", 40)
-    replace_values_index_based(df, target_imputed, 90, ">=", 40)
+    if 49 in df[question_no].unique():
+        replace_values_index_based(df, target_imputed, 49, ">", 40)
+    elif 90 in df[question_no].unique():
+        replace_values_index_based(df, target_imputed, 90, ">=", 40)
 
     df.reset_index(inplace=True)
 
     final_constrained = pd.concat([df, derived_values])
 
     return final_constrained
+
+
+def derive_questions(
+    df: pd.DataFrame,
+    period: str,
+    reference: str,
+    target: str,
+    question_no: str,
+    spp_form_id: str,
+) -> pd.DataFrame:
+    """
+    Append derived-question rows computed from a single target column.
+    Same approach as `constrain`, but with one target column instead of
+    two; the two functions could be merged in a later refactor.
+    Missing target values are counted as 0 when summing.
+    Column names are taken from the arguments, never hard-coded.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        Original dataframe holding the questions to derive from.
+        Re-indexed in place, so its column order may change.
+    period : str
+        Column name containing date information.
+    reference : str
+        Column name containing reference.
+    target : str
+        Column name containing target value.
+    question_no : str
+        Column name containing question number.
+    spp_form_id : str
+        Column name containing form id.
+
+    Returns
+    -------
+    pd.DataFrame
+        Original rows followed by the derived rows, on a fresh integer index.
+    """
+    derive_map = create_derive_map(df, spp_form_id)
+    pre_derive_df = df.set_index(
+        [spp_form_id, question_no, period, reference], verify_integrity=False
+    )
+    # Missing values must not turn a derived sum into NaN
+    pre_derive_df = pre_derive_df[[target]].fillna(value=0)
+
+    derived_values = pd.concat(
+        [
+            sum_sub_df(pre_derive_df.loc[form_type], derives["from"])
+            # Keyword unpacking so the caller-supplied column names are used
+            .assign(**{question_no: derives["derive"], spp_form_id: form_type})
+            for form_type, derives in derive_map.items()
+        ]
+    )
+    # Read question numbers before the column is moved into the index
+    unique_q_numbers = set(df[question_no].unique())
+
+    df.set_index([question_no, period, reference], inplace=True)
+
+    # This would replace 49 with 40, but might have been winsorised independently
+    # Set comparison: `[40, 49] in ndarray` is not a "both present" test
+    if {40, 49} <= unique_q_numbers:
+        replace_values_index_based(df, target, 49, ">", 40)
+    elif {90, 40} <= unique_q_numbers:
+        replace_values_index_based(df, target, 90, ">=", 40)
+    df.reset_index(inplace=True)
+
+    final_constrained = pd.concat([df, derived_values]).reset_index(drop=True)
+    return final_constrained
+
+
+def create_derive_map(df: pd.DataFrame, spp_form_id: str):
+    """
+    Build the derived-question mapping for the form types found in `df`.
+    Entries whose form id never occurs in the `spp_form_id` column are
+    left out, so callers only loop over form types that have data.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        Original dataframe
+    spp_form_id : str
+        Column name containing form id.
+
+    Returns
+    -------
+    dict
+        Form id -> {"derive": question to derive, "from": questions summed},
+        restricted to the form ids present in the dataframe.
+    """
+
+    full_map = {
+        13: {"derive": 40, "from": [46, 47]},
+        14: {"derive": 40, "from": [42, 43]},
+        15: {"derive": 46, "from": [40]},
+        16: {"derive": 42, "from": [40]},
+    }
+    # Null form ids are ignored when deciding which entries to keep
+    observed_ids = df[spp_form_id].dropna().unique()
+
+    return {
+        form_id: rule for form_id, rule in full_map.items() if form_id in observed_ids
+    }
+
+
+def calculate_derived_outlier_weights(
+    df: pd.DataFrame,
+    period: str,
+    reference: str,
+    target: str,
+    question_no: str,
+    spp_form_id: str,
+    outlier_weight: str,
+    winsorised_target: str,
+) -> pd.DataFrame:
+    """
+    Recalculate outlier weights for derived questions after winsorisation.
+    When `constrain_marker` is not a column of `df`, derived questions are
+    first computed from `target`; otherwise that step is skipped.
+    NOTE: `df` is modified in place (a `default_o_weight` column is added).
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        Original dataframe, with or without a constrain_marker column.
+    period : str
+        Column name containing date information.
+    reference : str
+        Column name containing reference.
+    target : str
+        Column name containing target value.
+    question_no : str
+        Column name containing question number.
+    spp_form_id : str
+        Column name containing form id.
+    outlier_weight : str
+        column name containing outlier weights from winsorisation
+    winsorised_target : str
+        column name for winsorised target variable
+
+    Returns
+    -------
+    pd.DataFrame
+        Dataframe with outlier weights recalculated for derived questions.
+        A bool column `post_wins_marker` identifies the rows whose
+        winsorised value was filled in by this function.
+    """
+    default_o_weight_bool = df[winsorised_target].isna()
+    df["default_o_weight"] = default_o_weight_bool
+
+    # No winsorised value: fall back to the target (assumes default o-weight is 1)
+    df.loc[default_o_weight_bool, winsorised_target] = df.loc[
+        default_o_weight_bool, target
+    ]
+
+    if "constrain_marker" not in df.columns:
+        # Handling case where derived Q's not been
+        # calculated pre-winsorisation
+        df_pre_winsorised = derive_questions(
+            df,
+            period,
+            reference,
+            target,
+            question_no,
+            spp_form_id,
+        )
+    else:
+        # Skipping calculating derived Q's
+        df_pre_winsorised = df.copy()
+
+    post_win_derived = derive_questions(
+        df,
+        period,
+        reference,
+        winsorised_target,
+        question_no,
+        spp_form_id,
+    )
+
+    post_win_derived = post_win_derived.loc[
+        post_win_derived["constrain_marker"].notna()
+    ]
+    post_win_derived = post_win_derived[
+        [period, reference, question_no, spp_form_id, winsorised_target]
+    ]
+
+    df_pre_winsorised.set_index(
+        [spp_form_id, question_no, period, reference], inplace=True
+    )
+    post_win_derived.set_index(
+        [spp_form_id, question_no, period, reference], inplace=True
+    )
+    # Rows with no winsorised value yet are filled from the post-winsorisation sums
+    updated_o_weight_bool = df_pre_winsorised[winsorised_target].isna()
+    df_pre_winsorised.loc[
+        updated_o_weight_bool, winsorised_target
+    ] = post_win_derived.loc[updated_o_weight_bool, winsorised_target]
+    df_pre_winsorised["post_wins_marker"] = updated_o_weight_bool
+    # Marked (derived) rows get weight = winsorised value / original value
+    df_pre_winsorised.reset_index(inplace=True)
+    df_pre_winsorised.loc[
+        df_pre_winsorised["constrain_marker"].notna(), outlier_weight
+    ] = (df_pre_winsorised[winsorised_target] / df_pre_winsorised[target])
+    df_pre_winsorised.sort_values(
+        by=[reference, period, question_no, spp_form_id], inplace=True
+    )
+
+    return df_pre_winsorised
diff --git a/tests/data/winsorisation/derived-questions-winsor-missing.csv b/tests/data/winsorisation/derived-questions-winsor-missing.csv
@@ -0,0 +1,6 @@
+reference,period,aux,sampled,target_variable,new_target_variable,outlier_weight,question_no,spp_form_id,default_o_weight,constrain_marker,post_wins_marker
+101,202401,10,0,12,12,,46,13,True,,False
+101,202401,23,1,20,20,1,47,13,False,,False
+101,202401,41,1,20,20,1,48,13,False,,False
+101,202401,53,1,40,40,1,42,13,False,,False
+101,202401,,,32,32,1,40,13,,"sum[46, 47]",True
diff --git a/tests/data/winsorisation/derived-questions-winsor.csv b/tests/data/winsorisation/derived-questions-winsor.csv
@@ -0,0 +1,24 @@
+reference,period,aux,sampled,target_variable,new_target_variable,outlier_weight,question_no,spp_form_id,default_o_weight,constrain_marker,post_wins_marker
+101,202401,10,0,12,12,1,46,13,False,,False
+101,202401,23,1,20,20,1,47,13,False,,False
+101,202401,41,1,20,20,1,48,13,False,,False
+101,202401,53,1,40,40,1,42,13,False,,False
+101,202401,,,32,32,1,40,13,,"sum[46, 47]",True
+
+102,202401,12,0,10,10,1,43,13,False,,False
+102,202401,50,1,60,56,0.9,46,13,False,,False
+102,202401,40,1,40,34,0.9,47,13,False,,False
+102,202401,40,1,40,34,0.9,42,13,False,,False
+102,202401,,,100,90,0.9,40,13,,"sum[46, 47]",True
+
+103,202401,12,0,100,100,1,42,14,False,,False
+103,202401,50,1,800,640,0.8,43,14,False,,False
+103,202401,,,900,740,0.82222,40,14,,"sum[42, 43]",True
+
+104,202402,2,0,400,360,0.9,42,14,False,,False
+104,202402,3,1,800,640,0.8,43,14,False,,False
+104,202402,,,1200,1000,0.83333,40,14,,"sum[42, 43]",True
+
+105,202402,2,0,100,100,1,42,14,False,,False
+105,202402,3,1,200,200,1,43,14,False,,False
+105,202402,,,300,300,1,40,14,,"sum[42, 43]",True
diff --git a/tests/test_constrains.py b/tests/test_constrains.py
@@ -1,7 +1,11 @@
 import pandas as pd
 from pandas.testing import assert_frame_equal
 
-from mbs_results.constrains import replace_values_index_based, sum_sub_df
+from mbs_results.constrains import (
+    calculate_derived_outlier_weights,
+    replace_values_index_based,
+    sum_sub_df,
+)
 
 
 class TestConstrains:
@@ -46,3 +50,80 @@ def test_sum_sub_df_46_47(self):
         )
 
         assert_frame_equal(actual_ouput, expected_output)
+
+    def test_calculate_derived_outlier_weights(self):
+        """Derived q40 rows and their outlier weights match the fixture."""
+        key_cols = ["reference", "period", "question_no", "spp_form_id"]
+        added_cols = ["default_o_weight", "constrain_marker", "post_wins_marker"]
+        expected = pd.read_csv(
+            "tests/data/winsorisation/derived-questions-winsor.csv",
+            index_col=False,
+        ).astype({"target_variable": float, "new_target_variable": float})
+
+        # Input: the fixture minus the q40 rows and the columns under test
+        is_derived = expected["question_no"] == 40
+        df_input = expected.drop(
+            index=expected.index[is_derived], columns=added_cols
+        )
+
+        actual = calculate_derived_outlier_weights(
+            df_input,
+            "period",
+            "reference",
+            "target_variable",
+            "question_no",
+            "spp_form_id",
+            "outlier_weight",
+            "new_target_variable",
+        )
+
+        # Compare in a canonical column and row order
+        actual = (
+            actual[expected.columns].sort_values(by=key_cols).reset_index(drop=True)
+        )
+        expected = expected.sort_values(by=key_cols).reset_index(drop=True)
+
+        assert_frame_equal(expected, actual)
+
+    def test_calculate_derived_outlier_weights_missing(self):
+        """A missing winsorised value falls back to the target value."""
+        key_cols = ["reference", "period", "question_no", "spp_form_id"]
+        added_cols = ["default_o_weight", "constrain_marker", "post_wins_marker"]
+        expected = pd.read_csv(
+            "tests/data/winsorisation/derived-questions-winsor-missing.csv",
+            index_col=False,
+        ).astype({"target_variable": float, "new_target_variable": float})
+
+        # Input: the fixture minus the q40 rows and the columns under test
+        is_derived = expected["question_no"] == 40
+        df_input = expected.drop(
+            index=expected.index[is_derived], columns=added_cols
+        )
+
+        # Blank one winsorised value; the fixture still holds the expected result
+        missing_row = (
+            (df_input["reference"] == 101)
+            & (df_input["period"] == 202401)
+            & (df_input["question_no"] == 46)
+            & (df_input["spp_form_id"] == 13)
+        )
+        df_input.loc[missing_row, "new_target_variable"] = None
+
+        actual = calculate_derived_outlier_weights(
+            df_input,
+            "period",
+            "reference",
+            "target_variable",
+            "question_no",
+            "spp_form_id",
+            "outlier_weight",
+            "new_target_variable",
+        )
+
+        # Compare in a canonical column and row order
+        actual = (
+            actual[expected.columns].sort_values(by=key_cols).reset_index(drop=True)
+        )
+        expected = expected.sort_values(by=key_cols).reset_index(drop=True)
+
+        assert_frame_equal(expected, actual)