Tidying up and adding period arguments to config

ONSdigital · Jul 19, 2024 · 2a08df4 · 2a08df4
1 parent b43b97b
commit 2a08df4
Show file tree

Hide file tree

Showing 10 changed files with 186 additions and 22 deletions.
diff --git a/config.json b/config.json
@@ -17,5 +17,7 @@
                 "frozenemployees" : "int",
                 "frozenturnover" : "float",
                 "cellnumber" : "int"},
-    "temporarily_remove_cols": []
+    "temporarily_remove_cols": [],
+    "current_period" : 202401,
+    "previous_period" : 202312
 }
diff --git a/mbs_results/aggregate_aweights_by_class.py b/mbs_results/aggregate_aweights_by_class.py
@@ -3,7 +3,6 @@
 
 def aggregate_aweights_by_class(
     dataframe: pd.DataFrame,
-    reference: str,
     imp_class: str,
     period: str,
     a_weight: str,
@@ -17,8 +16,6 @@ def aggregate_aweights_by_class(
     ----------
     dataframe : pd.DataFrame
         Reference dataframe with imp_class and a_weights
-    reference : str
-        name of column in dataframe containing reference variable
     imp_class : str
         name of column in dataframe containing imp_class variable
     period : str
@@ -36,6 +33,6 @@ def aggregate_aweights_by_class(
     """
     current_df = dataframe[dataframe[period] == current_period]
 
-    aggregate_df = current_df[[imp_class, a_weight]].drop_duplicates()
+    aggregate_df = current_df[[period, imp_class, a_weight]].drop_duplicates()
 
     return aggregate_df.reset_index(drop=True)
diff --git a/mbs_results/selective_editing.py b/mbs_results/selective_editing.py
@@ -0,0 +1,88 @@
+import pandas as pd
+
+
+def create_standardising_factor(
+    dataframe: pd.DataFrame,
+    reference: str,
+    period: str,
+    domain: str,
+    question_code: str,
+    predicted_value: str,
+    imputation_marker: str,
+    a_weight: str,
+    o_weight: str,
+    g_weight: str,
+    auxiliary_value: str,
+    previous_period: int,
+) -> pd.DataFrame:
+    """
+    Return the standardising factor per domain for questions 40 and 49.
+
+    The factor is predicted value * a_weight * o_weight * g_weight, summed
+    within each domain and question code and attached to every row of it.
+
+    Parameters
+    ----------
+    dataframe : pd.DataFrame
+        Reference dataframe with domain, a_weights, o_weights, and g_weights
+    reference : str
+        name of column in dataframe containing reference variable
+    period : str
+        name of column in dataframe containing period variable
+    domain : str
+        name of column in dataframe containing domain variable
+    question_code : str
+        name of column in dataframe containing question code variable
+    predicted_value : str
+        name of column in dataframe containing predicted value variable
+    imputation_marker : str
+        name of column in dataframe containing imputation marker variable
+    a_weight : str
+        name of column in dataframe containing a_weight variable
+    o_weight : str
+        name of column in dataframe containing o_weight variable
+    g_weight : str
+        name of column in dataframe containing g_weight variable
+    auxiliary_value : str
+        name of column in dataframe containing auxiliary value variable
+    previous_period : int
+        previous period to take the weights for estimation of standardising factor in
+        the format yyyymm
+
+    Returns
+    -------
+    pd.DataFrame
+        previous period rows for questions 40 and 49 with the standardising
+        factor summed by domain and question code, index reset.
+    """
+    previous_df = dataframe[dataframe[period] == previous_period]
+    # Copy so the new column is written to an independent frame, not a slice.
+    previous_df = previous_df[previous_df[question_code].isin([40, 49])].copy()
+
+    previous_df["standardising_factor"] = (
+        previous_df[predicted_value]
+        * previous_df[a_weight]
+        * previous_df[o_weight]
+        * previous_df[g_weight]
+    )
+
+    # Sum only the factor column per group; other columns need not be summable.
+    previous_df["standardising_factor"] = (
+        previous_df.groupby([domain, question_code])["standardising_factor"]
+        .transform("sum")
+        .astype(float)
+    )
+
+    output_df = previous_df[
+        [
+            period,
+            reference,
+            question_code,
+            "standardising_factor",
+            predicted_value,
+            imputation_marker,
+            auxiliary_value,
+        ]
+    ]
+
+    return output_df.reset_index(drop=True)
diff --git a/mbs_results/selective_editing_question.py b/mbs_results/selective_editing_question.py
@@ -1,7 +1,7 @@
 import pandas as pd
 
 
-def create_selective_editing_question(
+def create_standardising_factor(
     dataframe: pd.DataFrame,
     reference: str,
     period: str,
@@ -54,7 +54,7 @@ def create_selective_editing_question(
         each reference.
 
     """
-    previous_df = dataframe[dataframe[period] == previous_period]
+    previous_df = dataframe[(dataframe[period] == previous_period)]
     previous_df = previous_df[previous_df[question_code].isin([40, 49])]
 
     previous_df["standardising_factor"] = (

diff --git a/tests/data/estimation/aggregate_aweights_by_class_output.csv b/tests/data/estimation/aggregate_aweights_by_class_output.csv
@@ -1,3 +1,3 @@
-imp_class,a_weight
-2,2.0
-3,4.0
+period,imp_class,a_weight
+202402,2,2.0
+202402,3,4.0
diff --git a/tests/data/estimation/create_selective_editing_question_input.csv b/tests/data/estimation/create_selective_editing_question_input.csv
@@ -3,7 +3,7 @@ reference,period,domain,question_code,predicted_value,imputation_marker,a_weight
 10001,202402,0,40,604,R,5.0,1.0,2.0,39,
 10002,202312,6,49,233,FIR,6.0,1.0,1.0,15,
 10002,202401,6,49,432,FIR,1.0,3.0,2.0,45,2592.0
-10002,202402,9,49,,,2.0,2.0,2.0,73,
+10002,202402,6,49,,,2.0,2.0,2.0,73,
 10003,202401,9,49,150,C,4.0,5.0,1.0,21,3000.0
 10003,202402,9,49,837,R,3.0,7.0,4.0,74,
 10004,202401,15,20,111,FIC,1.0,4.0,6.0,11,
@@ -12,3 +12,4 @@ reference,period,domain,question_code,predicted_value,imputation_marker,a_weight
 10006,202401,19,40,336,C,2.0,1.0,4.0,14,2688.0
 10006,202401,19,49,461,C,3.0,1.0,4.0,58,9084.0
 10007,202401,19,49,222,BIR,4.0,2.0,2.0,67,9084.0
+10008,202401,17,40,,,4.0,1.0,5.0,48,12012.0
diff --git a/tests/data/estimation/create_standardising_factor_data.csv b/tests/data/estimation/create_standardising_factor_data.csv
@@ -0,0 +1,15 @@
+reference,period,domain,question_code,predicted_value,imputation_marker,a_weight,o_weight,g_weight,auxiliary_value,standardising_factor
+10001,202401,0,40,589,R,2.0,4.0,1.0,23,4712.0
+10001,202402,0,40,604,R,5.0,1.0,2.0,39,
+10002,202312,6,49,233,FIR,6.0,1.0,1.0,15,
+10002,202401,6,49,432,FIR,1.0,3.0,2.0,45,2592.0
+10002,202402,6,49,,,2.0,2.0,2.0,73,
+10003,202401,9,49,150,C,4.0,5.0,1.0,21,3000.0
+10003,202402,9,49,837,R,3.0,7.0,4.0,74,
+10004,202401,15,20,111,FIC,1.0,4.0,6.0,11,
+10005,202401,17,40,1001,FIR,2.0,2.0,3.0,95,12012.0
+10005,202402,19,40,532,FIR,7.0,3.0,3.0,29,
+10006,202401,19,40,336,C,2.0,1.0,4.0,14,2688.0
+10006,202401,19,49,461,C,3.0,1.0,4.0,58,9084.0
+10007,202401,19,49,222,BIR,4.0,2.0,2.0,67,9084.0
+10008,202401,17,40,,,4.0,1.0,5.0,48,12012.0
diff --git a/tests/test_aggregate_aweights_by_class.py b/tests/test_aggregate_aweights_by_class.py
@@ -36,7 +36,6 @@ def test_aggregate_aweights_by_class(
 
         actual_output = aggregate_aweights_by_class(
             input_data,
-            "reference",
             "imp_class",
             "period",
             "a_weight",

diff --git a/tests/test_create_selective_editing_question.py b/tests/test_create_selective_editing_question.py
@@ -4,7 +4,7 @@
 import pytest
 from pandas.testing import assert_frame_equal
 
-from mbs_results.selective_editing_question import create_selective_editing_question
+from mbs_results.selective_editing import create_standardising_factor
 
 
 @pytest.fixture(scope="class")
@@ -13,19 +13,19 @@ def filepath():
 
 
 @pytest.fixture(scope="class")
-def create_selective_editing_question_input(filepath):
+def create_standardising_factor_data(filepath):
     return pd.read_csv(
-        filepath / "create_selective_editing_question_input.csv", index_col=False
+        filepath / "create_standardising_factor_data.csv", index_col=False
     )
 
 
-class TestCreateSelectiveEditingQuestion:
-    def test_create_selective_editing_question(
+class TestCreateStandardisingFactor:
+    def test_create_standardising_factor(
         self,
-        create_selective_editing_question_input,
+        create_standardising_factor_data,
     ):
-        expected_output = create_selective_editing_question_input[
-            create_selective_editing_question_input["standardising_factor"].notna()
+        expected_output = create_standardising_factor_data[
+            create_standardising_factor_data["standardising_factor"].notna()
         ]
         expected_output = expected_output[
             [
@@ -40,11 +40,11 @@ def test_create_selective_editing_question(
         ]
         expected_output = expected_output.reset_index(drop=True)
 
-        input_data = create_selective_editing_question_input.drop(
+        input_data = create_standardising_factor_data.drop(
             columns="standardising_factor"
         )
 
-        actual_output = create_selective_editing_question(
+        actual_output = create_standardising_factor(
             input_data,
             "reference",
             "period",

diff --git a/tests/test_create_standardising_factor.py b/tests/test_create_standardising_factor.py
@@ -0,0 +1,62 @@
+from pathlib import Path
+
+import pandas as pd
+import pytest
+from pandas.testing import assert_frame_equal
+
+from mbs_results.selective_editing import create_standardising_factor
+
+
+@pytest.fixture(scope="class")
+def filepath():  # directory of the estimation test CSVs (relative path)
+    return Path("tests/data/estimation")
+
+
+@pytest.fixture(scope="class")
+def create_standardising_factor_data(filepath):
+    """Load the fixture CSV (inputs plus expected standardising_factor)."""
+    csv_path = filepath / "create_standardising_factor_data.csv"
+    return pd.read_csv(csv_path, index_col=False)
+
+
+class TestCreateStandardisingFactor:
+    """Checks create_standardising_factor against the CSV fixture."""
+
+    def test_create_standardising_factor(
+        self,
+        create_standardising_factor_data,
+    ):
+        """Output must equal the fixture rows that carry an expected factor."""
+        fixture = create_standardising_factor_data
+        output_columns = [
+            "period",
+            "reference",
+            "question_code",
+            "standardising_factor",
+            "predicted_value",
+            "imputation_marker",
+            "auxiliary_value",
+        ]
+
+        # Only rows with a filled-in standardising_factor are expected back.
+        has_factor = fixture["standardising_factor"].notna()
+        expected = fixture.loc[has_factor, output_columns]
+        expected = expected.reset_index(drop=True)
+
+        # The expected column must not leak into the function input.
+        observed = create_standardising_factor(
+            fixture.drop(columns="standardising_factor"),
+            "reference",
+            "period",
+            "domain",
+            "question_code",
+            "predicted_value",
+            "imputation_marker",
+            "a_weight",
+            "o_weight",
+            "g_weight",
+            "auxiliary_value",
+            202401,
+        )
+
+        assert_frame_equal(observed, expected)