Merge pull request #63 from ONSdigital/465-imputation-links

465 imputation links
ONSdigital · Aug 19, 2024 · 60efe2a · 60efe2a
2 parents 36b8fd4 + 1badbdf
commit 60efe2a
Show file tree

Hide file tree

Showing 5 changed files with 331 additions and 0 deletions.
diff --git a/mbs_results/pivot_imputation_value.py b/mbs_results/pivot_imputation_value.py
@@ -0,0 +1,145 @@
+import pandas as pd
+
+
+def merge_counts(
+    input_df: pd.DataFrame,
+    count_df: pd.DataFrame,
+    input_cell: str,
+    count_cell: str,
+    input_date: str,
+    count_date: str,
+    identifier: str,
+) -> pd.DataFrame:
+    """
+    Returns input data with the columns of count_df merged on.
+
+    Parameters
+    ----------
+    input_df : pd.DataFrame
+        Reference dataframe containing at least the identifier, cell and date
+        columns named by the arguments below
+    count_df : pd.DataFrame
+        DataFrame with a cell column, a date column and the count columns to
+        attach (e.g. f_count, b_count)
+    input_cell : str
+        name of column in input_df dataframe containing cell variable
+    count_cell : str
+        name of column in count_df dataframe containing cell variable
+    input_date : str
+        name of column in input_df dataframe containing date variable
+    count_date : str
+        name of column in count_df dataframe containing date variable
+    identifier : str
+        name of column in input_df containing identifier variable; the column
+        is cast to int in the output
+
+    Returns
+    -------
+    Dataframe resulting from the left-join of input_df and count_df on the cell and
+    date columns. The key columns of count_df are removed from the result; the
+    key columns of input_df are always kept.
+    """
+    df_merge = pd.merge(
+        input_df,
+        count_df,
+        how="left",
+        left_on=[input_cell, input_date],
+        right_on=[count_cell, count_date],
+    ).astype({identifier: "int"})
+
+    # When a key has the same name in both frames pandas keeps one shared
+    # column, so dropping it would delete the caller's own cell/date column.
+    # Only the keys that exist solely on the count side are redundant.
+    redundant_keys = [
+        key
+        for key in dict.fromkeys([count_cell, count_date])
+        if key not in input_df.columns
+    ]
+
+    return df_merge.drop(columns=redundant_keys)
+
+
+def pivot_imputation_value(
+    df: pd.DataFrame,
+    identifier: str,
+    groups: list,
+    link_columns: list,
+    count_columns: list,
+    imputed_value: str,
+    selected_periods: list = None,
+) -> pd.DataFrame:
+
+    """
+    Returning dataframe containing imputation_value, filtered by date, pivoted by
+    imputation type and grouped by the group columns and imputation type.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        Reference dataframe containing links, count values, and imputed values
+        for every combination of the group columns
+    identifier : str
+        name of column in dataframe containing identifier variable; currently
+        not used by the function and kept for interface compatibility
+    groups : list
+        names of the columns to group by; the first entry is the column that
+        selected_periods is matched against
+    link_columns : list
+        names of the link columns, in the order forward, backward, construction;
+        they are labelled "F", "B" and "C" in the output
+    count_columns : list
+        names of the count columns, in the same order as link_columns
+    imputed_value: str
+        name of column in dataframe containing imputed_value variable
+    selected_periods: list
+        list containing periods to include in output; all periods are kept
+        when None
+
+    Returns
+    -------
+    dataframe filtered by date with one row per group and link type, holding the
+    columns groups + [imputation_link, link_type, count, imputed_value], where
+    imputed_value is the sum over all rows of the group, sorted by the group
+    columns and then by link type in the order F, B, C.
+
+    """
+    if selected_periods is not None:
+        # isin avoids building a query string, which fails for column names
+        # that are not valid identifiers and for non-literal period values.
+        df = df[df[groups[0]].isin(selected_periods)]
+
+    link_labels = ["F", "B", "C"]
+    keys = groups + ["link_type"]
+
+    links_df = df.melt(
+        id_vars=groups + [imputed_value],
+        value_vars=link_columns,
+        var_name="link_type",
+        value_name="imputation_link",
+    )
+    links_df["link_type"] = links_df["link_type"].map(
+        dict(zip(link_columns, link_labels))
+    )
+
+    # Aggregate before merging: joining the un-aggregated frames is a
+    # many-to-many join, and de-duplicating its result afterwards silently
+    # discards records that happen to share the same imputed value.
+    links_df = links_df.groupby(keys, as_index=False).agg(
+        {imputed_value: "sum", "imputation_link": "first"}
+    )
+
+    counts_df = df.melt(
+        id_vars=groups,
+        value_vars=count_columns,
+        var_name="link_type",
+        value_name="count",
+    )
+    counts_df["link_type"] = counts_df["link_type"].map(
+        dict(zip(count_columns, link_labels))
+    )
+    counts_df = counts_df.groupby(keys, as_index=False).agg({"count": "first"})
+
+    # Both sides now hold at most one row per key, so this is a one-to-one merge.
+    merged_df = pd.merge(links_df, counts_df, how="outer", on=keys)
+
+    # Temporary helper column so link types sort as F, B, C, not alphabetically.
+    sorting_order = {"F": 1, "B": 2, "C": 3}
+    merged_df["sort_column"] = merged_df["link_type"].map(sorting_order)
+
+    merged_df = merged_df.sort_values(groups + ["sort_column"])
+    merged_df = merged_df.drop(columns="sort_column").reset_index(drop=True)
+
+    merged_df = merged_df[
+        groups
+        + [
+            "imputation_link",
+            "link_type",
+            "count",
+            imputed_value,
+        ]
+    ]
+
+    return merged_df
diff --git a/tests/data/count_data_input.csv b/tests/data/count_data_input.csv
@@ -0,0 +1,19 @@
+group,period,f_count,b_count,c_count
+100,202001,2,3,1
+100,202002,1,0,2
+100,202003,0,2,3
+101,202001,5,2,2
+101,202002,2,5,3
+101,202003,3,3,4
+102,202001,6,4,3
+102,202002,1,4,4
+102,202003,0,1,5
+103,202001,7,8,4
+103,202002,0,2,5
+103,202003,2,1,6
+104,202001,3,5,5
+104,202002,3,6,6
+104,202003,2,3,7
+105,202001,3,7,6
+105,202002,4,2,7
+105,202003,3,5,8
diff --git a/tests/data/merge_counts_output.csv b/tests/data/merge_counts_output.csv
@@ -0,0 +1,34 @@
+identifier,date,sic,cell,forward,backward,construction,question,imputed_value,f_count,b_count,c_count
+70001,202001,12,100,1.0,1.964796824,107.48,40,500,2,3,1
+70001,202002,12,100,0.5089584770000001,0.832579579,54.70285714,40,,1,0,2
+70001,202002,12,100,0.5016253708279402,0.812637487,36.63728374,49,100,1,0,2
+70001,202003,12,100,1.201086389,0.705052735,98.26519337,49,200,0,2,3
+70002,202001,12,100,1.0,1.964796824,107.48,40,150,2,3,1
+70002,202001,12,100,1.326666382,0.599374673,92.06463773,49,250,2,3,1
+70002,202002,12,100,0.5089584770000001,0.832579579,54.70285714,40,650,1,0,2
+70002,202003,12,100,0.529851921,1.09453303,106.79005520000001,40,800,0,2,3
+70003,202001,12,101,1.0,1.964796824,107.48,40,30,5,2,2
+70003,202002,12,101,0.5089584770000001,0.832579579,54.70285714,40,50,2,5,3
+70003,202003,12,101,1.201086389,0.705052735,98.26519337,49,170,3,3,4
+70004,202001,12,101,1.0,1.964796824,107.48,40,,5,2,2
+70004,202002,12,101,0.5089584770000001,0.832579579,54.70285714,40,310,2,5,3
+70004,202003,12,101,0.529851921,1.09453303,106.79005520000001,40,350,3,3,4
+70004,202003,12,101,1.201086389,0.705052735,98.26519337,49,170,3,3,4
+70005,202001,12,102,1.326666382,0.599374673,92.06463773,49,750,6,4,3
+70005,202002,12,102,0.5016253708279402,0.812637487,36.63728374,49,940,1,4,4
+70005,202003,12,102,1.201086389,0.705052735,98.26519337,49,520,0,1,5
+70006,202001,13,103,0.5089584770000001,0.832579579,54.70285714,40,350,7,8,4
+70006,202002,13,103,1.201086389,0.705052735,98.26519337,49,160,0,2,5
+70006,202003,13,103,1.418333623,0.642249527,93.18857143,49,380,2,1,6
+70007,202001,13,103,0.5089584770000001,0.832579579,54.70285714,40,290,7,8,4
+70007,202001,13,103,1.635246638,1.725354675,143.7564782,49,,7,8,4
+70007,202002,13,103,0.529851921,1.09453303,106.79005520000001,40,590,0,2,5
+70007,202002,13,103,1.201086389,0.705052735,98.26519337,49,710,0,2,5
+70007,202003,13,103,0.913631634,1.0,70.24,40,280,2,1,6
+70008,202001,13,104,0.5089584770000001,0.832579579,54.70285714,40,,3,5,5
+70008,202002,13,104,1.201086389,0.705052735,98.26519337,49,660,3,6,6
+70008,202003,13,104,1.418333623,0.642249527,93.18857143,49,220,2,3,7
+70009,202001,13,104,1.635246638,1.725354675,143.7564782,49,880,3,5,5
+70009,202002,13,104,1.201086389,0.705052735,98.26519337,49,610,3,6,6
+70009,202003,13,104,1.418333623,0.642249527,93.18857143,49,90,2,3,7
+70010,202001,13,105,0.5089584770000001,0.832579579,54.70285714,40,80,3,7,6
diff --git a/tests/data/pivot_imputation_value_output.csv b/tests/data/pivot_imputation_value_output.csv
@@ -0,0 +1,49 @@
+date,sic,cell,question,imputation_link,link_type,count,imputed_value
+202001,12,100,40,1.0,F,2,650
+202001,12,100,40,1.964796824,B,3,650
+202001,12,100,40,107.48,C,1,650
+202001,12,100,49,1.326666382,F,2,250
+202001,12,100,49,0.599374673,B,3,250
+202001,12,100,49,92.06463773,C,1,250
+202001,12,101,40,1.0,F,5,30
+202001,12,101,40,1.964796824,B,2,30
+202001,12,101,40,107.48,C,2,30
+202001,12,102,49,1.326666382,F,6,750
+202001,12,102,49,0.599374673,B,4,750
+202001,12,102,49,92.06463773,C,3,750
+202001,13,103,40,0.5089584770000001,F,7,640
+202001,13,103,40,0.832579579,B,8,640
+202001,13,103,40,54.70285714,C,4,640
+202001,13,103,49,1.635246638,F,7,0.0
+202001,13,103,49,1.725354675,B,8,0.0
+202001,13,103,49,143.7564782,C,4,0.0
+202001,13,104,40,0.5089584770000001,F,3,0.0
+202001,13,104,40,0.832579579,B,5,0.0
+202001,13,104,40,54.70285714,C,5,0.0
+202001,13,104,49,1.635246638,F,3,880
+202001,13,104,49,1.725354675,B,5,880
+202001,13,104,49,143.7564782,C,5,880
+202001,13,105,40,0.5089584770000001,F,3,80
+202001,13,105,40,0.832579579,B,7,80
+202001,13,105,40,54.70285714,C,6,80
+202002,12,100,40,0.5089584770000001,F,1,650
+202002,12,100,40,0.832579579,B,0,650
+202002,12,100,40,54.70285714,C,2,650
+202002,12,100,49,0.5016253708279402,F,1,100
+202002,12,100,49,0.812637487,B,0,100
+202002,12,100,49,36.63728374,C,2,100
+202002,12,101,40,0.5089584770000001,F,2,360
+202002,12,101,40,0.832579579,B,5,360
+202002,12,101,40,54.70285714,C,3,360
+202002,12,102,49,0.5016253708279402,F,1,940
+202002,12,102,49,0.812637487,B,4,940
+202002,12,102,49,36.63728374,C,4,940
+202002,13,103,40,0.529851921,F,0,590
+202002,13,103,40,1.09453303,B,2,590
+202002,13,103,40,106.79005520000001,C,5,590
+202002,13,103,49,1.201086389,F,0,870
+202002,13,103,49,0.705052735,B,2,870
+202002,13,103,49,98.26519337,C,5,870
+202002,13,104,49,1.201086389,F,3,1270
+202002,13,104,49,0.705052735,B,6,1270
+202002,13,104,49,98.26519337,C,6,1270
diff --git a/tests/test_pivot_imputation_value.py b/tests/test_pivot_imputation_value.py
@@ -0,0 +1,84 @@
+from pathlib import Path
+
+import pandas as pd
+import pytest
+from pandas.testing import assert_frame_equal
+
+from mbs_results.pivot_imputation_value import merge_counts, pivot_imputation_value
+
+
+@pytest.fixture(scope="class")
+def filepath():
+    """Return the directory containing this test module.
+
+    Anchoring on __file__ rather than the relative path "tests" lets the data
+    files be found whatever directory pytest is invoked from.
+    """
+    return Path(__file__).resolve().parent
+
+
+@pytest.fixture(scope="class")
+def count_data_input(filepath):
+    """Load the count table from data/count_data_input.csv as a DataFrame."""
+    csv_path = filepath / "data" / "count_data_input.csv"
+    return pd.read_csv(csv_path, index_col=False)
+
+
+@pytest.fixture(scope="class")
+def merge_counts_output(filepath):
+    """Load the expected merge result from data/merge_counts_output.csv."""
+    csv_path = filepath / "data" / "merge_counts_output.csv"
+    return pd.read_csv(csv_path, index_col=False)
+
+
+@pytest.fixture(scope="class")
+def pivot_imputation_value_output(filepath):
+    """Load the expected pivot from data/pivot_imputation_value_output.csv."""
+    csv_path = filepath / "data" / "pivot_imputation_value_output.csv"
+    return pd.read_csv(csv_path, index_col=False)
+
+
+class TestMergeCounts:
+    """Checks merge_counts against the CSV fixtures."""
+
+    def test_merge_counts(self, count_data_input, merge_counts_output):
+        """Merging the counts back onto the stripped frame restores it."""
+        count_names = ["f_count", "b_count", "c_count"]
+        without_counts = merge_counts_output.drop(columns=count_names)
+
+        result = merge_counts(
+            input_df=without_counts,
+            count_df=count_data_input,
+            input_cell="cell",
+            count_cell="group",
+            input_date="date",
+            count_date="period",
+            identifier="identifier",
+        )
+
+        assert_frame_equal(result, merge_counts_output)
+
+
+class TestPivotImputationValue:
+    """Checks pivot_imputation_value with and without a period filter."""
+
+    def test_pivot_imputation_value_filter(
+        self, pivot_imputation_value_output, merge_counts_output
+    ):
+        """Only the requested period appears when selected_periods is given."""
+        pivot_input = merge_counts_output.drop(columns=["identifier"])
+
+        result = pivot_imputation_value(
+            df=pivot_input,
+            identifier="identifier",
+            groups=["date", "sic", "cell", "question"],
+            link_columns=["forward", "backward", "construction"],
+            count_columns=["f_count", "b_count", "c_count"],
+            imputed_value="imputed_value",
+            selected_periods=[202001],
+        )
+
+        expected = pivot_imputation_value_output.query("date == 202001")
+        assert_frame_equal(result, expected)
+
+    def test_pivot_imputation_value_no_filter(
+        self, pivot_imputation_value_output, merge_counts_output
+    ):
+        """Every period present in the input is pivoted when no filter is given."""
+        # The expected CSV only covers 202001 and 202002, so the input is
+        # restricted to those periods before the call.
+        pivot_input = merge_counts_output.drop(columns=["identifier"]).query(
+            "date in [202001, 202002]"
+        )
+
+        result = pivot_imputation_value(
+            df=pivot_input,
+            identifier="identifier",
+            groups=["date", "sic", "cell", "question"],
+            link_columns=["forward", "backward", "construction"],
+            count_columns=["f_count", "b_count", "c_count"],
+            imputed_value="imputed_value",
+        )
+
+        assert_frame_equal(result, pivot_imputation_value_output)