331 apply imputation link to target (#19)

* add test data * some refactoring before function * added construction case to test data * refactored into functions * add test for higher level function
ONSdigital · Jun 3, 2024 · d2e8a38 · d2e8a38
1 parent a27bb91
commit d2e8a38
Show file tree

Hide file tree

Showing 7 changed files with 230 additions and 0 deletions.
diff --git a/src/apply_imputation_link.py b/src/apply_imputation_link.py
@@ -0,0 +1,161 @@
+def create_and_merge_imputation_values(
+    df,
+    imputation_class,
+    reference,
+    period,
+    marker,
+    combined_imputation,
+    target,
+    cumulative_forward_link,
+    cumulative_backward_link,
+    auxiliary,
+    construction_link,
+    imputation_types=("c", "fir", "bir", "fic"),
+):
+    """
+    Loop through different imputation types and merge the results according
+    to an imputation marker column.
+
+    The input is sorted into a new dataframe before any column is added, so
+    the dataframe passed in by the caller is neither reordered nor extended.
+
+    Parameters
+    ----------
+    df : pandas.DataFrame
+    imputation_class : str
+        column name for the variable that defines the imputation class
+    reference : str
+        column name for the reference
+    period : str
+        column name for the period
+    marker : str
+        column name containing a marker to indicate the type of imputation required
+    combined_imputation : str
+        column name for the combined imputation types according to the imputation marker
+    target : str
+        column name for the target variable for imputation
+    cumulative_forward_link : str
+        column name for the cumulative forward imputation link
+    cumulative_backward_link : str
+        column name for the cumulative backward imputation link
+    auxiliary : str
+        column name for auxiliary variable
+    construction_link : str
+        column name for construction link
+    imputation_types : tuple
+        types of imputation to run and add to combined_imputation column stored in a
+        tuple. Each entry must be one of 'c', 'fir', 'bir' or 'fic'; any other value
+        raises a KeyError. If 'fic' is selected 'c' must also be selected and
+        precede 'fic'. For 'fic' to produce the correct result, the C marker must
+        be in the first period for a given reference.
+
+    Returns
+    -------
+    pandas.DataFrame
+        dataframe sorted by imputation_class, reference and period, with the
+        imputation values defined by the imputation marker stored in
+        combined_imputation. The intermediate per-type columns are dropped.
+    """
+
+    # constructed has to come first to use the result for forward impute from
+    # constructed
+    imputation_config = {
+        "c": {
+            "intermediate_column": "constructed",
+            "marker": "C",
+            # rows marked C are expected to carry their own auxiliary value, so
+            # the fill direction should not matter here
+            "fill_column": auxiliary,
+            "fill_method": "ffill",
+            "link_column": construction_link,
+        },
+        "fir": {
+            "intermediate_column": "fir",
+            "marker": "FIR",
+            "fill_column": target,
+            "fill_method": "ffill",
+            "link_column": cumulative_forward_link,
+        },
+        "bir": {
+            "intermediate_column": "bir",
+            "marker": "BIR",
+            "fill_column": target,
+            "fill_method": "bfill",
+            "link_column": cumulative_backward_link,
+        },
+        "fic": {
+            # FIC only works if the C is in the first period of the business being
+            # sampled. This is fine for automatic imputation, but should be careful
+            # if manual construction imputation is done
+            "intermediate_column": "fic",
+            "marker": "FIC",
+            # this has to have the same name as the intermediate column for constructed
+            "fill_column": "constructed",
+            "fill_method": "ffill",
+            "link_column": cumulative_forward_link,
+        },
+    }
+
+    # sort_values without inplace returns a new frame; every column added below
+    # lands on that new frame instead of on the caller's object
+    df = df.sort_values([imputation_class, reference, period])
+
+    intermediate_columns = []
+
+    for imp_type in imputation_types:
+        spec = imputation_config[imp_type]
+        df = create_impute(df, [imputation_class, reference], spec)
+        df = merge_imputation_type(df, spec, marker, combined_imputation)
+        intermediate_columns.append(spec["intermediate_column"])
+
+    return df.drop(columns=intermediate_columns)
+
+
+def create_impute(df, group, imputation_spec):
+    """
+    Add a new column to a dataframe of imputed values using ratio imputation.
+
+    The fill column is forward or backward filled within each group and the
+    result is multiplied by the link column. The new column is written onto
+    ``df`` itself, which is also returned.
+
+    Parameters
+    ----------
+    df : pandas.DataFrame
+    group : str or list
+        variables that define the imputation class
+    imputation_spec: dict
+        dictionary defining the details of the imputation type. The keys read
+        are "intermediate_column" (name of the column to create), "fill_column",
+        "fill_method" ("ffill" or "bfill"; the aliases "pad" and "backfill" are
+        also accepted) and "link_column".
+
+    Returns
+    -------
+    pandas.DataFrame
+        dataframe with an added imputation column defined by the imputation_spec
+
+    Raises
+    ------
+    ValueError
+        if "fill_method" is not a recognised fill direction
+    """
+    column_name = imputation_spec["intermediate_column"]
+    fill_column = imputation_spec["fill_column"]
+    fill_method = imputation_spec["fill_method"]
+    link_column = imputation_spec["link_column"]
+
+    grouped = df.groupby(group)[fill_column]
+
+    # Call the dedicated groupby fill methods: groupby fillna and its `method`
+    # argument are deprecated in recent pandas releases.
+    if fill_method in ("ffill", "pad"):
+        filled = grouped.ffill()
+    elif fill_method in ("bfill", "backfill"):
+        filled = grouped.bfill()
+    else:
+        raise ValueError(
+            f"fill_method must be 'ffill' or 'bfill', got {fill_method!r}"
+        )
+
+    df[column_name] = filled * df[link_column]
+    return df
+
+
+def merge_imputation_type(df, imputation_spec, marker, combined_imputation):
+    """
+    Copy values from one imputation type's column into the combined column
+    for the rows whose marker matches that imputation type.
+
+    The assignment is made on ``df`` itself, which is also returned.
+
+    Parameters
+    ----------
+    df : pandas.DataFrame
+    imputation_spec: dict
+        dictionary defining the details of the imputation type; the "marker"
+        and "intermediate_column" keys are read
+    marker : str
+        column name containing a marker to indicate the type of imputation required
+    combined_imputation : str
+        column name for the combined imputation types according to the imputation marker
+
+    Returns
+    -------
+    pandas.DataFrame
+        dataframe with combined_imputation
+    """
+
+    source_column = imputation_spec["intermediate_column"]
+    wanted_marker = imputation_spec["marker"]
+
+    # only rows flagged with this imputation type receive its value
+    matching_rows = df[marker] == wanted_marker
+    df.loc[matching_rows, combined_imputation] = df[source_column]
+
+    return df
diff --git a/tests/apply_imputation_link.csv b/tests/apply_imputation_link.csv
@@ -0,0 +1,10 @@
+strata,reference,target,period,forward_imputation_link,backward_imputation_link,imputation_group,cumulative_forward_imputation_link,cumulative_backward_imputation_link,imputation_marker,imputed_value,auxiliary_variable,construction_link
+100,100000,200,202402,1,2,1,,,R,,,
+100,100000,,202403,2,0.6,2,2,0.6,FIR,400,,
+100,100000,,202404,3,1,2,6,1,FIR,1200,,
+200,100001,,202402,1,4,3,1,2,BIR,600,,
+200,100001,,202403,3,0.5,3,3,0.5,BIR,150,,
+200,100001,300,202404,0.5,1,4,,,R,,,
+300,100002,,202402,1,4,5,1,2,C,600,40,0.1
+300,100002,,202403,3,0.5,5,3,0.5,FIC,150,,
+300,100002,,202404,0.5,1,5,2,,FIC,,,
diff --git a/tests/data/apply_imputation_link/BIR.csv b/tests/data/apply_imputation_link/BIR.csv
@@ -0,0 +1,4 @@
+imputation_class,reference,target,period,backward_imputation_link,cumulative_backward_imputation_link,imputation_marker,imputed_value
+200,100001,,202402,4,2,BIR,600
+200,100001,,202403,0.5,0.5,BIR,150
+200,100001,300,202404,1,,R,
diff --git a/tests/data/apply_imputation_link/C_FIC.csv b/tests/data/apply_imputation_link/C_FIC.csv
@@ -0,0 +1,4 @@
+imputation_class,reference,target,period,forward_imputation_link,cumulative_forward_imputation_link,construction_link,auxiliary_variable,imputation_marker,imputed_value
+300,100002,,202402,1,,0.1,1000,C,100
+300,100002,,202403,3,3,,,FIC,300
+300,100002,,202404,0.5,1.5,,,FIC,150
diff --git a/tests/data/apply_imputation_link/FIR.csv b/tests/data/apply_imputation_link/FIR.csv
@@ -0,0 +1,4 @@
+imputation_class,reference,target,period,forward_imputation_link,cumulative_forward_imputation_link,imputation_marker,imputed_value
+100,100000,200,202402,1,,R,
+100,100000,,202403,2,2,FIR,400
+100,100000,,202404,3,6,FIR,1200
diff --git a/tests/data/apply_imputation_link/FIR_BIR_C_FIC.csv b/tests/data/apply_imputation_link/FIR_BIR_C_FIC.csv
@@ -0,0 +1,10 @@
+imputation_class,reference,target,period,forward_imputation_link,backward_imputation_link,auxiliary_variable,construction_link,cumulative_forward_link,cumulative_backward_link,imputation_marker,imputed_value
+100,100000,200,202402,1,2,,,,,R,
+100,100000,,202403,2,0.6,,,2,0.6,FIR,400
+100,100000,,202404,3,1,,,6,1,FIR,1200
+200,100001,,202402,1,4,,,1,2,BIR,600
+200,100001,,202403,3,0.5,,,3,0.5,BIR,150
+200,100001,300,202404,0.5,1,,,,,R,
+300,100002,,202402,1,4,1000,0.1,,2,C,100
+300,100002,,202403,3,0.5,,,3,0.5,FIC,300
+300,100002,,202404,0.5,1,,,1.5,,FIC,150
diff --git a/tests/test_apply_imputation_link.py b/tests/test_apply_imputation_link.py
@@ -0,0 +1,37 @@
+from pathlib import Path
+
+import pytest
+from helper_functions import load_and_format
+from pandas.testing import assert_frame_equal
+
+from src.apply_imputation_link import create_and_merge_imputation_values
+
+
+@pytest.fixture(scope="class")
+def fir_bir_c_fic_test_data():
+    """Load the FIR_BIR_C_FIC.csv test case through load_and_format."""
+    csv_path = Path("tests", "data", "apply_imputation_link", "FIR_BIR_C_FIC.csv")
+    return load_and_format(csv_path)
+
+
+class TestApplyImputationLink:
+    """Tests for create_and_merge_imputation_values."""
+
+    def test_all_imputation_types(self, fir_bir_c_fic_test_data):
+        """Run every imputation type and compare against the fixture data."""
+        expected_output = fir_bir_c_fic_test_data
+
+        # the input is the expected frame without the column under test
+        input_data = expected_output.drop(columns=["imputed_value"])
+
+        actual_output = create_and_merge_imputation_values(
+            df=input_data,
+            imputation_class="imputation_class",
+            reference="reference",
+            period="period",
+            marker="imputation_marker",
+            combined_imputation="imputed_value",
+            target="target",
+            cumulative_forward_link="cumulative_forward_link",
+            cumulative_backward_link="cumulative_backward_link",
+            auxiliary="auxiliary_variable",
+            construction_link="construction_link",
+            imputation_types=("c", "fir", "bir", "fic"),
+        )
+
+        assert_frame_equal(actual_output, expected_output)