def create_and_merge_imputation_values(
    df,
    imputation_class,
    reference,
    period,
    marker,
    combined_imputation,
    target,
    cumulative_forward_link,
    cumulative_backward_link,
    auxiliary,
    construction_link,
    imputation_types=("c", "fir", "bir", "fic"),
):
    """
    Loop through different imputation types and merge the results according
    to an imputation marker column

    The input dataframe is not modified: the work is done on a sorted copy.

    Parameters
    ----------
    df : pandas.DataFrame
    imputation_class : str
        column name for the variable that defines the imputation class
    reference : str
        column name for the reference
    period : str
        column name for the period
    marker : str
        column name containing a marker to indicate the type of imputation required
    combined_imputation : str
        column name for the combined imputation types according to the imputation marker
    target : str
        column name for the target variable for imputation
    cumulative_forward_link : str
        column name for the cumulative forward imputation link
    cumulative_backward_link : str
        column name for the cumulative backward imputation link
    auxiliary : str
        column name for auxiliary variable
    construction_link : str
        column name for construction link
    imputation_types : tuple
        types of imputation to run and add to combined_imputation column stored in a
        tuple. The types are processed in the order given. If 'fic' is selected
        'c' must also be selected and precede 'fic', because 'fic' fills from the
        intermediate column produced by 'c'.
        For 'fic' to produce the correct result, the C marker must be in the first
        period for a given reference.

    Returns
    -------
    pandas.DataFrame
        dataframe sorted by imputation class, reference and period, with the
        imputation values defined by the imputation marker in the
        combined_imputation column. The intermediate columns are dropped.

    Raises
    ------
    KeyError
        if an entry of imputation_types is not one of 'c', 'fir', 'bir', 'fic'
    """

    imputation_config = {
        "c": {
            "intermediate_column": "constructed",
            "marker": "C",
            # doesn't actually apply a fill so can be forward or back
            "fill_column": auxiliary,
            "fill_method": "ffill",
            "link_column": construction_link,
        },
        "fir": {
            "intermediate_column": "fir",
            "marker": "FIR",
            "fill_column": target,
            "fill_method": "ffill",
            "link_column": cumulative_forward_link,
        },
        "bir": {
            "intermediate_column": "bir",
            "marker": "BIR",
            "fill_column": target,
            "fill_method": "bfill",
            "link_column": cumulative_backward_link,
        },
        "fic": {
            # FIC only works if the C is in the first period of the business being
            # sampled. This is fine for automatic imputation, but should be careful
            # if manual construction imputation is done
            "intermediate_column": "fic",
            "marker": "FIC",
            # this has to have the same name as the intermediate column for constructed
            "fill_column": "constructed",
            "fill_method": "ffill",
            "link_column": cumulative_forward_link,
        },
    }

    # Sort into a new frame instead of in place so that neither the reordering
    # nor the intermediate columns added below leak into the caller's dataframe.
    # The fills in create_impute rely on rows being in period order per group.
    df = df.sort_values([imputation_class, reference, period])

    intermediate_columns = []

    for imp_type in imputation_types:
        spec = imputation_config[imp_type]
        df = create_impute(df, [imputation_class, reference], spec)
        df = merge_imputation_type(df, spec, marker, combined_imputation)
        intermediate_columns.append(spec["intermediate_column"])

    return df.drop(columns=intermediate_columns)


# Accepted fill_method names mapped to the GroupBy method that implements them.
# "pad" and "backfill" are kept because fillna(method=...) accepted them too.
_FILL_METHODS = {
    "ffill": "ffill",
    "pad": "ffill",
    "bfill": "bfill",
    "backfill": "bfill",
}


def create_impute(df, group, imputation_spec):
    """
    Add a new column to a dataframe of imputed values using ratio imputation.

    The fill column is forward or backward filled within each group and then
    multiplied by the link column. The column is added to ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
    group : str or list
        variables that define the imputation class
    imputation_spec: dict
        dictionary defining the details of the imputation type. The keys read
        here are "intermediate_column", "fill_column", "fill_method" and
        "link_column".

    Returns
    -------
    pandas.DataFrame
        dataframe with an added imputation column defined by the imputation_spec

    Raises
    ------
    ValueError
        if the fill method is not one of "ffill", "pad", "bfill" or "backfill"
    """
    column_name = imputation_spec["intermediate_column"]
    fill_column = imputation_spec["fill_column"]
    fill_method = imputation_spec["fill_method"]
    link_column = imputation_spec["link_column"]

    groupby_method = _FILL_METHODS.get(fill_method)
    if groupby_method is None:
        raise ValueError(
            f"Invalid fill method {fill_method!r}. "
            f"Expecting one of {sorted(_FILL_METHODS)}"
        )

    # GroupBy.ffill()/bfill() replace the deprecated fillna(method=...) call
    filled = getattr(df.groupby(group)[fill_column], groupby_method)()

    df[column_name] = filled * df[link_column]
    return df


def merge_imputation_type(df, imputation_spec, marker, combined_imputation):
    """
    Uses an existing column of imputed values and an imputation marker to merge
    values into a single column

    Only rows whose marker equals the marker of the imputation type are
    written; other rows of the combined column are left as they are. The
    dataframe is updated in place.

    Parameters
    ----------
    df : pandas.DataFrame
    imputation_spec: dict
        dictionary defining the details of the imputation type. The keys read
        here are "marker" and "intermediate_column".
    marker : str
        column name containing a marker to indicate the type of imputation required
    combined_imputation : str
        column name for the combined imputation types according to the imputation marker

    Returns
    -------
    pandas.DataFrame
        dataframe with combined_imputation
    """

    imputation_marker = imputation_spec["marker"]
    imputation_column = imputation_spec["intermediate_column"]

    has_marker = df[marker] == imputation_marker
    # the right hand side is aligned on the index, so only marked rows are set
    df.loc[has_marker, combined_imputation] = df[imputation_column]
    return df
+100,100000,,202404,3,1,2,6,1,FIR,1200,, +200,100001,,202402,1,4,3,1,2,BIR,600,, +200,100001,,202403,3,0.5,3,3,0.5,BIR,150,, +200,100001,300,202404,0.5,1,4,,,R,,, +300,100002,,202402,1,4,5,1,2,C,600,40,0.1 +300,100002,,202403,3,0.5,5,3,0.5,FIC,150,, +300,100002,,202404,0.5,1,5,2,,FIC,,, diff --git a/tests/data/apply_imputation_link/BIR.csv b/tests/data/apply_imputation_link/BIR.csv new file mode 100755 index 00000000..954700c4 --- /dev/null +++ b/tests/data/apply_imputation_link/BIR.csv @@ -0,0 +1,4 @@ +imputation_class,reference,target,period,backward_imputation_link,cumulative_backward_imputation_link,imputation_marker,imputed_value +200,100001,,202402,4,2,BIR,600 +200,100001,,202403,0.5,0.5,BIR,150 +200,100001,300,202404,1,,R, diff --git a/tests/data/apply_imputation_link/C_FIC.csv b/tests/data/apply_imputation_link/C_FIC.csv new file mode 100755 index 00000000..7d2424b2 --- /dev/null +++ b/tests/data/apply_imputation_link/C_FIC.csv @@ -0,0 +1,4 @@ +imputation_class,reference,target,period,forward_imputation_link,cumulative_forward_imputation_link,construction_link,auxiliary_variable,imputation_marker,imputed_value +300,100002,,202402,1,,0.1,1000,C,100 +300,100002,,202403,3,3,,,FIC,300 +300,100002,,202404,0.5,1.5,,,FIC,150 diff --git a/tests/data/apply_imputation_link/FIR.csv b/tests/data/apply_imputation_link/FIR.csv new file mode 100755 index 00000000..341ece76 --- /dev/null +++ b/tests/data/apply_imputation_link/FIR.csv @@ -0,0 +1,4 @@ +imputation_class,reference,target,period,forward_imputation_link,cumulative_forward_imputation_link,imputation_marker,imputed_value +100,100000,200,202402,1,,R, +100,100000,,202403,2,2,FIR,400 +100,100000,,202404,3,6,FIR,1200 diff --git a/tests/data/apply_imputation_link/FIR_BIR_C_FIC.csv b/tests/data/apply_imputation_link/FIR_BIR_C_FIC.csv new file mode 100755 index 00000000..91ec36ec --- /dev/null +++ b/tests/data/apply_imputation_link/FIR_BIR_C_FIC.csv @@ -0,0 +1,10 @@ 
from pathlib import Path

import pytest
from helper_functions import load_and_format
from pandas.testing import assert_frame_equal

from src.apply_imputation_link import create_and_merge_imputation_values


@pytest.fixture(scope="class")
def fir_bir_c_fic_test_data():
    """Return the FIR_BIR_C_FIC.csv test data, built once per test class."""
    csv_path = Path("tests", "data", "apply_imputation_link", "FIR_BIR_C_FIC.csv")
    return load_and_format(csv_path)


class TestApplyImputationLink:
    """Tests for create_and_merge_imputation_values."""

    def test_all_imputation_types(self, fir_bir_c_fic_test_data):
        """Run every imputation type and compare against the expected column."""
        expected_output = fir_bir_c_fic_test_data

        # The input is the expected frame without the column under test
        input_data = expected_output.drop(columns=["imputed_value"])

        actual_output = create_and_merge_imputation_values(
            df=input_data,
            imputation_class="imputation_class",
            reference="reference",
            period="period",
            marker="imputation_marker",
            combined_imputation="imputed_value",
            target="target",
            cumulative_forward_link="cumulative_forward_link",
            cumulative_backward_link="cumulative_backward_link",
            auxiliary="auxiliary_variable",
            construction_link="construction_link",
            imputation_types=("c", "fir", "bir", "fic"),
        )

        assert_frame_equal(actual_output, expected_output)