Skip to content

Commit

Permalink
331 apply imputation link to target (#19)
Browse files Browse the repository at this point in the history
* add test data

* some refactoring before function

* added construction case to test data

* refactored into functions

* add test for higher level function
  • Loading branch information
robertswh authored Jun 3, 2024
1 parent a27bb91 commit d2e8a38
Show file tree
Hide file tree
Showing 7 changed files with 230 additions and 0 deletions.
161 changes: 161 additions & 0 deletions src/apply_imputation_link.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
def create_and_merge_imputation_values(
    df,
    imputation_class,
    reference,
    period,
    marker,
    combined_imputation,
    target,
    cumulative_forward_link,
    cumulative_backward_link,
    auxiliary,
    construction_link,
    imputation_types=("c", "fir", "bir", "fic"),
):
    """
    Loop through different imputation types and merge the results according
    to an imputation marker column.

    The input dataframe is not modified: the work is done on a copy sorted by
    imputation class, reference and period, and that sorted copy (minus the
    intermediate columns) is what gets returned.

    Parameters
    ----------
    df : pandas.DataFrame
    imputation_class : str
        column name for the variable that defines the imputation class
    reference : str
        column name for the reference
    period : str
        column name for the period
    marker : str
        column name containing a marker to indicate the type of imputation
        required
    combined_imputation : str
        column name for the combined imputation types according to the
        imputation marker
    target : str
        column name for the target variable for imputation
    cumulative_forward_link : str
        column name for the cumulative forward imputation link
    cumulative_backward_link : str
        column name for the cumulative backward imputation link
    auxiliary : str
        column name for auxiliary variable
    construction_link : str
        column name for construction link
    imputation_types : tuple
        types of imputation to run and add to combined_imputation column
        stored in a tuple. If 'fic' is selected 'c' must also be selected and
        precede 'fic'. For 'fic' to produce the correct result, the C marker
        must be in the first period for a given reference.

    Returns
    -------
    pandas.DataFrame
        dataframe with imputation values defined by the imputation marker

    Raises
    ------
    KeyError
        if an entry of imputation_types is not one of 'c', 'fir', 'bir', 'fic'
    """

    # constructed has to come first to use the result for forward impute from
    # constructed
    imputation_config = {
        "c": {
            "intermediate_column": "constructed",
            "marker": "C",
            # doesn't actually apply a fill so can be forward or back
            "fill_column": auxiliary,
            "fill_method": "ffill",
            "link_column": construction_link,
        },
        "fir": {
            "intermediate_column": "fir",
            "marker": "FIR",
            "fill_column": target,
            "fill_method": "ffill",
            "link_column": cumulative_forward_link,
        },
        "bir": {
            "intermediate_column": "bir",
            "marker": "BIR",
            "fill_column": target,
            "fill_method": "bfill",
            "link_column": cumulative_backward_link,
        },
        "fic": {
            # FIC only works if the C is in the first period of the business
            # being sampled. This is fine for automatic imputation, but should
            # be careful if manual construction imputation is done
            "intermediate_column": "fic",
            "marker": "FIC",
            # this has to have the same name as the intermediate column for
            # constructed
            "fill_column": "constructed",
            "fill_method": "ffill",
            "link_column": cumulative_forward_link,
        },
    }

    # sort into a new frame rather than in place: the helpers below add
    # columns to whatever frame they are given, so working on the caller's
    # object would reorder it and leave the intermediate columns behind in it
    df = df.sort_values([imputation_class, reference, period])

    intermediate_columns = []

    for imp_type in imputation_types:
        imputation_spec = imputation_config[imp_type]

        df = create_impute(df, [imputation_class, reference], imputation_spec)
        df = merge_imputation_type(
            df, imputation_spec, marker, combined_imputation
        )

        intermediate_columns.append(imputation_spec["intermediate_column"])

    return df.drop(columns=intermediate_columns)


def create_impute(df, group, imputation_spec):
    """
    Add a new column to a dataframe of imputed values using ratio imputation.

    The fill column is filled within each group in the direction given by the
    fill method, and the filled values are then multiplied row by row with
    the link column.

    Parameters
    ----------
    df : pandas.DataFrame
        the new column is added to this dataframe in place
    group : str or list
        variables that define the imputation class
    imputation_spec: dict
        dictionary defining the details of the imputation type. The keys read
        here are "intermediate_column" (name of the column to create),
        "fill_column", "fill_method" ("ffill" or "bfill"; the aliases "pad"
        and "backfill" are also accepted) and "link_column"

    Returns
    -------
    pandas.DataFrame
        dataframe with an added imputation column defined by the
        imputation_spec

    Raises
    ------
    ValueError
        if the fill method is not a recognised forward or backward fill
    """
    column_name = imputation_spec["intermediate_column"]
    fill_column = imputation_spec["fill_column"]
    fill_method = imputation_spec["fill_method"]
    link_column = imputation_spec["link_column"]

    # groupby(...).fillna(method=...) is deprecated in pandas, so dispatch to
    # the dedicated group-wise fill methods instead
    forward_fills = ("ffill", "pad")
    backward_fills = ("bfill", "backfill")

    grouped_values = df.groupby(group)[fill_column]
    if fill_method in forward_fills:
        filled_values = grouped_values.ffill()
    elif fill_method in backward_fills:
        filled_values = grouped_values.bfill()
    else:
        raise ValueError(
            f"Invalid fill method {fill_method!r}: expected one of "
            f"{forward_fills + backward_fills}"
        )

    df[column_name] = filled_values * df[link_column]
    return df


def merge_imputation_type(df, imputation_spec, marker, combined_imputation):
    """
    Copy imputed values into the combined imputation column for the rows
    whose marker matches the given imputation type.

    Parameters
    ----------
    df : pandas.DataFrame
        updated in place
    imputation_spec: dict
        dictionary defining the details of the imputation type. The keys read
        here are "marker" (the marker value to match) and
        "intermediate_column" (the column holding the imputed values)
    marker : str
        column name containing a marker to indicate the type of imputation
        required
    combined_imputation : str
        column name for the combined imputation types according to the
        imputation marker

    Returns
    -------
    pandas.DataFrame
        dataframe with combined_imputation
    """
    wanted_marker = imputation_spec["marker"]
    source_column = imputation_spec["intermediate_column"]

    imputed_values = df[source_column]
    rows_to_fill = df[marker].eq(wanted_marker)

    # only rows carrying this marker are written; all other rows keep
    # whatever the combined column already holds
    df.loc[rows_to_fill, combined_imputation] = imputed_values
    return df
10 changes: 10 additions & 0 deletions tests/apply_imputation_link.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
strata,reference,target,period,forward_imputation_link,backward_imputation_link,imputation_group,cumulative_forward_imputation_link,cumulative_backward_imputation_link,imputation_marker,imputed_value,auxiliary_variable,construction_link
100,100000,200,202402,1,2,1,,,R,,,
100,100000,,202403,2,0.6,2,2,0.6,FIR,400,,
100,100000,,202404,3,1,2,6,1,FIR,1200,,
200,100001,,202402,1,4,3,1,2,BIR,600,,
200,100001,,202403,3,0.5,3,3,0.5,BIR,150,,
200,100001,300,202404,0.5,1,4,,,R,,,
300,100002,,202402,1,4,5,1,2,C,600,40,0.1
300,100002,,202403,3,0.5,5,3,0.5,FIC,150,,
300,100002,,202404,0.5,1,5,2,,FIC,,,
4 changes: 4 additions & 0 deletions tests/data/apply_imputation_link/BIR.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
imputation_class,reference,target,period,backward_imputation_link,cumulative_backward_imputation_link,imputation_marker,imputed_value
200,100001,,202402,4,2,BIR,600
200,100001,,202403,0.5,0.5,BIR,150
200,100001,300,202404,1,,R,
4 changes: 4 additions & 0 deletions tests/data/apply_imputation_link/C_FIC.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
imputation_class,reference,target,period,forward_imputation_link,cumulative_forward_imputation_link,construction_link,auxiliary_variable,imputation_marker,imputed_value
300,100002,,202402,1,,0.1,1000,C,100
300,100002,,202403,3,3,,,FIC,300
300,100002,,202404,0.5,1.5,,,FIC,150
4 changes: 4 additions & 0 deletions tests/data/apply_imputation_link/FIR.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
imputation_class,reference,target,period,forward_imputation_link,cumulative_forward_imputation_link,imputation_marker,imputed_value
100,100000,200,202402,1,,R,
100,100000,,202403,2,2,FIR,400
100,100000,,202404,3,6,FIR,1200
10 changes: 10 additions & 0 deletions tests/data/apply_imputation_link/FIR_BIR_C_FIC.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
imputation_class,reference,target,period,forward_imputation_link,backward_imputation_link,auxiliary_variable,construction_link,cumulative_forward_link,cumulative_backward_link,imputation_marker,imputed_value
100,100000,200,202402,1,2,,,,,R,
100,100000,,202403,2,0.6,,,2,0.6,FIR,400
100,100000,,202404,3,1,,,6,1,FIR,1200
200,100001,,202402,1,4,,,1,2,BIR,600
200,100001,,202403,3,0.5,,,3,0.5,BIR,150
200,100001,300,202404,0.5,1,,,,,R,
300,100002,,202402,1,4,1000,0.1,,2,C,100
300,100002,,202403,3,0.5,,,3,0.5,FIC,300
300,100002,,202404,0.5,1,,,1.5,,FIC,150
37 changes: 37 additions & 0 deletions tests/test_apply_imputation_link.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
from pathlib import Path

import pytest
from helper_functions import load_and_format
from pandas.testing import assert_frame_equal

from src.apply_imputation_link import create_and_merge_imputation_values


@pytest.fixture(scope="class")
def fir_bir_c_fic_test_data():
    """Return the FIR_BIR_C_FIC.csv test case as read by load_and_format."""
    csv_path = Path("tests", "data", "apply_imputation_link", "FIR_BIR_C_FIC.csv")
    return load_and_format(csv_path)


class TestApplyImputationLink:
    """Tests for create_and_merge_imputation_values."""

    def test_all_imputation_types(self, fir_bir_c_fic_test_data):
        """Run every imputation type and compare against the fixture data."""
        expected_output = fir_bir_c_fic_test_data

        # the input is the expected frame without its answer column
        input_data = expected_output.drop(columns=["imputed_value"])

        column_names = {
            "imputation_class": "imputation_class",
            "reference": "reference",
            "period": "period",
            "marker": "imputation_marker",
            "combined_imputation": "imputed_value",
            "target": "target",
            "cumulative_forward_link": "cumulative_forward_link",
            "cumulative_backward_link": "cumulative_backward_link",
            "auxiliary": "auxiliary_variable",
            "construction_link": "construction_link",
        }

        actual_output = create_and_merge_imputation_values(
            input_data,
            imputation_types=("c", "fir", "bir", "fic"),
            **column_names,
        )

        assert_frame_equal(actual_output, expected_output)

0 comments on commit d2e8a38

Please sign in to comment.