-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
331 apply imputation link to target (#19)
* add test data
* some refactoring before function
* added construction case to test data
* refactored into functions
* add test for higher level function
- Loading branch information
Showing 7 changed files with 230 additions and 0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,161 @@ | ||
def create_and_merge_imputation_values(
    df,
    imputation_class,
    reference,
    period,
    marker,
    combined_imputation,
    target,
    cumulative_forward_link,
    cumulative_backward_link,
    auxiliary,
    construction_link,
    imputation_types=("c", "fir", "bir", "fic"),
):
    """
    Loop through different imputation types and merge the results according
    to an imputation marker column.

    The input dataframe is not modified: the rows are sorted into a new
    dataframe by imputation_class, reference and period before any imputation
    column is added.

    Parameters
    ----------
    df : pandas.DataFrame
    imputation_class : str
        column name for the variable that defines the imputation class
    reference : str
        column name for the reference
    period : str
        column name for the period
    marker : str
        column name containing a marker to indicate the type of imputation
        required
    combined_imputation : str
        column name for the combined imputation types according to the
        imputation marker
    target : str
        column name for the target variable for imputation
    cumulative_forward_link : str
        column name for the cumulative forward imputation link
    cumulative_backward_link : str
        column name for the cumulative backward imputation link
    auxiliary : str
        column name for auxiliary variable
    construction_link : str
        column name for construction link
    imputation_types : tuple
        types of imputation to run and add to the combined_imputation column,
        processed in the order given. If 'fic' is selected 'c' must also be
        selected and precede 'fic'. For 'fic' to produce the correct result,
        the C marker must be in the first period for a given reference.

    Returns
    -------
    pandas.DataFrame
        dataframe with imputation values defined by the imputation marker

    Raises
    ------
    KeyError
        if an entry of imputation_types is not one of 'c', 'fir', 'bir', 'fic'
    """
    # The processing order is taken from imputation_types, not from this dict:
    # "c" must be listed before "fic" because "fic" fills from the
    # "constructed" intermediate column that "c" creates.
    imputation_config = {
        "c": {
            "intermediate_column": "constructed",
            "marker": "C",
            # Construction multiplies the auxiliary value by the construction
            # link; the fill direction is only a placeholder, so it can be
            # forward or back.
            "fill_column": auxiliary,
            "fill_method": "ffill",
            "link_column": construction_link,
        },
        "fir": {
            "intermediate_column": "fir",
            "marker": "FIR",
            "fill_column": target,
            "fill_method": "ffill",
            "link_column": cumulative_forward_link,
        },
        "bir": {
            "intermediate_column": "bir",
            "marker": "BIR",
            "fill_column": target,
            "fill_method": "bfill",
            "link_column": cumulative_backward_link,
        },
        "fic": {
            # FIC only works if the C is in the first period of the business
            # being sampled. This is fine for automatic imputation, but care
            # is needed if manual construction imputation is done.
            "intermediate_column": "fic",
            "marker": "FIC",
            # This must match the intermediate column name used by "c".
            "fill_column": "constructed",
            "fill_method": "ffill",
            "link_column": cumulative_forward_link,
        },
    }

    # Sort into a new frame rather than in place so the caller's dataframe is
    # neither reordered nor left carrying the intermediate columns.
    df = df.sort_values([imputation_class, reference, period])

    intermediate_columns = []
    for imp_type in imputation_types:
        imputation_spec = imputation_config[imp_type]
        df = create_impute(df, [imputation_class, reference], imputation_spec)
        df = merge_imputation_type(df, imputation_spec, marker, combined_imputation)
        intermediate_columns.append(imputation_spec["intermediate_column"])

    return df.drop(columns=intermediate_columns)
|
||
|
||
def create_impute(df, group, imputation_spec):
    """
    Add a new column to a dataframe of imputed values using ratio imputation.

    The fill column is filled within each group in the requested direction
    and the result is multiplied by the link column. The new column is
    assigned onto ``df`` itself, which is also returned.

    Parameters
    ----------
    df : pandas.DataFrame
    group : str or list
        variables that define the imputation class
    imputation_spec: dict
        dictionary defining the details of the imputation type; must contain
        the keys "intermediate_column", "fill_column", "fill_method" and
        "link_column"

    Returns
    -------
    pandas.DataFrame
        dataframe with an added imputation column defined by the
        imputation_spec

    Raises
    ------
    ValueError
        if "fill_method" is not a forward or backward fill
    """
    column_name = imputation_spec["intermediate_column"]
    fill_column = imputation_spec["fill_column"]
    fill_method = imputation_spec["fill_method"]
    link_column = imputation_spec["link_column"]

    # GroupBy.fillna(method=...) is deprecated in pandas, so dispatch to the
    # dedicated ffill/bfill methods; the old aliases are kept for
    # compatibility with existing specs.
    fill_dispatch = {
        "ffill": "ffill",
        "pad": "ffill",
        "bfill": "bfill",
        "backfill": "bfill",
    }
    if fill_method not in fill_dispatch:
        raise ValueError(
            f"Invalid fill method {fill_method!r}: expected one of "
            f"{sorted(fill_dispatch)}"
        )

    grouped_fill_column = df.groupby(group)[fill_column]
    filled = getattr(grouped_fill_column, fill_dispatch[fill_method])()

    df[column_name] = filled * df[link_column]
    return df
|
||
|
||
def merge_imputation_type(df, imputation_spec, marker, combined_imputation):
    """
    Uses an existing column of imputed values and an imputation marker to
    merge values into a single column.

    Rows whose marker equals the spec's marker take their value from the
    spec's intermediate column; all other rows are left as they are. The
    assignment is made on ``df`` itself, which is also returned.

    Parameters
    ----------
    df : pandas.DataFrame
    imputation_spec: dict
        dictionary defining the details of the imputation type; must contain
        the keys "marker" and "intermediate_column"
    marker : str
        column name containing a marker to indicate the type of imputation
        required
    combined_imputation : str
        column name for the combined imputation types according to the
        imputation marker

    Returns
    -------
    pandas.DataFrame
        dataframe with combined_imputation
    """
    imputation_marker = imputation_spec["marker"]
    imputation_column = imputation_spec["intermediate_column"]

    rows_with_marker = df[marker] == imputation_marker
    # The right-hand side is aligned on the index, so only the selected rows
    # receive a value.
    df.loc[rows_with_marker, combined_imputation] = df[imputation_column]
    return df
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
strata,reference,target,period,forward_imputation_link,backward_imputation_link,imputation_group,cumulative_forward_imputation_link,cumulative_backward_imputation_link,imputation_marker,imputed_value,auxiliary_variable,construction_link | ||
100,100000,200,202402,1,2,1,,,R,,, | ||
100,100000,,202403,2,0.6,2,2,0.6,FIR,400,, | ||
100,100000,,202404,3,1,2,6,1,FIR,1200,, | ||
200,100001,,202402,1,4,3,1,2,BIR,600,, | ||
200,100001,,202403,3,0.5,3,3,0.5,BIR,150,, | ||
200,100001,300,202404,0.5,1,4,,,R,,, | ||
300,100002,,202402,1,4,5,1,2,C,600,40,0.1 | ||
300,100002,,202403,3,0.5,5,3,0.5,FIC,150,, | ||
300,100002,,202404,0.5,1,5,2,,FIC,,, |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
imputation_class,reference,target,period,backward_imputation_link,cumulative_backward_imputation_link,imputation_marker,imputed_value | ||
200,100001,,202402,4,2,BIR,600 | ||
200,100001,,202403,0.5,0.5,BIR,150 | ||
200,100001,300,202404,1,,R, |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
imputation_class,reference,target,period,forward_imputation_link,cumulative_forward_imputation_link,construction_link,auxiliary_variable,imputation_marker,imputed_value | ||
300,100002,,202402,1,,0.1,1000,C,100 | ||
300,100002,,202403,3,3,,,FIC,300 | ||
300,100002,,202404,0.5,1.5,,,FIC,150 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
imputation_class,reference,target,period,forward_imputation_link,cumulative_forward_imputation_link,imputation_marker,imputed_value | ||
100,100000,200,202402,1,,R, | ||
100,100000,,202403,2,2,FIR,400 | ||
100,100000,,202404,3,6,FIR,1200 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
imputation_class,reference,target,period,forward_imputation_link,backward_imputation_link,auxiliary_variable,construction_link,cumulative_forward_link,cumulative_backward_link,imputation_marker,imputed_value | ||
100,100000,200,202402,1,2,,,,,R, | ||
100,100000,,202403,2,0.6,,,2,0.6,FIR,400 | ||
100,100000,,202404,3,1,,,6,1,FIR,1200 | ||
200,100001,,202402,1,4,,,1,2,BIR,600 | ||
200,100001,,202403,3,0.5,,,3,0.5,BIR,150 | ||
200,100001,300,202404,0.5,1,,,,,R, | ||
300,100002,,202402,1,4,1000,0.1,,2,C,100 | ||
300,100002,,202403,3,0.5,,,3,0.5,FIC,300 | ||
300,100002,,202404,0.5,1,,,1.5,,FIC,150 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
from pathlib import Path | ||
|
||
import pytest | ||
from helper_functions import load_and_format | ||
from pandas.testing import assert_frame_equal | ||
|
||
from src.apply_imputation_link import create_and_merge_imputation_values | ||
|
||
|
||
@pytest.fixture(scope="class")
def fir_bir_c_fic_test_data():
    """Load the FIR_BIR_C_FIC.csv test dataset through load_and_format."""
    return load_and_format(
        Path("tests") / "data" / "apply_imputation_link" / "FIR_BIR_C_FIC.csv"
    )
|
||
|
||
class TestApplyImputationLink:
    """Tests for create_and_merge_imputation_values."""

    def test_all_imputation_types(self, fir_bir_c_fic_test_data):
        """Running all four imputation types reproduces the imputed_value column."""
        expected_output = fir_bir_c_fic_test_data

        # The expected column is removed so the function has to rebuild it.
        input_data = expected_output.drop(columns=["imputed_value"])
        actual_output = create_and_merge_imputation_values(
            input_data,
            imputation_class="imputation_class",
            reference="reference",
            period="period",
            marker="imputation_marker",
            combined_imputation="imputed_value",
            target="target",
            cumulative_forward_link="cumulative_forward_link",
            cumulative_backward_link="cumulative_backward_link",
            auxiliary="auxiliary_variable",
            construction_link="construction_link",
            imputation_types=("c", "fir", "bir", "fic"),
        )

        assert_frame_equal(actual_output, expected_output)