From 157d102ddd9baaa3fdcea85e678caf5d45d00832 Mon Sep 17 00:00:00 2001 From: zogkoa Date: Mon, 10 Jun 2024 17:48:19 +0100 Subject: [PATCH 01/10] fix: check for imputation marker change The imputation_group column should look at imputation_marker for changes instead of missing_value --- src/cumulative_imputation_links.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/cumulative_imputation_links.py b/src/cumulative_imputation_links.py index 91dfbed9..5d1584ba 100755 --- a/src/cumulative_imputation_links.py +++ b/src/cumulative_imputation_links.py @@ -46,7 +46,12 @@ def get_cumulative_links( dataframe["imputation_group"] = ( ( - (dataframe["missing_value"].diff(time_difference) != 0) + ( + dataframe["imputation_marker"] + .ne(dataframe["imputation_marker"].shift().bfill()) + .astype(int) + != 0 + ) | (dataframe[strata].diff(time_difference) != 0) | (dataframe[reference].diff(time_difference) != 0) ) From 047cbecb8a93f175bf93132e16c0509bb454706b Mon Sep 17 00:00:00 2001 From: zogkoa Date: Tue, 11 Jun 2024 14:31:46 +0100 Subject: [PATCH 02/10] Change return type to dataframe instead of series --- src/cumulative_imputation_links.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cumulative_imputation_links.py b/src/cumulative_imputation_links.py index 5d1584ba..92f1709f 100755 --- a/src/cumulative_imputation_links.py +++ b/src/cumulative_imputation_links.py @@ -74,4 +74,4 @@ def get_cumulative_links( dataframe["cumulative_" + imputation_link], ) - return dataframe[["imputation_group", "cumulative_" + imputation_link]] + return dataframe From a2a152c2df35bded1787cb693b0272d8aff7a813 Mon Sep 17 00:00:00 2001 From: zogkoa Date: Tue, 11 Jun 2024 14:35:23 +0100 Subject: [PATCH 03/10] Adjust test data and tests to use imputation marker To fix a bug with cummulative links we are using imputation marker Add a new column with imputation marker to test data Adapt tests accordingly --- tests/cumulative_links.csv | 14 ++++---- tests/test_cumulative_imputation_links.py | 42 ++++++++++++++--------- 2 files changed, 33 insertions(+), 23 deletions(-) diff --git a/tests/cumulative_links.csv b/tests/cumulative_links.csv index bef347a5..30c455e0 100755 --- a/tests/cumulative_links.csv +++ b/tests/cumulative_links.csv @@ -1,7 +1,7 @@ -strata,reference,target,period,forward_imputation_link,backward_imputation_link,imputation_group,cumulative_forward_imputation_link,cumulative_backward_imputation_link -100,100000,200,202402,1,2,1,, -100,100000,,202403,2,0.6,2,2,0.6 -100,100000,,202404,3,1,2,6,1 -200,100001,,202402,1,4,3,1,2 -200,100001,,202403,3,0.5,3,3,0.5 -200,100001,300,202404,0.5,1,4,, +strata,reference,target,period,forward_imputation_link,backward_imputation_link,imputation_marker,imputation_group,cumulative_forward_imputation_link,cumulative_backward_imputation_link +100,100000,200,202402,1,2,r,1,, +100,100000,,202403,2,0.6,fir,2,2,0.6 +100,100000,,202404,3,1,fir,2,6,1 +200,100001,,202402,1,4,bir,3,1,2 +200,100001,,202403,3,0.5,bir,3,3,0.5 +200,100001,300,202404,0.5,1,r,4,, diff --git a/tests/test_cumulative_imputation_links.py b/tests/test_cumulative_imputation_links.py index bf31094a..f765f61f 100755 --- a/tests/test_cumulative_imputation_links.py +++ b/tests/test_cumulative_imputation_links.py @@ -15,15 +15,16 @@ def cumulative_links_test_data(): class TestComulativeLinks: def test_get_cumulative_links_forward(self, cumulative_links_test_data): input_data = cumulative_links_test_data.drop( - columns=["cumulative_forward_imputation_link", "imputation_group"] - ) - - expected_output = cumulative_links_test_data[ - [ - "imputation_group", + columns=[ + "cumulative_backward_imputation_link", "cumulative_forward_imputation_link", + "imputation_group", ] - ] + ) + + expected_output = cumulative_links_test_data.drop( + columns=["imputation_group", "cumulative_backward_imputation_link"] + ) actual_output = get_cumulative_links( input_data, @@ -36,29 +37,38 @@ def test_get_cumulative_links_forward(self, cumulative_links_test_data): 1, ) + actual_output = actual_output.drop( + columns=["imputation_group", "missing_value"] + ) + assert_frame_equal(actual_output, expected_output) def test_get_cumulative_links_backward(self, cumulative_links_test_data): input_data = cumulative_links_test_data.drop( - columns=["cumulative_backward_imputation_link", "imputation_group"] - ) - - expected_output = cumulative_links_test_data[ - [ - "imputation_group", + columns=[ "cumulative_backward_imputation_link", + "cumulative_forward_imputation_link", + "imputation_group", ] - ] + ) + + expected_output = cumulative_links_test_data.drop( + columns=["imputation_group", "cumulative_backward_imputation_link"] + ) actual_output = get_cumulative_links( input_data, - "b", + "f", "strata", "reference", "target", "period", - "backward_imputation_link", + "forward_imputation_link", 1, ) + actual_output = actual_output.drop( + columns=["imputation_group", "missing_value"] + ) + assert_frame_equal(actual_output, expected_output) From e37d5bc726f83e5afa55145f999018eb2a5e1f59 Mon Sep 17 00:00:00 2001 From: zogkoa Date: Tue, 11 Jun 2024 14:39:47 +0100 Subject: [PATCH 04/10] Fix marker strings The markers in the dictionary must be the same as the ones created from imputation_flags --- src/apply_imputation_link.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/apply_imputation_link.py b/src/apply_imputation_link.py index e04104fb..cda65c16 100755 --- a/src/apply_imputation_link.py +++ b/src/apply_imputation_link.py @@ -63,24 +63,21 @@ def create_and_merge_imputation_values( }, "fir": { "intermediate_column": "fir", - "marker": "FIR", + "marker": "fir", "fill_column": target, "fill_method": "ffill", "link_column": cumulative_forward_link, }, "bir": { "intermediate_column": "bir", - "marker": "BIR", + "marker": "bir", "fill_column": target, "fill_method": "bfill", "link_column": cumulative_backward_link, }, "fic": { - # FIC only works if the C is in the first period of the business being - # sampled. This is fine for automatic imputation, but should be careful - # if manual construction imputation is done "intermediate_column": "fic", - "marker": "FIC", + "marker": "firc", # this has to have the same name as the intermediate column for constructed "fill_column": "constructed", "fill_method": "ffill", From f3c55173ebe245a1a47b0b604814adc2a892339d Mon Sep 17 00:00:00 2001 From: zogkoa Date: Tue, 11 Jun 2024 14:42:27 +0100 Subject: [PATCH 05/10] Fix fill column for fic If we use constructed column this won't allow the forward fill which will then be used to multiple columns, we are using imputed_value which has na if relevant row is not constructed --- src/apply_imputation_link.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/apply_imputation_link.py b/src/apply_imputation_link.py index cda65c16..24d37abd 100755 --- a/src/apply_imputation_link.py +++ b/src/apply_imputation_link.py @@ -78,8 +78,7 @@ def create_and_merge_imputation_values( "fic": { "intermediate_column": "fic", "marker": "firc", - # this has to have the same name as the intermediate column for constructed - "fill_column": "constructed", + "fill_column": "imputed_value", "fill_method": "ffill", "link_column": cumulative_forward_link, }, From 891054c67f6a49c32e20181ea690e88549f4b66a Mon Sep 17 00:00:00 2001 From: zogkoa Date: Tue, 11 Jun 2024 15:17:56 +0100 Subject: [PATCH 06/10] Fix c, fic typos for markers --- src/apply_imputation_link.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/apply_imputation_link.py b/src/apply_imputation_link.py index 24d37abd..06b2568a 100755 --- a/src/apply_imputation_link.py +++ b/src/apply_imputation_link.py @@ -55,7 +55,7 @@ def create_and_merge_imputation_values( imputation_config = { "c": { "intermediate_column": "constructed", - "marker": "C", + "marker": "c", # doesn't actually apply a fill so can be forward or back "fill_column": auxiliary, "fill_method": "ffill", @@ -77,7 +77,7 @@ def create_and_merge_imputation_values( }, "fic": { "intermediate_column": "fic", - "marker": "firc", + "marker": "fic", "fill_column": "imputed_value", "fill_method": "ffill", "link_column": cumulative_forward_link, From 55ffb5a49a1f1169120294e03c81c6251f4e7bd9 Mon Sep 17 00:00:00 2001 From: zogkoa Date: Tue, 11 Jun 2024 15:19:22 +0100 Subject: [PATCH 07/10] Convert markers to lower case to match function call markers --- .../apply_imputation_link/FIR_BIR_C_FIC.csv | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/data/apply_imputation_link/FIR_BIR_C_FIC.csv b/tests/data/apply_imputation_link/FIR_BIR_C_FIC.csv index 91ec36ec..56ac7c04 100755 --- a/tests/data/apply_imputation_link/FIR_BIR_C_FIC.csv +++ b/tests/data/apply_imputation_link/FIR_BIR_C_FIC.csv @@ -1,10 +1,10 @@ imputation_class,reference,target,period,forward_imputation_link,backward_imputation_link,auxiliary_variable,construction_link,cumulative_forward_link,cumulative_backward_link,imputation_marker,imputed_value -100,100000,200,202402,1,2,,,,,R, -100,100000,,202403,2,0.6,,,2,0.6,FIR,400 -100,100000,,202404,3,1,,,6,1,FIR,1200 -200,100001,,202402,1,4,,,1,2,BIR,600 -200,100001,,202403,3,0.5,,,3,0.5,BIR,150 -200,100001,300,202404,0.5,1,,,,,R, -300,100002,,202402,1,4,1000,0.1,,2,C,100 -300,100002,,202403,3,0.5,,,3,0.5,FIC,300 -300,100002,,202404,0.5,1,,,1.5,,FIC,150 +100,100000,200,202402,1,2,,,,,r, +100,100000,,202403,2,0.6,,,2,0.6,fir,400 +100,100000,,202404,3,1,,,6,1,fir,1200 +200,100001,,202402,1,4,,,1,2,bir,600 +200,100001,,202403,3,0.5,,,3,0.5,bir,150 +200,100001,300,202404,0.5,1,,,,,r, +300,100002,,202402,1,4,1000,0.1,,2,c,100 +300,100002,,202403,3,0.5,,,3,0.5,fic,300 +300,100002,,202404,0.5,1,,,1.5,,fic,150 From 89fb75da8724fdc50c4d2024c8b3d2ede4b7f6a0 Mon Sep 17 00:00:00 2001 From: zogkoa Date: Thu, 13 Jun 2024 13:26:59 +0100 Subject: [PATCH 08/10] Fix typos and backward testing --- tests/test_cumulative_imputation_links.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_cumulative_imputation_links.py b/tests/test_cumulative_imputation_links.py index f765f61f..fc758b9d 100755 --- a/tests/test_cumulative_imputation_links.py +++ b/tests/test_cumulative_imputation_links.py @@ -12,7 +12,7 @@ def cumulative_links_test_data(): return load_and_format(Path("tests") / "cumulative_links.csv") -class TestComulativeLinks: +class TestCumulativeLinks: def test_get_cumulative_links_forward(self, cumulative_links_test_data): input_data = cumulative_links_test_data.drop( columns=[ @@ -53,17 +53,17 @@ def test_get_cumulative_links_backward(self, cumulative_links_test_data): ) expected_output = cumulative_links_test_data.drop( - columns=["imputation_group", "cumulative_backward_imputation_link"] + columns=["imputation_group", "cumulative_forward_imputation_link"] ) actual_output = get_cumulative_links( input_data, - "f", + "b", "strata", "reference", "target", "period", - "forward_imputation_link", + "backward_imputation_link", 1, ) From 83996b6b900ef4ab0c7b75fb4ffaf15e4a911571 Mon Sep 17 00:00:00 2001 From: zogkoa Date: Thu, 13 Jun 2024 13:28:42 +0100 Subject: [PATCH 09/10] Define conditions at the beginning --- src/cumulative_imputation_links.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/src/cumulative_imputation_links.py b/src/cumulative_imputation_links.py index 92f1709f..b10e4bc2 100755 --- a/src/cumulative_imputation_links.py +++ b/src/cumulative_imputation_links.py @@ -44,19 +44,20 @@ def get_cumulative_links( dataframe.sort_values([strata, reference, period], inplace=True) dataframe["missing_value"] = np.where(dataframe[target].isnull(), True, False) + # TODO: These conditions are similar with the ones at flags, consider a fun for this + marker_diff_con = ( + dataframe["imputation_marker"] + .ne(dataframe["imputation_marker"].shift().bfill()) + .astype(int) + != 0 + ) + + strat_diff_con = dataframe[strata].diff(time_difference) != 0 + + reference_diff_con = dataframe[reference].diff(time_difference) != 0 + dataframe["imputation_group"] = ( - ( - ( - dataframe["imputation_marker"] - .ne(dataframe["imputation_marker"].shift().bfill()) - .astype(int) - != 0 - ) - | (dataframe[strata].diff(time_difference) != 0) - | (dataframe[reference].diff(time_difference) != 0) - ) - .astype("int") - .cumsum() + (marker_diff_con | strat_diff_con | reference_diff_con).astype("int").cumsum() ) if forward_or_backward == "f": From db9365352302e54476bdc563cc761bf9f65e1f94 Mon Sep 17 00:00:00 2001 From: zogkoa Date: Thu, 27 Jun 2024 15:04:25 +0100 Subject: [PATCH 10/10] Add back fic code comments --- src/apply_imputation_link.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/apply_imputation_link.py b/src/apply_imputation_link.py index 06b2568a..d1f6bb6d 100755 --- a/src/apply_imputation_link.py +++ b/src/apply_imputation_link.py @@ -76,6 +76,9 @@ def create_and_merge_imputation_values( "link_column": cumulative_backward_link, }, "fic": { + # FIC only works if the C is in the first period of the business being + # sampled. This is fine for automatic imputation, but should be careful + # if manual construction imputation is done "intermediate_column": "fic", "marker": "fic", "fill_column": "imputed_value",