From f7ba3968b7037622bc267bdcd89620eec99f3144 Mon Sep 17 00:00:00 2001 From: zogkoa Date: Mon, 13 May 2024 17:21:07 +0100 Subject: [PATCH 01/12] Add function for forward, backward link --- src/forward_link.py | 105 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 105 insertions(+) create mode 100644 src/forward_link.py diff --git a/src/forward_link.py b/src/forward_link.py new file mode 100644 index 00000000..7b648963 --- /dev/null +++ b/src/forward_link.py @@ -0,0 +1,105 @@ +from typing import List + +import numpy as np +import pandas as pd + + +def zerofy_values( + df: pd.DataFrame, target_variable: List[str] or str, expr: str +) -> pd.DataFrame: + """Convert values in a dataframe column to 0 based on a python expression + + Parameters + ---------- + df : pd.Dataframe + Pandas dataframe of original data. + target_variable : List[str] or str + Column name(s) containing target variable(s). + query : str + The expression to evaluate, see here: + https://pandas.pydata.org/pandas-docs/version/1.5/reference/api/pandas.eval.html + + Returns + ------- + df : pd.Dataframe + + + """ + + try: + df.loc[~(df.eval(expr)), target_variable] = 0 + + except ValueError: + print( + f"""{expr} is not a valid expression, + the code uses ~(df.eval({expr}) to mask the dataframe, please see here: + https://pandas.pydata.org/pandas-docs/version/1.5/reference/api/pandas.eval.html + """ + ) + + +def get_link( + df: pd.DataFrame, + groups: List[str] or str, + match_col: str, + target_variable: str, + predictive_variable: str, + filter_cond: str = None, +) -> pd.DataFrame: + """ + Calculate link between target_variable and predictive_variable by given groups, + a match_col must be supplied which indicates if target_variable and + predictive_variable can be linked. If an optional filter_cond is given + it excludes them when calculating the links. + + Parameters + ---------- + df : pd.Dataframe + Original dataframe. + groups : List[str] or str + Column name(s) to calculate the sums. + match_col : str + Column of the matched pair links, this column should be bool, + or 0 and 1. + target_variable : str + Column name of the targeted variable. + predictive_variable : str + Column name of the predicted target variable. + filter_cond : str, optional + Expression to exclude specific values from the links. + The default is None. + + Returns + ------- + link : pd.Series + A pandas series with the links. + """ + + df_intermediate = df.copy() + + # If condition supplied exclude filtered values from links + if filter_cond is not None: + + df_intermediate.zerofy_values( + [target_variable, predictive_variable], filter_cond + ) + + df_intermediate[target_variable] = ( + df_intermediate[target_variable] * df_intermediate[match_col] + ) + + df_intermediate[predictive_variable] = ( + df_intermediate[predictive_variable] * df_intermediate[match_col] + ) + + numerator = df_intermediate.groupby(groups)[target_variable].transform("sum") + + denominator = df_intermediate.groupby(groups)[predictive_variable].transform("sum") + + denominator.replace(0, np.nan, inplace=True) # cover division with 0 + + link = numerator / denominator + + link.replace(np.nan, 1, inplace=True) # set defaults + + return link From dd2b3024112a1850bd2ebf4b35dd303e2fbede4b Mon Sep 17 00:00:00 2001 From: zogkoa Date: Wed, 15 May 2024 13:53:07 +0100 Subject: [PATCH 02/12] Add unit tests for link filters --- tests/test_forward_link.py | 144 +++++++++++++++++++++++++++++++++++++ 1 file changed, 144 insertions(+) create mode 100644 tests/test_forward_link.py diff --git a/tests/test_forward_link.py b/tests/test_forward_link.py new file mode 100644 index 00000000..a7b4009e --- /dev/null +++ b/tests/test_forward_link.py @@ -0,0 +1,144 @@ +import numpy as np +import pandas as pd +from pandas.testing import assert_frame_equal + +from src.forward_link import zerofy_values + + +class TestFilters: + # based on 02_C_FI_input + df = pd.DataFrame( + data={ + "identifier": [20001, 20001, 20002, 20002, 20003, 20003, 20004, 20004], + "date": [202001, 202002, 202001, 202002, 202001, 202002, 202001, 202002], + "group": [100, 100, 100, 100, 100, 100, 100, 100], + "question": [2536.0, 8283.0, 9113.0, 2970.0, 5644.0, 989.0, np.nan, np.nan], + "other": [35, 35, 72, 72, 77, 77, 30, 30], + } + ) + + def test_basic_filter(self): + """Test a basic filter, filters questions with identifier different to 20001""" + + expected = pd.DataFrame( + data={ + "identifier": [20001, 20001, 20002, 20002, 20003, 20003, 20004, 20004], + "date": [ + 202001, + 202002, + 202001, + 202002, + 202001, + 202002, + 202001, + 202002, + ], + "group": [100, 100, 100, 100, 100, 100, 100, 100], + "question": [0, 0, 9113.0, 2970.0, 5644.0, 989.0, np.nan, np.nan], + "other": [35, 35, 72, 72, 77, 77, 30, 30], + } + ) + + link_filter = "identifier != '20001'" + + df_copy = self.df.copy() + + zerofy_values(df_copy, "question", link_filter) + + assert_frame_equal(df_copy, expected) + + def test_basic_multiple_columns(self): + """Test a basic filter in more than 1 column""" + + expected = pd.DataFrame( + data={ + "identifier": [20001, 20001, 20002, 20002, 20003, 20003, 20004, 20004], + "date": [ + 202001, + 202002, + 202001, + 202002, + 202001, + 202002, + 202001, + 202002, + ], + "group": [100, 100, 100, 100, 100, 100, 100, 100], + "question": [0, 0, 9113.0, 2970.0, 5644.0, 989.0, np.nan, np.nan], + "other": [0, 0, 72, 72, 77, 77, 30, 30], + } + ) + + link_filter = "identifier != '20001'" + + df_copy = self.df.copy() + + zerofy_values(df_copy, ["question", "other"], link_filter) + + assert_frame_equal(df_copy, expected) + + def test_basic_multiple_values(self): + """ + Test a filter in multiple values, filters questions which aren't + in ('20001', '20002') + """ + + expected = pd.DataFrame( + data={ + "identifier": [20001, 20001, 20002, 20002, 20003, 20003, 20004, 20004], + "date": [ + 202001, + 202002, + 202001, + 202002, + 202001, + 202002, + 202001, + 202002, + ], + "group": [100, 100, 100, 100, 100, 100, 100, 100], + "question": [0, 0, 0, 0, 5644.0, 989.0, np.nan, np.nan], + "other": [35, 35, 72, 72, 77, 77, 30, 30], + } + ) + + link_filter = "identifier not in ('20001', '20002')" + + df_copy = self.df.copy() + + zerofy_values(df_copy, "question", link_filter) + + assert_frame_equal(df_copy, expected) + + def test_multiple_filters(self): + """ + Test multiple conditions, filters questions which aren't in date 202001 + and identifier in 20001 in the same time + """ + + expected = pd.DataFrame( + data={ + "identifier": [20001, 20001, 20002, 20002, 20003, 20003, 20004, 20004], + "date": [ + 202001, + 202002, + 202001, + 202002, + 202001, + 202002, + 202001, + 202002, + ], + "group": [100, 100, 100, 100, 100, 100, 100, 100], + "question": [0, 8283.0, 9113.0, 2970.0, 5644.0, 989.0, np.nan, np.nan], + "other": [35, 35, 72, 72, 77, 77, 30, 30], + } + ) + + link_filter = "not(date == '202001' and identifier in ('20001'))" + + df_copy = self.df.copy() + + zerofy_values(df_copy, "question", link_filter) + + assert_frame_equal(df_copy, expected) From 3562d2f5886fb63f70ca7e0c96ae808d9f17ccc9 Mon Sep 17 00:00:00 2001 From: zogkoa Date: Wed, 15 May 2024 13:55:55 +0100 Subject: [PATCH 03/12] Add unit tests for get_link function --- tests/test_forward_link.py | 198 ++++++++++++++++++++++++++++++++++++- 1 file changed, 196 insertions(+), 2 deletions(-) diff --git a/tests/test_forward_link.py b/tests/test_forward_link.py index a7b4009e..bcf240da 100644 --- a/tests/test_forward_link.py +++ b/tests/test_forward_link.py @@ -1,8 +1,8 @@ import numpy as np import pandas as pd -from pandas.testing import assert_frame_equal +from pandas.testing import assert_frame_equal, assert_series_equal -from src.forward_link import zerofy_values +from src.forward_link import get_link, zerofy_values class TestFilters: @@ -142,3 +142,197 @@ def test_multiple_filters(self): zerofy_values(df_copy, "question", link_filter) assert_frame_equal(df_copy, expected) + + +class TestLink: + + # from scenario 33_multi_variable_C_BI_R + # We could parametrise this with more scenarios if needed + df = pd.DataFrame( + data={ + "identifier": [ + 10001, + 10001, + 10001, + 10002, + 10002, + 10002, + 10001, + 10001, + 10001, + 10002, + 10002, + 10002, + 10005, + 10005, + 10005, + ], + "date": [ + 202001, + 202002, + 202003, + 202001, + 202002, + 202003, + 202001, + 202002, + 202003, + 202001, + 202002, + 202003, + 202001, + 202002, + 202003, + ], + "group": [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2], + "question": [ + 547.0, + 362.0, + 895.0, + 381.0, + 573.0, + 214.0, + 961.0, + 267.0, + 314.0, + 555.0, + 628.0, + 736.0, + np.nan, + np.nan, + 100.0, + ], + "f_predictive_question": [ + np.nan, + 547.0, + 362.0, + np.nan, + 381.0, + 573.0, + np.nan, + 961.0, + 267.0, + np.nan, + 555.0, + 628.0, + np.nan, + np.nan, + np.nan, + ], + "b_predictive_question": [ + 362.0, + 895.0, + np.nan, + 573.0, + 214.0, + np.nan, + 267.0, + 314.0, + np.nan, + 628.0, + 736.0, + np.nan, + np.nan, + 100.0, + np.nan, + ], + "f_matched_pair": [ + False, + True, + True, + False, + True, + True, + False, + True, + True, + False, + True, + True, + False, + False, + False, + ], + "b_matched_pair": [ + True, + True, + False, + True, + True, + False, + True, + True, + False, + True, + True, + False, + False, + False, + False, + ], + } + ) + + def test_forward_link(self): + + expected_f_link = pd.Series( + [ + 1.0, + 1.0075431034482758, + 1.186096256684492, + 1.0, + 1.0075431034482758, + 1.186096256684492, + 1.0, + 0.5903693931398417, + 1.1731843575418994, + 1.0, + 0.5903693931398417, + 1.1731843575418994, + 1.0, + 0.5903693931398417, + 1.1731843575418994, + ] + ) + + f_link = get_link( + self.df, + ["group", "date"], + "f_matched_pair", + "question", + "f_predictive_question", + ) + + assert_series_equal(f_link, expected_f_link) + + def test_backward_link(self): + + expected_b_link = pd.Series( + [ + 0.9925133689839573, + 0.8431018935978359, + 1.0, + 0.9925133689839573, + 0.8431018935978359, + 1.0, + 1.693854748603352, + 0.8523809523809524, + 1.0, + 1.693854748603352, + 0.8523809523809524, + 1.0, + 0.9925133689839573, + 0.8523809523809524, + 1.0, + ] + ) + + b_link = get_link( + self.df, + ["group", "date"], + "b_matched_pair", + "question", + "b_predictive_question", + ) + + assert_series_equal(b_link, expected_b_link) From d31535fa597d8e61fb4f6670cf7ee9afb3c61e24 Mon Sep 17 00:00:00 2001 From: zogkoa Date: Thu, 16 May 2024 11:35:41 +0100 Subject: [PATCH 04/12] Rename zerofy_values function to mask_values --- src/forward_link.py | 10 ++++++---- tests/test_forward_link.py | 10 +++++----- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/src/forward_link.py b/src/forward_link.py index 7b648963..3926abdb 100644 --- a/src/forward_link.py +++ b/src/forward_link.py @@ -2,9 +2,10 @@ import numpy as np import pandas as pd +from pandas.core.base import PandasObject -def zerofy_values( +def mask_values( df: pd.DataFrame, target_variable: List[str] or str, expr: str ) -> pd.DataFrame: """Convert values in a dataframe column to 0 based on a python expression @@ -38,6 +39,9 @@ def zerofy_values( ) +PandasObject.mask_values = mask_values + + def get_link( df: pd.DataFrame, groups: List[str] or str, @@ -80,9 +84,7 @@ def get_link( # If condition supplied exclude filtered values from links if filter_cond is not None: - df_intermediate.zerofy_values( - [target_variable, predictive_variable], filter_cond - ) + df_intermediate.mask_values([target_variable, predictive_variable], filter_cond) df_intermediate[target_variable] = ( df_intermediate[target_variable] * df_intermediate[match_col] diff --git a/tests/test_forward_link.py b/tests/test_forward_link.py index bcf240da..80f35ef9 100644 --- a/tests/test_forward_link.py +++ b/tests/test_forward_link.py @@ -2,7 +2,7 @@ import pandas as pd from pandas.testing import assert_frame_equal, assert_series_equal -from src.forward_link import get_link, zerofy_values +from src.forward_link import get_link, mask_values class TestFilters: @@ -43,7 +43,7 @@ def test_basic_filter(self): df_copy = self.df.copy() - zerofy_values(df_copy, "question", link_filter) + mask_values(df_copy, "question", link_filter) assert_frame_equal(df_copy, expected) @@ -73,7 +73,7 @@ def test_basic_multiple_columns(self): df_copy = self.df.copy() - zerofy_values(df_copy, ["question", "other"], link_filter) + mask_values(df_copy, ["question", "other"], link_filter) assert_frame_equal(df_copy, expected) @@ -106,7 +106,7 @@ def test_basic_multiple_values(self): df_copy = self.df.copy() - zerofy_values(df_copy, "question", link_filter) + mask_values(df_copy, "question", link_filter) assert_frame_equal(df_copy, expected) @@ -139,7 +139,7 @@ def test_multiple_filters(self): df_copy = self.df.copy() - zerofy_values(df_copy, "question", link_filter) + mask_values(df_copy, "question", link_filter) assert_frame_equal(df_copy, expected) From 3509145dddc391e1ce512f214572aa6a8b60e335 Mon Sep 17 00:00:00 2001 From: zogkoa Date: Thu, 16 May 2024 11:43:47 +0100 Subject: [PATCH 05/12] Rename get_link function to calculate_imputation_link --- src/forward_link.py | 2 +- tests/test_forward_link.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/forward_link.py b/src/forward_link.py index 3926abdb..1cd6aca3 100644 --- a/src/forward_link.py +++ b/src/forward_link.py @@ -42,7 +42,7 @@ def mask_values( PandasObject.mask_values = mask_values -def get_link( +def calculate_imputation_link( df: pd.DataFrame, groups: List[str] or str, match_col: str, diff --git a/tests/test_forward_link.py b/tests/test_forward_link.py index 80f35ef9..583b5485 100644 --- a/tests/test_forward_link.py +++ b/tests/test_forward_link.py @@ -2,7 +2,7 @@ import pandas as pd from pandas.testing import assert_frame_equal, assert_series_equal -from src.forward_link import get_link, mask_values +from src.forward_link import calculate_imputation_link, mask_values class TestFilters: @@ -295,7 +295,7 @@ def test_forward_link(self): ] ) - f_link = get_link( + f_link = calculate_imputation_link( self.df, ["group", "date"], "f_matched_pair", @@ -327,7 +327,7 @@ def test_backward_link(self): ] ) - b_link = get_link( + b_link = calculate_imputation_link( self.df, ["group", "date"], "b_matched_pair", From a3067c1e5faa69548dc859d98dd67d1cc0269010 Mon Sep 17 00:00:00 2001 From: zogkoa Date: Fri, 17 May 2024 10:40:54 +0100 Subject: [PATCH 06/12] Update mask_values to return a series --- src/forward_link.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/forward_link.py b/src/forward_link.py index 1cd6aca3..30e99fb4 100644 --- a/src/forward_link.py +++ b/src/forward_link.py @@ -2,12 +2,9 @@ import numpy as np import pandas as pd -from pandas.core.base import PandasObject -def mask_values( - df: pd.DataFrame, target_variable: List[str] or str, expr: str -) -> pd.DataFrame: +def mask_values(df: pd.DataFrame, target_variable: str, expr: str) -> pd.Series: """Convert values in a dataframe column to 0 based on a python expression Parameters @@ -22,13 +19,14 @@ def mask_values( Returns ------- - df : pd.Dataframe + df : pd.Series """ + masked_column = df[target_variable].copy() try: - df.loc[~(df.eval(expr)), target_variable] = 0 + masked_column.loc[~(df.eval(expr))] = np.nan except ValueError: print( @@ -38,8 +36,7 @@ def mask_values( """ ) - -PandasObject.mask_values = mask_values + return masked_column def calculate_imputation_link( From 4ddd9319b08fafa10ac1cc180f7bab6f996a71ec Mon Sep 17 00:00:00 2001 From: zogkoa Date: Fri, 17 May 2024 10:44:30 +0100 Subject: [PATCH 07/12] Remove mask_values from calculate_links function --- src/forward_link.py | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/src/forward_link.py b/src/forward_link.py index 30e99fb4..14870292 100644 --- a/src/forward_link.py +++ b/src/forward_link.py @@ -45,13 +45,11 @@ def calculate_imputation_link( match_col: str, target_variable: str, predictive_variable: str, - filter_cond: str = None, -) -> pd.DataFrame: +) -> pd.Series: """ Calculate link between target_variable and predictive_variable by given groups, a match_col must be supplied which indicates if target_variable and - predictive_variable can be linked. If an optional filter_cond is given - it excludes them when calculating the links. + predictive_variable can be linked. Parameters ---------- @@ -66,9 +64,6 @@ def calculate_imputation_link( Column name of the targeted variable. predictive_variable : str Column name of the predicted target variable. - filter_cond : str, optional - Expression to exclude specific values from the links. - The default is None. Returns ------- @@ -78,11 +73,6 @@ def calculate_imputation_link( df_intermediate = df.copy() - # If condition supplied exclude filtered values from links - if filter_cond is not None: - - df_intermediate.mask_values([target_variable, predictive_variable], filter_cond) - df_intermediate[target_variable] = ( df_intermediate[target_variable] * df_intermediate[match_col] ) @@ -99,6 +89,4 @@ def calculate_imputation_link( link = numerator / denominator - link.replace(np.nan, 1, inplace=True) # set defaults - return link From 22fa19e8fd566a5f23c6e50c4d922ea973a71978 Mon Sep 17 00:00:00 2001 From: zogkoa Date: Fri, 17 May 2024 13:52:16 +0100 Subject: [PATCH 08/12] Add test data for calculate_links --- tests/calculate_links_test_data.csv | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100755 tests/calculate_links_test_data.csv diff --git a/tests/calculate_links_test_data.csv b/tests/calculate_links_test_data.csv new file mode 100755 index 00000000..72e6408d --- /dev/null +++ b/tests/calculate_links_test_data.csv @@ -0,0 +1,16 @@ +,identifier,period,group,question,f_predictive_question,b_predictive_question,f_matched_pair,b_matched_pair,f_link,b_link +0,10001,202001,1,547.0,,362.0,False,True,,0.9925133689839573 +1,10001,202002,1,362.0,547.0,895.0,True,True,1.0075431034482758,0.8431018935978359 +2,10001,202003,1,895.0,362.0,,True,False,1.186096256684492, +3,10002,202001,1,381.0,,573.0,False,True,,0.9925133689839573 +4,10002,202002,1,573.0,381.0,214.0,True,True,1.0075431034482758,0.8431018935978359 +5,10002,202003,1,214.0,573.0,,True,False,1.186096256684492, +6,10001,202001,2,961.0,,267.0,False,True,,1.693854748603352 +7,10001,202002,2,267.0,961.0,314.0,True,True,0.5903693931398417,0.8523809523809524 +8,10001,202003,2,314.0,267.0,,True,False,1.1731843575418994, +9,10002,202001,2,555.0,,628.0,False,True,,1.693854748603352 +10,10002,202002,2,628.0,555.0,736.0,True,True,0.5903693931398417,0.8523809523809524 +11,10002,202003,2,736.0,628.0,,True,False,1.1731843575418994, +12,10005,202001,1,,,,False,False,,0.9925133689839573 +13,10005,202002,2,,,100.0,False,False,0.5903693931398417,0.8523809523809524 +14,10005,202003,2,100.0,,,False,False,1.1731843575418994, From b2b91e39aac95e3f4b9df52abb3748e1e0a57555 Mon Sep 17 00:00:00 2001 From: zogkoa Date: Fri, 17 May 2024 13:53:19 +0100 Subject: [PATCH 09/12] Adapt tests for calculate_links with test data --- tests/test_forward_link.py | 204 +++++-------------------------------- 1 file changed, 26 insertions(+), 178 deletions(-) diff --git a/tests/test_forward_link.py b/tests/test_forward_link.py index 583b5485..d1c0f6cf 100644 --- a/tests/test_forward_link.py +++ b/tests/test_forward_link.py @@ -1,5 +1,7 @@ import numpy as np import pandas as pd +import pytest +from helper_functions import load_and_format from pandas.testing import assert_frame_equal, assert_series_equal from src.forward_link import calculate_imputation_link, mask_values @@ -144,195 +146,41 @@ def test_multiple_filters(self): assert_frame_equal(df_copy, expected) -class TestLink: +scenarios = ["calculate_links_test_data"] - # from scenario 33_multi_variable_C_BI_R - # We could parametrise this with more scenarios if needed - df = pd.DataFrame( - data={ - "identifier": [ - 10001, - 10001, - 10001, - 10002, - 10002, - 10002, - 10001, - 10001, - 10001, - 10002, - 10002, - 10002, - 10005, - 10005, - 10005, - ], - "date": [ - 202001, - 202002, - 202003, - 202001, - 202002, - 202003, - 202001, - 202002, - 202003, - 202001, - 202002, - 202003, - 202001, - 202002, - 202003, - ], - "group": [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2], - "question": [ - 547.0, - 362.0, - 895.0, - 381.0, - 573.0, - 214.0, - 961.0, - 267.0, - 314.0, - 555.0, - 628.0, - 736.0, - np.nan, - np.nan, - 100.0, - ], - "f_predictive_question": [ - np.nan, - 547.0, - 362.0, - np.nan, - 381.0, - 573.0, - np.nan, - 961.0, - 267.0, - np.nan, - 555.0, - 628.0, - np.nan, - np.nan, - np.nan, - ], - "b_predictive_question": [ - 362.0, - 895.0, - np.nan, - 573.0, - 214.0, - np.nan, - 267.0, - 314.0, - np.nan, - 628.0, - 736.0, - np.nan, - np.nan, - 100.0, - np.nan, - ], - "f_matched_pair": [ - False, - True, - True, - False, - True, - True, - False, - True, - True, - False, - True, - True, - False, - False, - False, - ], - "b_matched_pair": [ - True, - True, - False, - True, - True, - False, - True, - True, - False, - True, - True, - False, - False, - False, - False, - ], - } - ) - def test_forward_link(self): - - expected_f_link = pd.Series( - [ - 1.0, - 1.0075431034482758, - 1.186096256684492, - 1.0, - 1.0075431034482758, - 1.186096256684492, - 1.0, - 0.5903693931398417, - 1.1731843575418994, - 1.0, - 0.5903693931398417, - 1.1731843575418994, - 1.0, - 0.5903693931398417, - 1.1731843575418994, - ] - ) +@pytest.mark.parametrize("scenario", scenarios) +class TestLinks: + def test_forward_links(self, scenario): + """Test if function returns the f_link column""" + + df_input = load_and_format("tests/" + scenario + ".csv") - f_link = calculate_imputation_link( - self.df, - ["group", "date"], + expected_link = df_input["f_link"] + + link_to_test = calculate_imputation_link( + df_input, + ["group", "period"], "f_matched_pair", "question", "f_predictive_question", ) - assert_series_equal(f_link, expected_f_link) - - def test_backward_link(self): - - expected_b_link = pd.Series( - [ - 0.9925133689839573, - 0.8431018935978359, - 1.0, - 0.9925133689839573, - 0.8431018935978359, - 1.0, - 1.693854748603352, - 0.8523809523809524, - 1.0, - 1.693854748603352, - 0.8523809523809524, - 1.0, - 0.9925133689839573, - 0.8523809523809524, - 1.0, - ] - ) + assert_series_equal(link_to_test, expected_link, check_names=False) + + def test_back_links(self, scenario): + """Test if function returns the b_link column""" + + df_input = load_and_format("tests/" + scenario + ".csv") + + expected_link = df_input["b_link"] - b_link = calculate_imputation_link( - self.df, - ["group", "date"], + link_to_test = calculate_imputation_link( + df_input, + ["group", "period"], "b_matched_pair", "question", "b_predictive_question", ) - assert_series_equal(b_link, expected_b_link) + assert_series_equal(link_to_test, expected_link, check_names=False) From 4bc39c4ff6a52b70d058e0792a1a03f837cd1750 Mon Sep 17 00:00:00 2001 From: zogkoa Date: Fri, 17 May 2024 17:09:31 +0100 Subject: [PATCH 10/12] Remove mask_values unit tests --- tests/test_forward_link.py | 146 +------------------------------------ 1 file changed, 2 insertions(+), 144 deletions(-) diff --git a/tests/test_forward_link.py b/tests/test_forward_link.py index d1c0f6cf..74e32005 100644 --- a/tests/test_forward_link.py +++ b/tests/test_forward_link.py @@ -1,150 +1,8 @@ -import numpy as np -import pandas as pd import pytest from helper_functions import load_and_format -from pandas.testing import assert_frame_equal, assert_series_equal - -from src.forward_link import calculate_imputation_link, mask_values - - -class TestFilters: - # based on 02_C_FI_input - df = pd.DataFrame( - data={ - "identifier": [20001, 20001, 20002, 20002, 20003, 20003, 20004, 20004], - "date": [202001, 202002, 202001, 202002, 202001, 202002, 202001, 202002], - "group": [100, 100, 100, 100, 100, 100, 100, 100], - "question": [2536.0, 8283.0, 9113.0, 2970.0, 5644.0, 989.0, np.nan, np.nan], - "other": [35, 35, 72, 72, 77, 77, 30, 30], - } - ) - - def test_basic_filter(self): - """Test a basic filter, filters questions with identifier different to 20001""" - - expected = pd.DataFrame( - data={ - "identifier": [20001, 20001, 20002, 20002, 20003, 20003, 20004, 20004], - "date": [ - 202001, - 202002, - 202001, - 202002, - 202001, - 202002, - 202001, - 202002, - ], - "group": [100, 100, 100, 100, 100, 100, 100, 100], - "question": [0, 0, 9113.0, 2970.0, 5644.0, 989.0, np.nan, np.nan], - "other": [35, 35, 72, 72, 77, 77, 30, 30], - } - ) - - link_filter = "identifier != '20001'" - - df_copy = self.df.copy() - - mask_values(df_copy, "question", link_filter) - - assert_frame_equal(df_copy, expected) - - def test_basic_multiple_columns(self): - """Test a basic filter in more than 1 column""" - - expected = pd.DataFrame( - data={ - "identifier": [20001, 20001, 20002, 20002, 20003, 20003, 20004, 20004], - "date": [ - 202001, - 202002, - 202001, - 202002, - 202001, - 202002, - 202001, - 202002, - ], - "group": [100, 100, 100, 100, 100, 100, 100, 100], - "question": [0, 0, 9113.0, 2970.0, 5644.0, 989.0, np.nan, np.nan], - "other": [0, 0, 72, 72, 77, 77, 30, 30], - } - ) - - link_filter = "identifier != '20001'" - - df_copy = self.df.copy() - - mask_values(df_copy, ["question", "other"], link_filter) - - assert_frame_equal(df_copy, expected) - - def test_basic_multiple_values(self): - """ - Test a filter in multiple values, filters questions which aren't - in ('20001', '20002') - """ - - expected = pd.DataFrame( - data={ - "identifier": [20001, 20001, 20002, 20002, 20003, 20003, 20004, 20004], - "date": [ - 202001, - 202002, - 202001, - 202002, - 202001, - 202002, - 202001, - 202002, - ], - "group": [100, 100, 100, 100, 100, 100, 100, 100], - "question": [0, 0, 0, 0, 5644.0, 989.0, np.nan, np.nan], - "other": [35, 35, 72, 72, 77, 77, 30, 30], - } - ) - - link_filter = "identifier not in ('20001', '20002')" - - df_copy = self.df.copy() - - mask_values(df_copy, "question", link_filter) - - assert_frame_equal(df_copy, expected) - - def test_multiple_filters(self): - """ - Test multiple conditions, filters questions which aren't in date 202001 - and identifier in 20001 in the same time - """ - - expected = pd.DataFrame( - data={ - "identifier": [20001, 20001, 20002, 20002, 20003, 20003, 20004, 20004], - "date": [ - 202001, - 202002, - 202001, - 202002, - 202001, - 202002, - 202001, - 202002, - ], - "group": [100, 100, 100, 100, 100, 100, 100, 100], - "question": [0, 8283.0, 9113.0, 2970.0, 5644.0, 989.0, np.nan, np.nan], - "other": [35, 35, 72, 72, 77, 77, 30, 30], - } - ) - - link_filter = "not(date == '202001' and identifier in ('20001'))" - - df_copy = self.df.copy() - - mask_values(df_copy, "question", link_filter) - - assert_frame_equal(df_copy, expected) +from pandas.testing import assert_series_equal +from src.forward_link import calculate_imputation_link scenarios = ["calculate_links_test_data"] From 761c28331d921fec6bcf7a00681b43c5b6e9d0d1 Mon Sep 17 00:00:00 2001 From: zogkoa Date: Tue, 21 May 2024 11:15:14 +0100 Subject: [PATCH 11/12] Define strata and period as seperate inputs --- src/forward_link.py | 63 ++++++++++---------------------------- tests/test_forward_link.py | 6 ++-- 2 files changed, 20 insertions(+), 49 deletions(-) diff --git a/src/forward_link.py b/src/forward_link.py index 14870292..f58e5512 100644 --- a/src/forward_link.py +++ b/src/forward_link.py @@ -1,65 +1,30 @@ -from typing import List - import numpy as np import pandas as pd -def mask_values(df: pd.DataFrame, target_variable: str, expr: str) -> pd.Series: - """Convert values in a dataframe column to 0 based on a python expression - - Parameters - ---------- - df : pd.Dataframe - Pandas dataframe of original data. - target_variable : List[str] or str - Column name(s) containing target variable(s). - query : str - The expression to evaluate, see here: - https://pandas.pydata.org/pandas-docs/version/1.5/reference/api/pandas.eval.html - - Returns - ------- - df : pd.Series - - - """ - masked_column = df[target_variable].copy() - - try: - masked_column.loc[~(df.eval(expr))] = np.nan - - except ValueError: - print( - f"""{expr} is not a valid expression, - the code uses ~(df.eval({expr}) to mask the dataframe, please see here: - https://pandas.pydata.org/pandas-docs/version/1.5/reference/api/pandas.eval.html - """ - ) - - return masked_column - - def calculate_imputation_link( df: pd.DataFrame, - groups: List[str] or str, + period: str, + strata: str, match_col: str, target_variable: str, predictive_variable: str, ) -> pd.Series: """ - Calculate link between target_variable and predictive_variable by given groups, - a match_col must be supplied which indicates if target_variable and - predictive_variable can be linked. + Calculate link between target_variable and predictive_variable by strata, + a match_col must be supplied which indicates if target_variable + and predictive_variable can be linked. Parameters ---------- df : pd.Dataframe Original dataframe. - groups : List[str] or str - Column name(s) to calculate the sums. + period : str + Column name containing time period. + strata : str + Column name containing strata information (sic). match_col : str - Column of the matched pair links, this column should be bool, - or 0 and 1. + Column name of the matched pair links, this column should be bool. target_variable : str Column name of the targeted variable. predictive_variable : str @@ -81,9 +46,13 @@ def calculate_imputation_link( df_intermediate[predictive_variable] * df_intermediate[match_col] ) - numerator = df_intermediate.groupby(groups)[target_variable].transform("sum") + numerator = df_intermediate.groupby([strata, period])[target_variable].transform( + "sum" + ) - denominator = df_intermediate.groupby(groups)[predictive_variable].transform("sum") + denominator = df_intermediate.groupby([strata, period])[ + predictive_variable + ].transform("sum") denominator.replace(0, np.nan, inplace=True) # cover division with 0 diff --git a/tests/test_forward_link.py b/tests/test_forward_link.py index 74e32005..8012d001 100644 --- a/tests/test_forward_link.py +++ b/tests/test_forward_link.py @@ -18,7 +18,8 @@ def test_forward_links(self, scenario): link_to_test = calculate_imputation_link( df_input, - ["group", "period"], + "period", + "group", "f_matched_pair", "question", "f_predictive_question", @@ -35,7 +36,8 @@ def test_back_links(self, scenario): link_to_test = calculate_imputation_link( df_input, - ["group", "period"], + "period", + "group", "b_matched_pair", "question", "b_predictive_question", From 1eb616ce4b5e435ddfa8b21607d98a3939eac1d5 Mon Sep 17 00:00:00 2001 From: zogkoa Date: Tue, 21 May 2024 15:42:04 +0100 Subject: [PATCH 12/12] Change return type to dataframe, add exceptions too --- src/forward_link.py | 25 +++++++++++++++---- tests/test_forward_link.py | 49 ++++++++++++++++++++++++++++++-------- 2 files changed, 59 insertions(+), 15 deletions(-) diff --git a/src/forward_link.py b/src/forward_link.py index f58e5512..1ac97429 100644 --- a/src/forward_link.py +++ b/src/forward_link.py @@ -9,7 +9,7 @@ def calculate_imputation_link( match_col: str, target_variable: str, predictive_variable: str, -) -> pd.Series: +) -> pd.DataFrame: """ Calculate link between target_variable and predictive_variable by strata, a match_col must be supplied which indicates if target_variable @@ -32,12 +32,27 @@ def calculate_imputation_link( Returns ------- - link : pd.Series - A pandas series with the links. + df : pd.DataFrame + A pandas DataFrame with a new column containing either f_link or b_link + based on the input parameters. """ df_intermediate = df.copy() + if match_col == "f_matched_pair" and predictive_variable == "f_predictive_question": + link_col_name = "f_link" + + elif ( + match_col == "b_matched_pair" and predictive_variable == "b_predictive_question" + ): + link_col_name = "b_link" + + else: + raise ValueError( + f""" + {match_col} and {predictive_variable} do not have same wildcard.""" + ) + df_intermediate[target_variable] = ( df_intermediate[target_variable] * df_intermediate[match_col] ) @@ -56,6 +71,6 @@ def calculate_imputation_link( denominator.replace(0, np.nan, inplace=True) # cover division with 0 - link = numerator / denominator + df[link_col_name] = numerator / denominator - return link + return df diff --git a/tests/test_forward_link.py b/tests/test_forward_link.py index 8012d001..51fa63c8 100644 --- a/tests/test_forward_link.py +++ b/tests/test_forward_link.py @@ -1,6 +1,6 @@ import pytest from helper_functions import load_and_format -from pandas.testing import assert_series_equal +from pandas.testing import assert_frame_equal from src.forward_link import calculate_imputation_link @@ -12,11 +12,11 @@ class TestLinks: def test_forward_links(self, scenario): """Test if function returns the f_link column""" - df_input = load_and_format("tests/" + scenario + ".csv") + df_output = load_and_format("tests/" + scenario + ".csv") - expected_link = df_input["f_link"] + df_input = df_output.drop(columns=["f_link"]) - link_to_test = calculate_imputation_link( + df_input = calculate_imputation_link( df_input, "period", "group", @@ -25,16 +25,15 @@ def test_forward_links(self, scenario): "f_predictive_question", ) - assert_series_equal(link_to_test, expected_link, check_names=False) + assert_frame_equal(df_input, df_output, check_like=True) def test_back_links(self, scenario): """Test if function returns the b_link column""" + df_output = load_and_format("tests/" + scenario + ".csv") - df_input = load_and_format("tests/" + scenario + ".csv") + df_input = df_output.drop(columns=["b_link"]) - expected_link = df_input["b_link"] - - link_to_test = calculate_imputation_link( + df_input = calculate_imputation_link( df_input, "period", "group", @@ -43,4 +42,34 @@ def test_back_links(self, scenario): "b_predictive_question", ) - assert_series_equal(link_to_test, expected_link, check_names=False) + assert_frame_equal(df_input, df_output, check_like=True) + + def test_exception(self, scenario): + + df = load_and_format("tests/" + scenario + ".csv") + + with pytest.raises(ValueError): + """ + Test if function is called with wrong arguments, in particular + with f_matched_pair and b_predictive_question or with + b_matched_pair and f_predictive_question. + """ + + df = calculate_imputation_link( + df, + "period", + "group", + "f_matched_pair", + "question", + "b_predictive_question", + ) + with pytest.raises(ValueError): + + df = calculate_imputation_link( + df, + "period", + "group", + "b_matched_pair", + "question", + "f_predictive_question", + )