From 6c467b8f5f57c7d6efa473f27dc68ab7488b764c Mon Sep 17 00:00:00 2001 From: matthk Date: Fri, 26 Jul 2024 10:24:36 +0100 Subject: [PATCH 1/8] pivot imputation code --- mbs_results/pivot_imputation_value.py | 63 ++++++++++++++++++++ tests/data/pivot_imputation_value_input.csv | 34 +++++++++++ tests/data/pivot_imputation_value_output.csv | 28 +++++++++ tests/test_pivot_imputation_value.py | 63 ++++++++++++++++++++ 4 files changed, 188 insertions(+) create mode 100755 mbs_results/pivot_imputation_value.py create mode 100755 tests/data/pivot_imputation_value_input.csv create mode 100755 tests/data/pivot_imputation_value_output.csv create mode 100755 tests/test_pivot_imputation_value.py diff --git a/mbs_results/pivot_imputation_value.py b/mbs_results/pivot_imputation_value.py new file mode 100755 index 00000000..97ed9233 --- /dev/null +++ b/mbs_results/pivot_imputation_value.py @@ -0,0 +1,63 @@ +import pandas as pd + + +def pivot_imputation_value( + df: pd.DataFrame, + identifier: str, + date: str, + sic: str, + cell: str, + forward: str, + backward: str, + construction: str, + question: str, +) -> pd.DataFrame: + + """ + Returning dataframe containing imputation_value, filtered by date, pivoted by imputation type + and grouped by sic, cell, question and imputation type. 
+ + Parameters + ---------- + dataframe : pd.DataFrame + Reference dataframe with domain, a_weights, o_weights, and g_weights + identifier : str + name of column in dataframe containing identifier variable + date : str + name of column in dataframe containing period variable + sic : str + name of column in dataframe containing domain variable + cell : str + name of column in dataframe containing question code variable + forward : str + name of column in dataframe containing predicted value variable + backward : str + name of column in dataframe containing imputation marker variable + construction : str + name of column in dataframe containing a_weight variable + question : str + name of column in dataframe containing o_weight variable + + Returns + ------- + dataframe filtered by date, containing imputation_value, pivoted by imputation type + and grouped by sic, cell, question and imputation type. + + """ + + df = df[df.date == 202001] + + df = df.drop_duplicates(subset=['date', 'sic', 'cell', 'question']) + + df = df.melt(id_vars=['date', 'sic', 'cell', 'question'], + value_vars=['forward', 'backward', 'construction'], + var_name='link_type', + value_name='imputation_link') + + link_type_map = {'forward': 'F', 'backward': 'B', 'construction': 'C'} + df['link_type'] = df['link_type'].map(link_type_map) + + df['link_type'] = pd.Categorical(df['link_type'], categories=['F','B','C'], ordered=True) + df.sort_values(['date', 'sic', 'cell', 'question', 'link_type'], inplace=True) + + return df.reset_index(drop=True) \ No newline at end of file diff --git a/tests/data/pivot_imputation_value_input.csv b/tests/data/pivot_imputation_value_input.csv new file mode 100755 index 00000000..c8680c98 --- /dev/null +++ b/tests/data/pivot_imputation_value_input.csv @@ -0,0 +1,34 @@ +identifier,date,sic,cell,forward,backward,construction,question +70001,202001,12,100,1.0,1.964796824,107.48,40 +70001,202002,12,100,0.5089584770000001,0.832579579,54.70285714,40 
+70001,202002,12,100,0.5016253708279402,0.812637487,36.63728374,49 +70001,202003,12,100,1.201086389,0.705052735,98.26519337,49 +70002,202001,12,100,1.0,1.964796824,107.48,40 +70002,202001,12,100,1.326666382,0.599374673,92.06463773,49 +70002,202002,12,100,0.5089584770000001,0.832579579,54.70285714,40 +70002,202003,12,100,0.529851921,1.09453303,106.79005520000001,40 +70003,202001,12,101,1.0,1.964796824,107.48,40 +70003,202002,12,101,0.5089584770000001,0.832579579,54.70285714,40 +70003,202003,12,101,1.201086389,0.705052735,98.26519337,49 +70004,202001,12,101,1.0,1.964796824,107.48,40 +70004,202002,12,101,0.5089584770000001,0.832579579,54.70285714,40 +70004,202003,12,101,0.529851921,1.09453303,106.79005520000001,40 +70004,202003,12,101,1.201086389,0.705052735,98.26519337,49 +70005,202001,12,102,1.326666382,0.599374673,92.06463773,49 +70005,202002,12,102,0.5016253708279402,0.812637487,36.63728374,49 +70005,202003,12,102,1.201086389,0.705052735,98.26519337,49 +70006,202001,13,103,0.5089584770000001,0.832579579,54.70285714,40 +70006,202002,13,103,1.201086389,0.705052735,98.26519337,49 +70006,202003,13,103,1.418333623,0.642249527,93.18857143,49 +70007,202001,13,103,0.5089584770000001,0.832579579,54.70285714,40 +70007,202001,13,103,1.635246638,1.725354675,143.7564782,49 +70007,202002,13,103,0.529851921,1.09453303,106.79005520000001,40 +70007,202002,13,103,1.201086389,0.705052735,98.26519337,49 +70007,202003,13,103,0.913631634,1.0,70.24,40 +70008,202001,13,104,0.5089584770000001,0.832579579,54.70285714,40 +70008,202002,13,104,1.201086389,0.705052735,98.26519337,49 +70008,202003,13,104,1.418333623,0.642249527,93.18857143,49 +70009,202001,13,104,1.635246638,1.725354675,143.7564782,49 +70009,202002,13,104,1.201086389,0.705052735,98.26519337,49 +70009,202003,13,104,1.418333623,0.642249527,93.18857143,49 +70010,202001,13,105,0.5089584770000001,0.832579579,54.70285714,40 diff --git a/tests/data/pivot_imputation_value_output.csv b/tests/data/pivot_imputation_value_output.csv new 
file mode 100755 index 00000000..23ebc61e --- /dev/null +++ b/tests/data/pivot_imputation_value_output.csv @@ -0,0 +1,28 @@ +date,sic,cell,question,imputation_link,link_type +202001,12,100,40,1.0,F +202001,12,100,40,1.964796824,B +202001,12,100,40,107.48,C +202001,12,100,49,1.326666382,F +202001,12,100,49,0.599374673,B +202001,12,100,49,92.06463773,C +202001,12,101,40,1.0,F +202001,12,101,40,1.964796824,B +202001,12,101,40,107.48,C +202001,12,102,49,1.326666382,F +202001,12,102,49,0.599374673,B +202001,12,102,49,92.06463773,C +202001,13,103,40,0.5089584770000001,F +202001,13,103,40,0.832579579,B +202001,13,103,40,54.70285714,C +202001,13,103,49,1.635246638,F +202001,13,103,49,1.725354675,B +202001,13,103,49,143.7564782,C +202001,13,104,40,0.5089584770000001,F +202001,13,104,40,0.832579579,B +202001,13,104,40,54.70285714,C +202001,13,104,49,1.635246638,F +202001,13,104,49,1.725354675,B +202001,13,104,49,143.7564782,C +202001,13,105,40,0.5089584770000001,F +202001,13,105,40,0.832579579,B +202001,13,105,40,54.70285714,C \ No newline at end of file diff --git a/tests/test_pivot_imputation_value.py b/tests/test_pivot_imputation_value.py new file mode 100755 index 00000000..51bf6a2e --- /dev/null +++ b/tests/test_pivot_imputation_value.py @@ -0,0 +1,63 @@ +from pathlib import Path + +import pandas as pd +import pytest +from pandas.testing import assert_frame_equal + +from mbs_results.pivot_imputation_value import pivot_imputation_value + + +@pytest.fixture(scope="class") +def filepath(): + return Path("tests") + + +@pytest.fixture(scope="class") +def pivot_imputation_value_input(filepath): + return pd.read_csv( + filepath / "data"/"pivot_imputation_value_input.csv", index_col=False + ) + +@pytest.fixture(scope="class") +def pivot_imputation_value_output(filepath): + return pd.read_csv( + filepath / "data"/"pivot_imputation_value_output.csv", index_col=False + ) + + +class TestPivotImputationValue: + def test_pivot_imputation_value( + self, + pivot_imputation_value_input, 
+ pivot_imputation_value_output + ): + expected_output = pivot_imputation_value_output[ + [ + "date", + "sic", + "cell", + "question", + "link_type", + "imputation_link", + ] + ].reset_index(drop=True) + + expected_output["link_type"] = pd.Categorical(expected_output["link_type"], categories=["F", "B", "C"], ordered=True) + + input_data = pivot_imputation_value_input.drop( + columns=["identifier"] + ) + + actual_output = pivot_imputation_value( + input_data, + "identifier", + "date", + "sic", + "cell", + "forward", + "backward", + "construction", + "question", + ) + + assert_frame_equal(actual_output, expected_output) From 9c1e76a27ea62e75b666b65a097d9b41304063ab Mon Sep 17 00:00:00 2001 From: matthk Date: Wed, 31 Jul 2024 08:51:52 +0100 Subject: [PATCH 2/8] Adding functions in pivot_imputation_value for imputation_links output --- mbs_results/pivot_imputation_value.py | 118 +++++++++++++++--- .../pivot_imputation_value_counts_input.csv | 19 +++ tests/data/pivot_imputation_value_input.csv | 68 +++++----- tests/data/pivot_imputation_value_input_2.csv | 34 +++++ tests/data/pivot_imputation_value_output.csv | 56 ++++----- tests/test_pivot_imputation_value.py | 62 +++++---- 6 files changed, 258 insertions(+), 99 deletions(-) create mode 100755 tests/data/pivot_imputation_value_counts_input.csv create mode 100755 tests/data/pivot_imputation_value_input_2.csv diff --git a/mbs_results/pivot_imputation_value.py b/mbs_results/pivot_imputation_value.py index 97ed9233..b82c3626 100755 --- a/mbs_results/pivot_imputation_value.py +++ b/mbs_results/pivot_imputation_value.py @@ -1,6 +1,43 @@ +import numpy as np import pandas as pd +def merge_counts(input_df: pd.DataFrame, count_df: pd.DataFrame) -> pd.DataFrame: + """ + Returns input data with f_count and b_count merged on. 
+ + Parameters + ---------- + input_df : pd.DataFrame + Reference dataframe with identifier, date, sic, cell, forward, backward, construction, question,imputed_value + count_df : pd.DataFrame + DataFrame with group, period, flag_1 and flag_2. + input_cell : + name of column in input_df dataframe containing cell variable + count_cell : + name of column in count_df dataframe containing cell variable + input_date : + name of column in input_df dataframe containing date variable + count_date : + name of column in count_df dataframe containing date variable + Returns + Returns + ------- + Dataframe resulting from the left-join of df_2 and df_1 (after renaming columns), on 'cell' and 'date'. + """ + count_df = count_df.rename( + columns={ + "group": "cell", + "period": "date", + "flag_1": "f_count", + "flag_2": "b_count", + } + ) + df_merge = pd.merge(input_df, count_df, how="left", on=["cell", "date"]) + df_merge["identifier"] = df_merge["identifier"].astype(int) + return df_merge + + def pivot_imputation_value( df: pd.DataFrame, identifier: str, @@ -11,8 +48,11 @@ def pivot_imputation_value( backward: str, construction: str, question: str, + imputed_value: str, + f_count: str, + b_count: str, ) -> pd.DataFrame: - + """ Returning dataframe containing imputation_value, filtered by date, pivoted by imputation type and grouped by sic, cell, question and imputation type. @@ -37,27 +77,77 @@ def pivot_imputation_value( name of column in dataframe containing a_weight variable question : str name of column in dataframe containing o_weight variable - + imputed_value: str + name of column in dataframe containing imputed_value variable + f_count: str, + name of column in dataframe containing f_count variable + b_count: str, + name of column in dataframe containing b_count variable + Returns ------- dataframe filtered by date, containing imputation_value, pivoted by imputation type and grouped by sic, cell, question and imputation type. 
""" - - df = df[df.date == 202001] - df = df.drop_duplicates(subset=['date', 'sic', 'cell', 'question']) + df = df[df[date] == 202001] + + df1 = df.melt( + id_vars=[date, sic, cell, question, imputed_value], + value_vars=[forward, backward, construction], + var_name="link_type", + value_name="imputation_link", + ) + + link_type_map = {forward: "F", backward: "B", construction: "C"} + df1["link_type"] = df1["link_type"].map(link_type_map) + + df2 = df.melt( + id_vars=[date, sic, cell, question], + value_vars=[f_count, b_count], + var_name="link_type_count", + value_name="count", + ) + + link_type_map_count = {f_count: "F", b_count: "B"} + df2["link_type_count"] = df2["link_type_count"].map(link_type_map_count) + + merged_df = pd.merge( + df1, + df2, + how="outer", + left_on=[date, sic, cell, question, "link_type"], + right_on=[date, sic, cell, question, "link_type_count"], + ) + + merged_df.drop_duplicates(inplace=True) + merged_df.drop(["link_type_count"], axis=1, inplace=True) + + merged_df = merged_df.groupby( + [date, sic, cell, question, "link_type"], as_index=False + ).agg({imputed_value: "sum", "count": "first", "imputation_link": "first"}) + + sorting_order = {"F": 1, "B": 2, "C": 3} + merged_df["sort_column"] = merged_df["link_type"].map(sorting_order) + + merged_df = merged_df.sort_values([date, sic, cell, question, "sort_column"]) - df = df.melt(id_vars=['date', 'sic', 'cell', 'question'], - value_vars=['forward', 'backward', 'construction'], - var_name='link_type', - value_name='imputation_link') + merged_df.drop("sort_column", axis=1, inplace=True) - link_type_map = {'forward': 'F', 'backward': 'B', 'construction': 'C'} - df['link_type'] = df['link_type'].map(link_type_map) + merged_df.reset_index(drop=True, inplace=True) - df['link_type'] = pd.Categorical(df['link_type'], categories=['F','B','C'], ordered=True) - df.sort_values(['date', 'sic', 'cell', 'question', 'link_type'], inplace=True) + merged_df = merged_df[ + [ + date, + sic, + cell, + 
question, + "imputation_link", + "link_type", + "count", + imputed_value, + ] + ] - return df.reset_index(drop=True) \ No newline at end of file + return merged_df diff --git a/tests/data/pivot_imputation_value_counts_input.csv b/tests/data/pivot_imputation_value_counts_input.csv new file mode 100755 index 00000000..308f65a8 --- /dev/null +++ b/tests/data/pivot_imputation_value_counts_input.csv @@ -0,0 +1,19 @@ +group,period,flag_1,flag_2 +100,202001,2,3 +100,202002,1,0 +100,202003,0,2 +101,202001,5,2 +101,202002,2,5 +101,202003,3,3 +102,202001,6,4 +102,202002,1,4 +102,202003,0,1 +103,202001,7,8 +103,202002,0,2 +103,202003,2,1 +104,202001,3,5 +104,202002,3,6 +104,202003,2,3 +105,202001,3,7 +105,202002,4,2 +105,202003,3,5 \ No newline at end of file diff --git a/tests/data/pivot_imputation_value_input.csv b/tests/data/pivot_imputation_value_input.csv index c8680c98..d187f8eb 100755 --- a/tests/data/pivot_imputation_value_input.csv +++ b/tests/data/pivot_imputation_value_input.csv @@ -1,34 +1,34 @@ -identifier,date,sic,cell,forward,backward,construction,question -70001,202001,12,100,1.0,1.964796824,107.48,40 -70001,202002,12,100,0.5089584770000001,0.832579579,54.70285714,40 -70001,202002,12,100,0.5016253708279402,0.812637487,36.63728374,49 -70001,202003,12,100,1.201086389,0.705052735,98.26519337,49 -70002,202001,12,100,1.0,1.964796824,107.48,40 -70002,202001,12,100,1.326666382,0.599374673,92.06463773,49 -70002,202002,12,100,0.5089584770000001,0.832579579,54.70285714,40 -70002,202003,12,100,0.529851921,1.09453303,106.79005520000001,40 -70003,202001,12,101,1.0,1.964796824,107.48,40 -70003,202002,12,101,0.5089584770000001,0.832579579,54.70285714,40 -70003,202003,12,101,1.201086389,0.705052735,98.26519337,49 -70004,202001,12,101,1.0,1.964796824,107.48,40 -70004,202002,12,101,0.5089584770000001,0.832579579,54.70285714,40 -70004,202003,12,101,0.529851921,1.09453303,106.79005520000001,40 -70004,202003,12,101,1.201086389,0.705052735,98.26519337,49 
-70005,202001,12,102,1.326666382,0.599374673,92.06463773,49 -70005,202002,12,102,0.5016253708279402,0.812637487,36.63728374,49 -70005,202003,12,102,1.201086389,0.705052735,98.26519337,49 -70006,202001,13,103,0.5089584770000001,0.832579579,54.70285714,40 -70006,202002,13,103,1.201086389,0.705052735,98.26519337,49 -70006,202003,13,103,1.418333623,0.642249527,93.18857143,49 -70007,202001,13,103,0.5089584770000001,0.832579579,54.70285714,40 -70007,202001,13,103,1.635246638,1.725354675,143.7564782,49 -70007,202002,13,103,0.529851921,1.09453303,106.79005520000001,40 -70007,202002,13,103,1.201086389,0.705052735,98.26519337,49 -70007,202003,13,103,0.913631634,1.0,70.24,40 -70008,202001,13,104,0.5089584770000001,0.832579579,54.70285714,40 -70008,202002,13,104,1.201086389,0.705052735,98.26519337,49 -70008,202003,13,104,1.418333623,0.642249527,93.18857143,49 -70009,202001,13,104,1.635246638,1.725354675,143.7564782,49 -70009,202002,13,104,1.201086389,0.705052735,98.26519337,49 -70009,202003,13,104,1.418333623,0.642249527,93.18857143,49 -70010,202001,13,105,0.5089584770000001,0.832579579,54.70285714,40 +identifier,date,sic,cell,forward,backward,construction,question,imputed_value +70001,202001,12,100,1.0,1.964796824,107.48,40,500 +70001,202002,12,100,0.5089584770000001,0.832579579,54.70285714,40, +70001,202002,12,100,0.5016253708279402,0.812637487,36.63728374,49,100 +70001,202003,12,100,1.201086389,0.705052735,98.26519337,49,200 +70002,202001,12,100,1.0,1.964796824,107.48,40,150 +70002,202001,12,100,1.326666382,0.599374673,92.06463773,49,250 +70002,202002,12,100,0.5089584770000001,0.832579579,54.70285714,40,650 +70002,202003,12,100,0.529851921,1.09453303,106.79005520000001,40,800 +70003,202001,12,101,1.0,1.964796824,107.48,40,30 +70003,202002,12,101,0.5089584770000001,0.832579579,54.70285714,40,50 +70003,202003,12,101,1.201086389,0.705052735,98.26519337,49,170 +70004,202001,12,101,1.0,1.964796824,107.48,40, +70004,202002,12,101,0.5089584770000001,0.832579579,54.70285714,40,310 
+70004,202003,12,101,0.529851921,1.09453303,106.79005520000001,40,350 +70004,202003,12,101,1.201086389,0.705052735,98.26519337,49,170 +70005,202001,12,102,1.326666382,0.599374673,92.06463773,49,750 +70005,202002,12,102,0.5016253708279402,0.812637487,36.63728374,49,940 +70005,202003,12,102,1.201086389,0.705052735,98.26519337,49,520 +70006,202001,13,103,0.5089584770000001,0.832579579,54.70285714,40,350 +70006,202002,13,103,1.201086389,0.705052735,98.26519337,49,160 +70006,202003,13,103,1.418333623,0.642249527,93.18857143,49,380 +70007,202001,13,103,0.5089584770000001,0.832579579,54.70285714,40,290 +70007,202001,13,103,1.635246638,1.725354675,143.7564782,49, +70007,202002,13,103,0.529851921,1.09453303,106.79005520000001,40,590 +70007,202002,13,103,1.201086389,0.705052735,98.26519337,49,710 +70007,202003,13,103,0.913631634,1.0,70.24,40,280 +70008,202001,13,104,0.5089584770000001,0.832579579,54.70285714,40, +70008,202002,13,104,1.201086389,0.705052735,98.26519337,49,660 +70008,202003,13,104,1.418333623,0.642249527,93.18857143,49,220 +70009,202001,13,104,1.635246638,1.725354675,143.7564782,49,880 +70009,202002,13,104,1.201086389,0.705052735,98.26519337,49,610 +70009,202003,13,104,1.418333623,0.642249527,93.18857143,49,90 +70010,202001,13,105,0.5089584770000001,0.832579579,54.70285714,40,80 diff --git a/tests/data/pivot_imputation_value_input_2.csv b/tests/data/pivot_imputation_value_input_2.csv new file mode 100755 index 00000000..e3d29b1f --- /dev/null +++ b/tests/data/pivot_imputation_value_input_2.csv @@ -0,0 +1,34 @@ +identifier,date,sic,cell,forward,backward,construction,question,imputed_value,f_count,b_count +70001,202001,12,100,1.0,1.964796824,107.48,40,500,2,3 +70001,202002,12,100,0.5089584770000001,0.832579579,54.70285714,40,,1,0 +70001,202002,12,100,0.5016253708279402,0.812637487,36.63728374,49,100,1,0 +70001,202003,12,100,1.201086389,0.705052735,98.26519337,49,200,0,2 +70002,202001,12,100,1.0,1.964796824,107.48,40,150,2,3 
+70002,202001,12,100,1.326666382,0.599374673,92.06463773,49,250,2,3 +70002,202002,12,100,0.5089584770000001,0.832579579,54.70285714,40,650,1,0 +70002,202003,12,100,0.529851921,1.09453303,106.79005520000001,40,800,0,2 +70003,202001,12,101,1.0,1.964796824,107.48,40,30,5,2 +70003,202002,12,101,0.5089584770000001,0.832579579,54.70285714,40,50,2,5 +70003,202003,12,101,1.201086389,0.705052735,98.26519337,49,170,3,3 +70004,202001,12,101,1.0,1.964796824,107.48,40,,5,2 +70004,202002,12,101,0.5089584770000001,0.832579579,54.70285714,40,310,2,5 +70004,202003,12,101,0.529851921,1.09453303,106.79005520000001,40,350,3,3 +70004,202003,12,101,1.201086389,0.705052735,98.26519337,49,170,3,3 +70005,202001,12,102,1.326666382,0.599374673,92.06463773,49,750,6,4 +70005,202002,12,102,0.5016253708279402,0.812637487,36.63728374,49,940,1,4 +70005,202003,12,102,1.201086389,0.705052735,98.26519337,49,520,0,1 +70006,202001,13,103,0.5089584770000001,0.832579579,54.70285714,40,350,7,8 +70006,202002,13,103,1.201086389,0.705052735,98.26519337,49,160,0,2 +70006,202003,13,103,1.418333623,0.642249527,93.18857143,49,380,2,1 +70007,202001,13,103,0.5089584770000001,0.832579579,54.70285714,40,290,7,8 +70007,202001,13,103,1.635246638,1.725354675,143.7564782,49,,7,8 +70007,202002,13,103,0.529851921,1.09453303,106.79005520000001,40,590,0,2 +70007,202002,13,103,1.201086389,0.705052735,98.26519337,49,710,0,2 +70007,202003,13,103,0.913631634,1.0,70.24,40,280,2,1 +70008,202001,13,104,0.5089584770000001,0.832579579,54.70285714,40,,3,5 +70008,202002,13,104,1.201086389,0.705052735,98.26519337,49,660,3,6 +70008,202003,13,104,1.418333623,0.642249527,93.18857143,49,220,2,3 +70009,202001,13,104,1.635246638,1.725354675,143.7564782,49,880,3,5 +70009,202002,13,104,1.201086389,0.705052735,98.26519337,49,610,3,6 +70009,202003,13,104,1.418333623,0.642249527,93.18857143,49,90,2,3 +70010,202001,13,105,0.5089584770000001,0.832579579,54.70285714,40,80,3,7 \ No newline at end of file diff --git 
a/tests/data/pivot_imputation_value_output.csv b/tests/data/pivot_imputation_value_output.csv index 23ebc61e..423f14f1 100755 --- a/tests/data/pivot_imputation_value_output.csv +++ b/tests/data/pivot_imputation_value_output.csv @@ -1,28 +1,28 @@ -date,sic,cell,question,imputation_link,link_type -202001,12,100,40,1.0,F -202001,12,100,40,1.964796824,B -202001,12,100,40,107.48,C -202001,12,100,49,1.326666382,F -202001,12,100,49,0.599374673,B -202001,12,100,49,92.06463773,C -202001,12,101,40,1.0,F -202001,12,101,40,1.964796824,B -202001,12,101,40,107.48,C -202001,12,102,49,1.326666382,F -202001,12,102,49,0.599374673,B -202001,12,102,49,92.06463773,C -202001,13,103,40,0.5089584770000001,F -202001,13,103,40,0.832579579,B -202001,13,103,40,54.70285714,C -202001,13,103,49,1.635246638,F -202001,13,103,49,1.725354675,B -202001,13,103,49,143.7564782,C -202001,13,104,40,0.5089584770000001,F -202001,13,104,40,0.832579579,B -202001,13,104,40,54.70285714,C -202001,13,104,49,1.635246638,F -202001,13,104,49,1.725354675,B -202001,13,104,49,143.7564782,C -202001,13,105,40,0.5089584770000001,F -202001,13,105,40,0.832579579,B -202001,13,105,40,54.70285714,C \ No newline at end of file +date,sic,cell,question,imputation_link,link_type,count,imputed_value +202001,12,100,40,1.0,F,2,650 +202001,12,100,40,1.964796824,B,3,650 +202001,12,100,40,107.48,C,,650 +202001,12,100,49,1.326666382,F,2,250 +202001,12,100,49,0.599374673,B,3,250 +202001,12,100,49,92.06463773,C,,250 +202001,12,101,40,1.0,F,5,30 +202001,12,101,40,1.964796824,B,2,30 +202001,12,101,40,107.48,C,,30 +202001,12,102,49,1.326666382,F,6,750 +202001,12,102,49,0.599374673,B,4,750 +202001,12,102,49,92.06463773,C,,750 +202001,13,103,40,0.5089584770000001,F,7,640 +202001,13,103,40,0.832579579,B,8,640 +202001,13,103,40,54.70285714,C,,640 +202001,13,103,49,1.635246638,F,7,0.0 +202001,13,103,49,1.725354675,B,8,0.0 +202001,13,103,49,143.7564782,C,,0.0 +202001,13,104,40,0.5089584770000001,F,3,0.0 +202001,13,104,40,0.832579579,B,5,0.0 
+202001,13,104,40,54.70285714,C,,0.0 +202001,13,104,49,1.635246638,F,3,880 +202001,13,104,49,1.725354675,B,5,880 +202001,13,104,49,143.7564782,C,,880 +202001,13,105,40,0.5089584770000001,F,3,80 +202001,13,105,40,0.832579579,B,7,80 +202001,13,105,40,54.70285714,C,,80 diff --git a/tests/test_pivot_imputation_value.py b/tests/test_pivot_imputation_value.py index 51bf6a2e..0372d1a6 100755 --- a/tests/test_pivot_imputation_value.py +++ b/tests/test_pivot_imputation_value.py @@ -1,10 +1,11 @@ from pathlib import Path +import numpy as np import pandas as pd import pytest from pandas.testing import assert_frame_equal -from mbs_results.pivot_imputation_value import pivot_imputation_value +from mbs_results.pivot_imputation_value import merge_counts, pivot_imputation_value @pytest.fixture(scope="class") @@ -13,41 +14,49 @@ def filepath(): @pytest.fixture(scope="class") -def pivot_imputation_value_input(filepath): +def merge_counts_input(filepath): + input_df = pd.read_csv( + filepath / "data" / "pivot_imputation_value_input.csv", index_col=False + ) + count_df = pd.read_csv( + filepath / "data" / "pivot_imputation_value_counts_input.csv", index_col=False + ) + return (input_df, count_df) + + +@pytest.fixture(scope="class") +def merge_counts_output(filepath): return pd.read_csv( - filepath / "data"/"pivot_imputation_value_input.csv", index_col=False + filepath / "data" / "pivot_imputation_value_input_2.csv", index_col=False ) - + + @pytest.fixture(scope="class") def pivot_imputation_value_output(filepath): return pd.read_csv( - filepath / "data"/"pivot_imputation_value_output.csv", index_col=False + filepath / "data" / "pivot_imputation_value_output.csv", index_col=False ) +class TestMergeCounts: + def test_merge_counts(self, merge_counts_input, merge_counts_output): + input_df, count_df = merge_counts_input + actual_output = merge_counts(input_df, count_df) + expected_output = merge_counts_output + assert_frame_equal(actual_output, expected_output) + + class 
TestPivotImputationValue: def test_pivot_imputation_value( - self, - pivot_imputation_value_input, - pivot_imputation_value_output + self, pivot_imputation_value_output, merge_counts_output ): - expected_output = pivot_imputation_value_output[ - [ - "date", - "sic", - "cell", - "question", - "link_type", - "imputation_link", - ] - ].reset_index(drop=True) - - expected_output["link_type"] = pd.Categorical(expected_output["link_type"], categories=["F", "B", "C"], ordered=True) - - input_data = pivot_imputation_value_input.drop( - columns=["identifier"] + expected_output = pivot_imputation_value_output.reset_index(drop=True) + expected_output["link_type"] = pd.Categorical( + expected_output["link_type"], categories=["F", "B", "C"], ordered=True ) + input_data = merge_counts_output.drop(columns=["identifier"]) + actual_output = pivot_imputation_value( input_data, "identifier", @@ -58,6 +67,13 @@ def test_pivot_imputation_value( "backward", "construction", "question", + "imputed_value", + "f_count", + "b_count", + ) + + actual_output["link_type"] = pd.Categorical( + actual_output["link_type"], categories=["F", "B", "C"], ordered=True ) assert_frame_equal(actual_output, expected_output) From 6fad8d0ae81d1d65a241b76dc1006823ed64a5f4 Mon Sep 17 00:00:00 2001 From: hubbal Date: Wed, 31 Jul 2024 11:49:13 +0100 Subject: [PATCH 3/8] Changing merge_counts function to join on different column names --- mbs_results/pivot_imputation_value.py | 41 +++++++++++++------ ..._counts_input.csv => count_data_input.csv} | 4 +- ...ue_input_2.csv => merge_counts_output.csv} | 2 +- tests/test_pivot_imputation_value.py | 29 +++++++------ 4 files changed, 45 insertions(+), 31 deletions(-) rename tests/data/{pivot_imputation_value_counts_input.csv => count_data_input.csv} (85%) rename tests/data/{pivot_imputation_value_input_2.csv => merge_counts_output.csv} (99%) diff --git a/mbs_results/pivot_imputation_value.py b/mbs_results/pivot_imputation_value.py index b82c3626..c9fed065 100755 --- 
a/mbs_results/pivot_imputation_value.py +++ b/mbs_results/pivot_imputation_value.py @@ -2,7 +2,15 @@ import pandas as pd -def merge_counts(input_df: pd.DataFrame, count_df: pd.DataFrame) -> pd.DataFrame: +def merge_counts( + input_df: pd.DataFrame, + count_df: pd.DataFrame, + input_cell: str, + count_cell: str, + input_date: str, + count_date: str, + identifier: str, +) -> pd.DataFrame: """ Returns input data with f_count and b_count merged on. @@ -12,30 +20,37 @@ def merge_counts(input_df: pd.DataFrame, count_df: pd.DataFrame) -> pd.DataFrame Reference dataframe with identifier, date, sic, cell, forward, backward, construction, question,imputed_value count_df : pd.DataFrame DataFrame with group, period, flag_1 and flag_2. - input_cell : + input_cell : str name of column in input_df dataframe containing cell variable - count_cell : + count_cell : str name of column in count_df dataframe containing cell variable - input_date : + input_date : str name of column in input_df dataframe containing date variable - count_date : + count_date : str name of column in count_df dataframe containing date variable - Returns + identifier : str + name of column in input_df containing identifier variable + Returns ------- - Dataframe resulting from the left-join of df_2 and df_1 (after renaming columns), on 'cell' and 'date'. + Dataframe resulting from the left-join of input_df and count_df (after renaming columns), + on the cell and date columns. 
""" count_df = count_df.rename( columns={ - "group": "cell", - "period": "date", "flag_1": "f_count", "flag_2": "b_count", } ) - df_merge = pd.merge(input_df, count_df, how="left", on=["cell", "date"]) - df_merge["identifier"] = df_merge["identifier"].astype(int) - return df_merge + df_merge = pd.merge( + input_df, + count_df, + how="left", + left_on=[input_cell, input_date], + right_on=[count_cell, count_date], + ).astype({identifier: "int"}) + + return df_merge.drop(columns=[count_cell, count_date]) def pivot_imputation_value( @@ -66,7 +81,7 @@ def pivot_imputation_value( date : str name of column in dataframe containing period variable sic : str - name of column in dataframe containing domain variable + name of column in dataframe containing question code variable cell : str name of column in dataframe containing question code variable forward : str diff --git a/tests/data/pivot_imputation_value_counts_input.csv b/tests/data/count_data_input.csv similarity index 85% rename from tests/data/pivot_imputation_value_counts_input.csv rename to tests/data/count_data_input.csv index 308f65a8..e8df1619 100755 --- a/tests/data/pivot_imputation_value_counts_input.csv +++ b/tests/data/count_data_input.csv @@ -1,4 +1,4 @@ -group,period,flag_1,flag_2 +group,period,f_count,b_count 100,202001,2,3 100,202002,1,0 100,202003,0,2 @@ -16,4 +16,4 @@ group,period,flag_1,flag_2 104,202003,2,3 105,202001,3,7 105,202002,4,2 -105,202003,3,5 \ No newline at end of file +105,202003,3,5 diff --git a/tests/data/pivot_imputation_value_input_2.csv b/tests/data/merge_counts_output.csv similarity index 99% rename from tests/data/pivot_imputation_value_input_2.csv rename to tests/data/merge_counts_output.csv index e3d29b1f..7d384e2a 100755 --- a/tests/data/pivot_imputation_value_input_2.csv +++ b/tests/data/merge_counts_output.csv @@ -31,4 +31,4 @@ identifier,date,sic,cell,forward,backward,construction,question,imputed_value,f_ 70009,202001,13,104,1.635246638,1.725354675,143.7564782,49,880,3,5 
70009,202002,13,104,1.201086389,0.705052735,98.26519337,49,610,3,6 70009,202003,13,104,1.418333623,0.642249527,93.18857143,49,90,2,3 -70010,202001,13,105,0.5089584770000001,0.832579579,54.70285714,40,80,3,7 \ No newline at end of file +70010,202001,13,105,0.5089584770000001,0.832579579,54.70285714,40,80,3,7 diff --git a/tests/test_pivot_imputation_value.py b/tests/test_pivot_imputation_value.py index 0372d1a6..51981ebd 100755 --- a/tests/test_pivot_imputation_value.py +++ b/tests/test_pivot_imputation_value.py @@ -14,21 +14,13 @@ def filepath(): @pytest.fixture(scope="class") -def merge_counts_input(filepath): - input_df = pd.read_csv( - filepath / "data" / "pivot_imputation_value_input.csv", index_col=False - ) - count_df = pd.read_csv( - filepath / "data" / "pivot_imputation_value_counts_input.csv", index_col=False - ) - return (input_df, count_df) +def count_data_input(filepath): + return pd.read_csv(filepath / "data" / "count_data_input.csv", index_col=False) @pytest.fixture(scope="class") def merge_counts_output(filepath): - return pd.read_csv( - filepath / "data" / "pivot_imputation_value_input_2.csv", index_col=False - ) + return pd.read_csv(filepath / "data" / "merge_counts_output.csv", index_col=False) @pytest.fixture(scope="class") @@ -39,10 +31,15 @@ def pivot_imputation_value_output(filepath): class TestMergeCounts: - def test_merge_counts(self, merge_counts_input, merge_counts_output): - input_df, count_df = merge_counts_input - actual_output = merge_counts(input_df, count_df) + def test_merge_counts(self, count_data_input, merge_counts_output): + + input_df = merge_counts_output.drop(columns=["f_count", "b_count"]) + + actual_output = merge_counts( + input_df, count_data_input, "cell", "group", "date", "period", "identifier" + ) expected_output = merge_counts_output + assert_frame_equal(actual_output, expected_output) @@ -50,7 +47,9 @@ class TestPivotImputationValue: def test_pivot_imputation_value( self, pivot_imputation_value_output, 
merge_counts_output ): - expected_output = pivot_imputation_value_output.reset_index(drop=True) + + expected_output = pivot_imputation_value_output + expected_output["link_type"] = pd.Categorical( expected_output["link_type"], categories=["F", "B", "C"], ordered=True ) From 80a949a3039419493547eaee9a6e325d032c82bc Mon Sep 17 00:00:00 2001 From: hubbal Date: Wed, 31 Jul 2024 13:24:47 +0100 Subject: [PATCH 4/8] Adding variable for date selection in pivot_imputation_value and removing sic --- mbs_results/pivot_imputation_value.py | 60 ++++++++------- tests/data/merge_counts_output.csv | 68 ++++++++--------- tests/data/pivot_imputation_value_input.csv | 34 --------- tests/data/pivot_imputation_value_output.csv | 77 +++++++++++++------- tests/test_pivot_imputation_value.py | 36 ++++++--- 5 files changed, 138 insertions(+), 137 deletions(-) delete mode 100755 tests/data/pivot_imputation_value_input.csv diff --git a/mbs_results/pivot_imputation_value.py b/mbs_results/pivot_imputation_value.py index c9fed065..e06abbea 100755 --- a/mbs_results/pivot_imputation_value.py +++ b/mbs_results/pivot_imputation_value.py @@ -1,4 +1,3 @@ -import numpy as np import pandas as pd @@ -17,7 +16,8 @@ def merge_counts( Parameters ---------- input_df : pd.DataFrame - Reference dataframe with identifier, date, sic, cell, forward, backward, construction, question,imputed_value + Reference dataframe with identifier, date, sic, cell, forward, backward, + construction, question, imputed_value count_df : pd.DataFrame DataFrame with group, period, flag_1 and flag_2. input_cell : str @@ -33,8 +33,8 @@ def merge_counts( Returns ------- - Dataframe resulting from the left-join of input_df and count_df (after renaming columns), - on the cell and date columns. + Dataframe resulting from the left-join of input_df and count_df on the cell and + date columns. 
""" count_df = count_df.rename( columns={ @@ -57,7 +57,6 @@ def pivot_imputation_value( df: pd.DataFrame, identifier: str, date: str, - sic: str, cell: str, forward: str, backward: str, @@ -66,32 +65,32 @@ def pivot_imputation_value( imputed_value: str, f_count: str, b_count: str, + selected_periods: list = None, ) -> pd.DataFrame: """ - Returning dataframe containing imputation_value, filtered by date, pivoted by imputation type - and grouped by sic, cell, question and imputation type. + Returning dataframe containing imputation_value, filtered by date, pivoted by + imputation type and grouped by sic, cell, question and imputation type. Parameters ---------- dataframe : pd.DataFrame - Reference dataframe with domain, a_weights, o_weights, and g_weights + Reference dataframe containing links, count values, and imputed values + by identifier, cell, date, and question identifier : str name of column in dataframe containing identifier variable date : str - name of column in dataframe containing period variable - sic : str - name of column in dataframe containing question code variable + name of column in dataframe containing date variable cell : str - name of column in dataframe containing question code variable + name of column in dataframe containing cell variable forward : str - name of column in dataframe containing predicted value variable + name of column in dataframe containing forward link variable backward : str - name of column in dataframe containing imputation marker variable + name of column in dataframe containing backward link variable construction : str - name of column in dataframe containing a_weight variable + name of column in dataframe containing construction link variable question : str - name of column in dataframe containing o_weight variable + name of column in dataframe containing question code variable imputed_value: str name of column in dataframe containing imputed_value variable f_count: str, @@ -105,48 +104,48 @@ def 
pivot_imputation_value( and grouped by sic, cell, question and imputation type. """ + if selected_periods is not None: + df = df.query("{} in {}".format(date, selected_periods)) - df = df[df[date] == 202001] - - df1 = df.melt( - id_vars=[date, sic, cell, question, imputed_value], + links_df = df.melt( + id_vars=[date, cell, question, imputed_value], value_vars=[forward, backward, construction], var_name="link_type", value_name="imputation_link", ) link_type_map = {forward: "F", backward: "B", construction: "C"} - df1["link_type"] = df1["link_type"].map(link_type_map) + links_df["link_type"] = links_df["link_type"].map(link_type_map) - df2 = df.melt( - id_vars=[date, sic, cell, question], + counts_df = df.melt( + id_vars=[date, cell, question], value_vars=[f_count, b_count], var_name="link_type_count", value_name="count", ) link_type_map_count = {f_count: "F", b_count: "B"} - df2["link_type_count"] = df2["link_type_count"].map(link_type_map_count) + counts_df["link_type_count"] = counts_df["link_type_count"].map(link_type_map_count) merged_df = pd.merge( - df1, - df2, + links_df, + counts_df, how="outer", - left_on=[date, sic, cell, question, "link_type"], - right_on=[date, sic, cell, question, "link_type_count"], + left_on=[date, cell, question, "link_type"], + right_on=[date, cell, question, "link_type_count"], ) merged_df.drop_duplicates(inplace=True) merged_df.drop(["link_type_count"], axis=1, inplace=True) merged_df = merged_df.groupby( - [date, sic, cell, question, "link_type"], as_index=False + [date, cell, question, "link_type"], as_index=False ).agg({imputed_value: "sum", "count": "first", "imputation_link": "first"}) sorting_order = {"F": 1, "B": 2, "C": 3} merged_df["sort_column"] = merged_df["link_type"].map(sorting_order) - merged_df = merged_df.sort_values([date, sic, cell, question, "sort_column"]) + merged_df = merged_df.sort_values([date, cell, question, "sort_column"]) merged_df.drop("sort_column", axis=1, inplace=True) @@ -155,7 +154,6 @@ def 
pivot_imputation_value( merged_df = merged_df[ [ date, - sic, cell, question, "imputation_link", diff --git a/tests/data/merge_counts_output.csv b/tests/data/merge_counts_output.csv index 7d384e2a..fb1f2648 100755 --- a/tests/data/merge_counts_output.csv +++ b/tests/data/merge_counts_output.csv @@ -1,34 +1,34 @@ -identifier,date,sic,cell,forward,backward,construction,question,imputed_value,f_count,b_count -70001,202001,12,100,1.0,1.964796824,107.48,40,500,2,3 -70001,202002,12,100,0.5089584770000001,0.832579579,54.70285714,40,,1,0 -70001,202002,12,100,0.5016253708279402,0.812637487,36.63728374,49,100,1,0 -70001,202003,12,100,1.201086389,0.705052735,98.26519337,49,200,0,2 -70002,202001,12,100,1.0,1.964796824,107.48,40,150,2,3 -70002,202001,12,100,1.326666382,0.599374673,92.06463773,49,250,2,3 -70002,202002,12,100,0.5089584770000001,0.832579579,54.70285714,40,650,1,0 -70002,202003,12,100,0.529851921,1.09453303,106.79005520000001,40,800,0,2 -70003,202001,12,101,1.0,1.964796824,107.48,40,30,5,2 -70003,202002,12,101,0.5089584770000001,0.832579579,54.70285714,40,50,2,5 -70003,202003,12,101,1.201086389,0.705052735,98.26519337,49,170,3,3 -70004,202001,12,101,1.0,1.964796824,107.48,40,,5,2 -70004,202002,12,101,0.5089584770000001,0.832579579,54.70285714,40,310,2,5 -70004,202003,12,101,0.529851921,1.09453303,106.79005520000001,40,350,3,3 -70004,202003,12,101,1.201086389,0.705052735,98.26519337,49,170,3,3 -70005,202001,12,102,1.326666382,0.599374673,92.06463773,49,750,6,4 -70005,202002,12,102,0.5016253708279402,0.812637487,36.63728374,49,940,1,4 -70005,202003,12,102,1.201086389,0.705052735,98.26519337,49,520,0,1 -70006,202001,13,103,0.5089584770000001,0.832579579,54.70285714,40,350,7,8 -70006,202002,13,103,1.201086389,0.705052735,98.26519337,49,160,0,2 -70006,202003,13,103,1.418333623,0.642249527,93.18857143,49,380,2,1 -70007,202001,13,103,0.5089584770000001,0.832579579,54.70285714,40,290,7,8 -70007,202001,13,103,1.635246638,1.725354675,143.7564782,49,,7,8 
-70007,202002,13,103,0.529851921,1.09453303,106.79005520000001,40,590,0,2 -70007,202002,13,103,1.201086389,0.705052735,98.26519337,49,710,0,2 -70007,202003,13,103,0.913631634,1.0,70.24,40,280,2,1 -70008,202001,13,104,0.5089584770000001,0.832579579,54.70285714,40,,3,5 -70008,202002,13,104,1.201086389,0.705052735,98.26519337,49,660,3,6 -70008,202003,13,104,1.418333623,0.642249527,93.18857143,49,220,2,3 -70009,202001,13,104,1.635246638,1.725354675,143.7564782,49,880,3,5 -70009,202002,13,104,1.201086389,0.705052735,98.26519337,49,610,3,6 -70009,202003,13,104,1.418333623,0.642249527,93.18857143,49,90,2,3 -70010,202001,13,105,0.5089584770000001,0.832579579,54.70285714,40,80,3,7 +identifier,date,cell,forward,backward,construction,question,imputed_value,f_count,b_count +70001,202001,100,1.0,1.964796824,107.48,40,500,2,3 +70001,202002,100,0.5089584770000001,0.832579579,54.70285714,40,,1,0 +70001,202002,100,0.5016253708279402,0.812637487,36.63728374,49,100,1,0 +70001,202003,100,1.201086389,0.705052735,98.26519337,49,200,0,2 +70002,202001,100,1.0,1.964796824,107.48,40,150,2,3 +70002,202001,100,1.326666382,0.599374673,92.06463773,49,250,2,3 +70002,202002,100,0.5089584770000001,0.832579579,54.70285714,40,650,1,0 +70002,202003,100,0.529851921,1.09453303,106.79005520000001,40,800,0,2 +70003,202001,101,1.0,1.964796824,107.48,40,30,5,2 +70003,202002,101,0.5089584770000001,0.832579579,54.70285714,40,50,2,5 +70003,202003,101,1.201086389,0.705052735,98.26519337,49,170,3,3 +70004,202001,101,1.0,1.964796824,107.48,40,,5,2 +70004,202002,101,0.5089584770000001,0.832579579,54.70285714,40,310,2,5 +70004,202003,101,0.529851921,1.09453303,106.79005520000001,40,350,3,3 +70004,202003,101,1.201086389,0.705052735,98.26519337,49,170,3,3 +70005,202001,102,1.326666382,0.599374673,92.06463773,49,750,6,4 +70005,202002,102,0.5016253708279402,0.812637487,36.63728374,49,940,1,4 +70005,202003,102,1.201086389,0.705052735,98.26519337,49,520,0,1 
+70006,202001,103,0.5089584770000001,0.832579579,54.70285714,40,350,7,8 +70006,202002,103,1.201086389,0.705052735,98.26519337,49,160,0,2 +70006,202003,103,1.418333623,0.642249527,93.18857143,49,380,2,1 +70007,202001,103,0.5089584770000001,0.832579579,54.70285714,40,290,7,8 +70007,202001,103,1.635246638,1.725354675,143.7564782,49,,7,8 +70007,202002,103,0.529851921,1.09453303,106.79005520000001,40,590,0,2 +70007,202002,103,1.201086389,0.705052735,98.26519337,49,710,0,2 +70007,202003,103,0.913631634,1.0,70.24,40,280,2,1 +70008,202001,104,0.5089584770000001,0.832579579,54.70285714,40,,3,5 +70008,202002,104,1.201086389,0.705052735,98.26519337,49,660,3,6 +70008,202003,104,1.418333623,0.642249527,93.18857143,49,220,2,3 +70009,202001,104,1.635246638,1.725354675,143.7564782,49,880,3,5 +70009,202002,104,1.201086389,0.705052735,98.26519337,49,610,3,6 +70009,202003,104,1.418333623,0.642249527,93.18857143,49,90,2,3 +70010,202001,105,0.5089584770000001,0.832579579,54.70285714,40,80,3,7 diff --git a/tests/data/pivot_imputation_value_input.csv b/tests/data/pivot_imputation_value_input.csv deleted file mode 100755 index d187f8eb..00000000 --- a/tests/data/pivot_imputation_value_input.csv +++ /dev/null @@ -1,34 +0,0 @@ -identifier,date,sic,cell,forward,backward,construction,question,imputed_value -70001,202001,12,100,1.0,1.964796824,107.48,40,500 -70001,202002,12,100,0.5089584770000001,0.832579579,54.70285714,40, -70001,202002,12,100,0.5016253708279402,0.812637487,36.63728374,49,100 -70001,202003,12,100,1.201086389,0.705052735,98.26519337,49,200 -70002,202001,12,100,1.0,1.964796824,107.48,40,150 -70002,202001,12,100,1.326666382,0.599374673,92.06463773,49,250 -70002,202002,12,100,0.5089584770000001,0.832579579,54.70285714,40,650 -70002,202003,12,100,0.529851921,1.09453303,106.79005520000001,40,800 -70003,202001,12,101,1.0,1.964796824,107.48,40,30 -70003,202002,12,101,0.5089584770000001,0.832579579,54.70285714,40,50 -70003,202003,12,101,1.201086389,0.705052735,98.26519337,49,170 
-70004,202001,12,101,1.0,1.964796824,107.48,40, -70004,202002,12,101,0.5089584770000001,0.832579579,54.70285714,40,310 -70004,202003,12,101,0.529851921,1.09453303,106.79005520000001,40,350 -70004,202003,12,101,1.201086389,0.705052735,98.26519337,49,170 -70005,202001,12,102,1.326666382,0.599374673,92.06463773,49,750 -70005,202002,12,102,0.5016253708279402,0.812637487,36.63728374,49,940 -70005,202003,12,102,1.201086389,0.705052735,98.26519337,49,520 -70006,202001,13,103,0.5089584770000001,0.832579579,54.70285714,40,350 -70006,202002,13,103,1.201086389,0.705052735,98.26519337,49,160 -70006,202003,13,103,1.418333623,0.642249527,93.18857143,49,380 -70007,202001,13,103,0.5089584770000001,0.832579579,54.70285714,40,290 -70007,202001,13,103,1.635246638,1.725354675,143.7564782,49, -70007,202002,13,103,0.529851921,1.09453303,106.79005520000001,40,590 -70007,202002,13,103,1.201086389,0.705052735,98.26519337,49,710 -70007,202003,13,103,0.913631634,1.0,70.24,40,280 -70008,202001,13,104,0.5089584770000001,0.832579579,54.70285714,40, -70008,202002,13,104,1.201086389,0.705052735,98.26519337,49,660 -70008,202003,13,104,1.418333623,0.642249527,93.18857143,49,220 -70009,202001,13,104,1.635246638,1.725354675,143.7564782,49,880 -70009,202002,13,104,1.201086389,0.705052735,98.26519337,49,610 -70009,202003,13,104,1.418333623,0.642249527,93.18857143,49,90 -70010,202001,13,105,0.5089584770000001,0.832579579,54.70285714,40,80 diff --git a/tests/data/pivot_imputation_value_output.csv b/tests/data/pivot_imputation_value_output.csv index 423f14f1..456e6890 100755 --- a/tests/data/pivot_imputation_value_output.csv +++ b/tests/data/pivot_imputation_value_output.csv @@ -1,28 +1,49 @@ -date,sic,cell,question,imputation_link,link_type,count,imputed_value -202001,12,100,40,1.0,F,2,650 -202001,12,100,40,1.964796824,B,3,650 -202001,12,100,40,107.48,C,,650 -202001,12,100,49,1.326666382,F,2,250 -202001,12,100,49,0.599374673,B,3,250 -202001,12,100,49,92.06463773,C,,250 -202001,12,101,40,1.0,F,5,30 
-202001,12,101,40,1.964796824,B,2,30 -202001,12,101,40,107.48,C,,30 -202001,12,102,49,1.326666382,F,6,750 -202001,12,102,49,0.599374673,B,4,750 -202001,12,102,49,92.06463773,C,,750 -202001,13,103,40,0.5089584770000001,F,7,640 -202001,13,103,40,0.832579579,B,8,640 -202001,13,103,40,54.70285714,C,,640 -202001,13,103,49,1.635246638,F,7,0.0 -202001,13,103,49,1.725354675,B,8,0.0 -202001,13,103,49,143.7564782,C,,0.0 -202001,13,104,40,0.5089584770000001,F,3,0.0 -202001,13,104,40,0.832579579,B,5,0.0 -202001,13,104,40,54.70285714,C,,0.0 -202001,13,104,49,1.635246638,F,3,880 -202001,13,104,49,1.725354675,B,5,880 -202001,13,104,49,143.7564782,C,,880 -202001,13,105,40,0.5089584770000001,F,3,80 -202001,13,105,40,0.832579579,B,7,80 -202001,13,105,40,54.70285714,C,,80 +date,cell,question,imputation_link,link_type,count,imputed_value +202001,100,40,1.0,F,2,650 +202001,100,40,1.964796824,B,3,650 +202001,100,40,107.48,C,,650 +202001,100,49,1.326666382,F,2,250 +202001,100,49,0.599374673,B,3,250 +202001,100,49,92.06463773,C,,250 +202001,101,40,1.0,F,5,30 +202001,101,40,1.964796824,B,2,30 +202001,101,40,107.48,C,,30 +202001,102,49,1.326666382,F,6,750 +202001,102,49,0.599374673,B,4,750 +202001,102,49,92.06463773,C,,750 +202001,103,40,0.5089584770000001,F,7,640 +202001,103,40,0.832579579,B,8,640 +202001,103,40,54.70285714,C,,640 +202001,103,49,1.635246638,F,7,0.0 +202001,103,49,1.725354675,B,8,0.0 +202001,103,49,143.7564782,C,,0.0 +202001,104,40,0.5089584770000001,F,3,0.0 +202001,104,40,0.832579579,B,5,0.0 +202001,104,40,54.70285714,C,,0.0 +202001,104,49,1.635246638,F,3,880 +202001,104,49,1.725354675,B,5,880 +202001,104,49,143.7564782,C,,880 +202001,105,40,0.5089584770000001,F,3,80 +202001,105,40,0.832579579,B,7,80 +202001,105,40,54.70285714,C,,80 +202002,100,40,0.5089584770000001,F,1,650 +202002,100,40,0.832579579,B,0,650 +202002,100,40,54.70285714,C,,650 +202002,100,49,0.5016253708279402,F,1,100 +202002,100,49,0.812637487,B,0,100 +202002,100,49,36.63728374,C,,100 
+202002,101,40,0.5089584770000001,F,2,360 +202002,101,40,0.832579579,B,5,360 +202002,101,40,54.70285714,C,,360 +202002,102,49,0.5016253708279402,F,1,940 +202002,102,49,0.812637487,B,4,940 +202002,102,49,36.63728374,C,,940 +202002,103,40,0.529851921,F,0,590 +202002,103,40,1.09453303,B,2,590 +202002,103,40,106.79005520000001,C,,590 +202002,103,49,1.201086389,F,0,870 +202002,103,49,0.705052735,B,2,870 +202002,103,49,98.26519337,C,,870 +202002,104,49,1.201086389,F,3,1270 +202002,104,49,0.705052735,B,6,1270 +202002,104,49,98.26519337,C,,1270 diff --git a/tests/test_pivot_imputation_value.py b/tests/test_pivot_imputation_value.py index 51981ebd..4f14d95c 100755 --- a/tests/test_pivot_imputation_value.py +++ b/tests/test_pivot_imputation_value.py @@ -1,6 +1,5 @@ from pathlib import Path -import numpy as np import pandas as pd import pytest from pandas.testing import assert_frame_equal @@ -44,15 +43,11 @@ def test_merge_counts(self, count_data_input, merge_counts_output): class TestPivotImputationValue: - def test_pivot_imputation_value( + def test_pivot_imputation_value_filter( self, pivot_imputation_value_output, merge_counts_output ): - expected_output = pivot_imputation_value_output - - expected_output["link_type"] = pd.Categorical( - expected_output["link_type"], categories=["F", "B", "C"], ordered=True - ) + expected_output = pivot_imputation_value_output.query("date == 202001") input_data = merge_counts_output.drop(columns=["identifier"]) @@ -60,7 +55,6 @@ def test_pivot_imputation_value( input_data, "identifier", "date", - "sic", "cell", "forward", "backward", @@ -69,10 +63,32 @@ def test_pivot_imputation_value( "imputed_value", "f_count", "b_count", + [202001], ) - actual_output["link_type"] = pd.Categorical( - actual_output["link_type"], categories=["F", "B", "C"], ordered=True + assert_frame_equal(actual_output, expected_output) + + def test_pivot_imputation_value_no_filter( + self, pivot_imputation_value_output, merge_counts_output + ): + + expected_output = 
pivot_imputation_value_output + + input_data = merge_counts_output.drop(columns=["identifier"]) + input_data = input_data.query("date in [202001, 202002]") + + actual_output = pivot_imputation_value( + input_data, + "identifier", + "date", + "cell", + "forward", + "backward", + "construction", + "question", + "imputed_value", + "f_count", + "b_count", ) assert_frame_equal(actual_output, expected_output) From 1be9d87831c0093d149d780d4bdce8544204274e Mon Sep 17 00:00:00 2001 From: hubbal Date: Wed, 31 Jul 2024 13:28:17 +0100 Subject: [PATCH 5/8] Adding selected_periods argument to docstring --- mbs_results/pivot_imputation_value.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mbs_results/pivot_imputation_value.py b/mbs_results/pivot_imputation_value.py index e06abbea..9fe71c79 100755 --- a/mbs_results/pivot_imputation_value.py +++ b/mbs_results/pivot_imputation_value.py @@ -97,6 +97,8 @@ def pivot_imputation_value( name of column in dataframe containing f_count variable b_count: str, name of column in dataframe containing b_count variable + selected_periods: list, + list containing periods to include in output Returns ------- From bad6c8e4a4a240b1014781df336fa0bd48ee2734 Mon Sep 17 00:00:00 2001 From: matthk Date: Wed, 31 Jul 2024 16:19:57 +0100 Subject: [PATCH 6/8] removing rename statement in merge_counts --- mbs_results/pivot_imputation_value.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/mbs_results/pivot_imputation_value.py b/mbs_results/pivot_imputation_value.py index 9fe71c79..faaaadfe 100755 --- a/mbs_results/pivot_imputation_value.py +++ b/mbs_results/pivot_imputation_value.py @@ -19,7 +19,7 @@ def merge_counts( Reference dataframe with identifier, date, sic, cell, forward, backward, construction, question, imputed_value count_df : pd.DataFrame - DataFrame with group, period, flag_1 and flag_2. 
+ DataFrame with group, period, f_count and b_count input_cell : str name of column in input_df dataframe containing cell variable count_cell : str @@ -36,12 +36,6 @@ def merge_counts( Dataframe resulting from the left-join of input_df and count_df on the cell and date columns. """ - count_df = count_df.rename( - columns={ - "flag_1": "f_count", - "flag_2": "b_count", - } - ) df_merge = pd.merge( input_df, count_df, From f246a25f912095795b1a6c0d4cbb076c5c97b1f3 Mon Sep 17 00:00:00 2001 From: matthk Date: Thu, 1 Aug 2024 12:02:54 +0100 Subject: [PATCH 7/8] pivot_imputation_value including sic and construction counts --- mbs_results/pivot_imputation_value.py | 23 +++-- tests/data/count_data_input.csv | 38 ++++---- tests/data/merge_counts_output.csv | 68 +++++++------- tests/data/pivot_imputation_value_output.csv | 98 ++++++++++---------- tests/test_pivot_imputation_value.py | 6 +- 5 files changed, 122 insertions(+), 111 deletions(-) diff --git a/mbs_results/pivot_imputation_value.py b/mbs_results/pivot_imputation_value.py index faaaadfe..71290b69 100755 --- a/mbs_results/pivot_imputation_value.py +++ b/mbs_results/pivot_imputation_value.py @@ -51,6 +51,7 @@ def pivot_imputation_value( df: pd.DataFrame, identifier: str, date: str, + sic: str, cell: str, forward: str, backward: str, @@ -59,6 +60,7 @@ def pivot_imputation_value( imputed_value: str, f_count: str, b_count: str, + c_count: str, selected_periods: list = None, ) -> pd.DataFrame: @@ -75,6 +77,8 @@ def pivot_imputation_value( name of column in dataframe containing identifier variable date : str name of column in dataframe containing date variable + sic : str + name of column in dataframe containing sic variable cell : str name of column in dataframe containing cell variable forward : str @@ -91,6 +95,8 @@ def pivot_imputation_value( name of column in dataframe containing f_count variable b_count: str, name of column in dataframe containing b_count variable + c_count: str, + name of column in dataframe
containing c_count variable selected_periods: list, list containing periods to include in output @@ -104,7 +110,7 @@ def pivot_imputation_value( df = df.query("{} in {}".format(date, selected_periods)) links_df = df.melt( - id_vars=[date, cell, question, imputed_value], + id_vars=[date, sic, cell, question, imputed_value], value_vars=[forward, backward, construction], var_name="link_type", value_name="imputation_link", @@ -114,34 +120,34 @@ def pivot_imputation_value( links_df["link_type"] = links_df["link_type"].map(link_type_map) counts_df = df.melt( - id_vars=[date, cell, question], - value_vars=[f_count, b_count], + id_vars=[date, sic, cell, question], + value_vars=[f_count, b_count, c_count], var_name="link_type_count", value_name="count", ) - link_type_map_count = {f_count: "F", b_count: "B"} + link_type_map_count = {f_count: "F", b_count: "B", c_count: "C"} counts_df["link_type_count"] = counts_df["link_type_count"].map(link_type_map_count) merged_df = pd.merge( links_df, counts_df, how="outer", - left_on=[date, cell, question, "link_type"], - right_on=[date, cell, question, "link_type_count"], + left_on=[date, sic, cell, question, "link_type"], + right_on=[date, sic, cell, question, "link_type_count"], ) merged_df.drop_duplicates(inplace=True) merged_df.drop(["link_type_count"], axis=1, inplace=True) merged_df = merged_df.groupby( - [date, cell, question, "link_type"], as_index=False + [date, sic, cell, question, "link_type"], as_index=False ).agg({imputed_value: "sum", "count": "first", "imputation_link": "first"}) sorting_order = {"F": 1, "B": 2, "C": 3} merged_df["sort_column"] = merged_df["link_type"].map(sorting_order) - merged_df = merged_df.sort_values([date, cell, question, "sort_column"]) + merged_df = merged_df.sort_values([date, sic, cell, question, "sort_column"]) merged_df.drop("sort_column", axis=1, inplace=True) @@ -150,6 +156,7 @@ def pivot_imputation_value( merged_df = merged_df[ [ date, + sic, cell, question, "imputation_link", diff --git 
a/tests/data/count_data_input.csv b/tests/data/count_data_input.csv index e8df1619..730a6ddf 100755 --- a/tests/data/count_data_input.csv +++ b/tests/data/count_data_input.csv @@ -1,19 +1,19 @@ -group,period,f_count,b_count -100,202001,2,3 -100,202002,1,0 -100,202003,0,2 -101,202001,5,2 -101,202002,2,5 -101,202003,3,3 -102,202001,6,4 -102,202002,1,4 -102,202003,0,1 -103,202001,7,8 -103,202002,0,2 -103,202003,2,1 -104,202001,3,5 -104,202002,3,6 -104,202003,2,3 -105,202001,3,7 -105,202002,4,2 -105,202003,3,5 +group,period,f_count,b_count,c_count +100,202001,2,3,1 +100,202002,1,0,2 +100,202003,0,2,3 +101,202001,5,2,2 +101,202002,2,5,3 +101,202003,3,3,4 +102,202001,6,4,3 +102,202002,1,4,4 +102,202003,0,1,5 +103,202001,7,8,4 +103,202002,0,2,5 +103,202003,2,1,6 +104,202001,3,5,5 +104,202002,3,6,6 +104,202003,2,3,7 +105,202001,3,7,6 +105,202002,4,2,7 +105,202003,3,5,8 diff --git a/tests/data/merge_counts_output.csv b/tests/data/merge_counts_output.csv index fb1f2648..0722abaf 100755 --- a/tests/data/merge_counts_output.csv +++ b/tests/data/merge_counts_output.csv @@ -1,34 +1,34 @@ -identifier,date,cell,forward,backward,construction,question,imputed_value,f_count,b_count -70001,202001,100,1.0,1.964796824,107.48,40,500,2,3 -70001,202002,100,0.5089584770000001,0.832579579,54.70285714,40,,1,0 -70001,202002,100,0.5016253708279402,0.812637487,36.63728374,49,100,1,0 -70001,202003,100,1.201086389,0.705052735,98.26519337,49,200,0,2 -70002,202001,100,1.0,1.964796824,107.48,40,150,2,3 -70002,202001,100,1.326666382,0.599374673,92.06463773,49,250,2,3 -70002,202002,100,0.5089584770000001,0.832579579,54.70285714,40,650,1,0 -70002,202003,100,0.529851921,1.09453303,106.79005520000001,40,800,0,2 -70003,202001,101,1.0,1.964796824,107.48,40,30,5,2 -70003,202002,101,0.5089584770000001,0.832579579,54.70285714,40,50,2,5 -70003,202003,101,1.201086389,0.705052735,98.26519337,49,170,3,3 -70004,202001,101,1.0,1.964796824,107.48,40,,5,2 
-70004,202002,101,0.5089584770000001,0.832579579,54.70285714,40,310,2,5 -70004,202003,101,0.529851921,1.09453303,106.79005520000001,40,350,3,3 -70004,202003,101,1.201086389,0.705052735,98.26519337,49,170,3,3 -70005,202001,102,1.326666382,0.599374673,92.06463773,49,750,6,4 -70005,202002,102,0.5016253708279402,0.812637487,36.63728374,49,940,1,4 -70005,202003,102,1.201086389,0.705052735,98.26519337,49,520,0,1 -70006,202001,103,0.5089584770000001,0.832579579,54.70285714,40,350,7,8 -70006,202002,103,1.201086389,0.705052735,98.26519337,49,160,0,2 -70006,202003,103,1.418333623,0.642249527,93.18857143,49,380,2,1 -70007,202001,103,0.5089584770000001,0.832579579,54.70285714,40,290,7,8 -70007,202001,103,1.635246638,1.725354675,143.7564782,49,,7,8 -70007,202002,103,0.529851921,1.09453303,106.79005520000001,40,590,0,2 -70007,202002,103,1.201086389,0.705052735,98.26519337,49,710,0,2 -70007,202003,103,0.913631634,1.0,70.24,40,280,2,1 -70008,202001,104,0.5089584770000001,0.832579579,54.70285714,40,,3,5 -70008,202002,104,1.201086389,0.705052735,98.26519337,49,660,3,6 -70008,202003,104,1.418333623,0.642249527,93.18857143,49,220,2,3 -70009,202001,104,1.635246638,1.725354675,143.7564782,49,880,3,5 -70009,202002,104,1.201086389,0.705052735,98.26519337,49,610,3,6 -70009,202003,104,1.418333623,0.642249527,93.18857143,49,90,2,3 -70010,202001,105,0.5089584770000001,0.832579579,54.70285714,40,80,3,7 +identifier,date,sic,cell,forward,backward,construction,question,imputed_value,f_count,b_count,c_count +70001,202001,12,100,1.0,1.964796824,107.48,40,500,2,3,1 +70001,202002,12,100,0.5089584770000001,0.832579579,54.70285714,40,,1,0,2 +70001,202002,12,100,0.5016253708279402,0.812637487,36.63728374,49,100,1,0,2 +70001,202003,12,100,1.201086389,0.705052735,98.26519337,49,200,0,2,3 +70002,202001,12,100,1.0,1.964796824,107.48,40,150,2,3,1 +70002,202001,12,100,1.326666382,0.599374673,92.06463773,49,250,2,3,1 +70002,202002,12,100,0.5089584770000001,0.832579579,54.70285714,40,650,1,0,2 
+70002,202003,12,100,0.529851921,1.09453303,106.79005520000001,40,800,0,2,3 +70003,202001,12,101,1.0,1.964796824,107.48,40,30,5,2,2 +70003,202002,12,101,0.5089584770000001,0.832579579,54.70285714,40,50,2,5,3 +70003,202003,12,101,1.201086389,0.705052735,98.26519337,49,170,3,3,4 +70004,202001,12,101,1.0,1.964796824,107.48,40,,5,2,2 +70004,202002,12,101,0.5089584770000001,0.832579579,54.70285714,40,310,2,5,3 +70004,202003,12,101,0.529851921,1.09453303,106.79005520000001,40,350,3,3,4 +70004,202003,12,101,1.201086389,0.705052735,98.26519337,49,170,3,3,4 +70005,202001,12,102,1.326666382,0.599374673,92.06463773,49,750,6,4,3 +70005,202002,12,102,0.5016253708279402,0.812637487,36.63728374,49,940,1,4,4 +70005,202003,12,102,1.201086389,0.705052735,98.26519337,49,520,0,1,5 +70006,202001,13,103,0.5089584770000001,0.832579579,54.70285714,40,350,7,8,4 +70006,202002,13,103,1.201086389,0.705052735,98.26519337,49,160,0,2,5 +70006,202003,13,103,1.418333623,0.642249527,93.18857143,49,380,2,1,6 +70007,202001,13,103,0.5089584770000001,0.832579579,54.70285714,40,290,7,8,4 +70007,202001,13,103,1.635246638,1.725354675,143.7564782,49,,7,8,4 +70007,202002,13,103,0.529851921,1.09453303,106.79005520000001,40,590,0,2,5 +70007,202002,13,103,1.201086389,0.705052735,98.26519337,49,710,0,2,5 +70007,202003,13,103,0.913631634,1.0,70.24,40,280,2,1,6 +70008,202001,13,104,0.5089584770000001,0.832579579,54.70285714,40,,3,5,5 +70008,202002,13,104,1.201086389,0.705052735,98.26519337,49,660,3,6,6 +70008,202003,13,104,1.418333623,0.642249527,93.18857143,49,220,2,3,7 +70009,202001,13,104,1.635246638,1.725354675,143.7564782,49,880,3,5,5 +70009,202002,13,104,1.201086389,0.705052735,98.26519337,49,610,3,6,6 +70009,202003,13,104,1.418333623,0.642249527,93.18857143,49,90,2,3,7 +70010,202001,13,105,0.5089584770000001,0.832579579,54.70285714,40,80,3,7,6 diff --git a/tests/data/pivot_imputation_value_output.csv b/tests/data/pivot_imputation_value_output.csv index 456e6890..17e9f93c 100755 --- 
a/tests/data/pivot_imputation_value_output.csv +++ b/tests/data/pivot_imputation_value_output.csv @@ -1,49 +1,49 @@ -date,cell,question,imputation_link,link_type,count,imputed_value -202001,100,40,1.0,F,2,650 -202001,100,40,1.964796824,B,3,650 -202001,100,40,107.48,C,,650 -202001,100,49,1.326666382,F,2,250 -202001,100,49,0.599374673,B,3,250 -202001,100,49,92.06463773,C,,250 -202001,101,40,1.0,F,5,30 -202001,101,40,1.964796824,B,2,30 -202001,101,40,107.48,C,,30 -202001,102,49,1.326666382,F,6,750 -202001,102,49,0.599374673,B,4,750 -202001,102,49,92.06463773,C,,750 -202001,103,40,0.5089584770000001,F,7,640 -202001,103,40,0.832579579,B,8,640 -202001,103,40,54.70285714,C,,640 -202001,103,49,1.635246638,F,7,0.0 -202001,103,49,1.725354675,B,8,0.0 -202001,103,49,143.7564782,C,,0.0 -202001,104,40,0.5089584770000001,F,3,0.0 -202001,104,40,0.832579579,B,5,0.0 -202001,104,40,54.70285714,C,,0.0 -202001,104,49,1.635246638,F,3,880 -202001,104,49,1.725354675,B,5,880 -202001,104,49,143.7564782,C,,880 -202001,105,40,0.5089584770000001,F,3,80 -202001,105,40,0.832579579,B,7,80 -202001,105,40,54.70285714,C,,80 -202002,100,40,0.5089584770000001,F,1,650 -202002,100,40,0.832579579,B,0,650 -202002,100,40,54.70285714,C,,650 -202002,100,49,0.5016253708279402,F,1,100 -202002,100,49,0.812637487,B,0,100 -202002,100,49,36.63728374,C,,100 -202002,101,40,0.5089584770000001,F,2,360 -202002,101,40,0.832579579,B,5,360 -202002,101,40,54.70285714,C,,360 -202002,102,49,0.5016253708279402,F,1,940 -202002,102,49,0.812637487,B,4,940 -202002,102,49,36.63728374,C,,940 -202002,103,40,0.529851921,F,0,590 -202002,103,40,1.09453303,B,2,590 -202002,103,40,106.79005520000001,C,,590 -202002,103,49,1.201086389,F,0,870 -202002,103,49,0.705052735,B,2,870 -202002,103,49,98.26519337,C,,870 -202002,104,49,1.201086389,F,3,1270 -202002,104,49,0.705052735,B,6,1270 -202002,104,49,98.26519337,C,,1270 +date,sic,cell,question,imputation_link,link_type,count,imputed_value +202001,12,100,40,1.0,F,2,650 
+202001,12,100,40,1.964796824,B,3,650 +202001,12,100,40,107.48,C,1,650 +202001,12,100,49,1.326666382,F,2,250 +202001,12,100,49,0.599374673,B,3,250 +202001,12,100,49,92.06463773,C,1,250 +202001,12,101,40,1.0,F,5,30 +202001,12,101,40,1.964796824,B,2,30 +202001,12,101,40,107.48,C,2,30 +202001,12,102,49,1.326666382,F,6,750 +202001,12,102,49,0.599374673,B,4,750 +202001,12,102,49,92.06463773,C,3,750 +202001,13,103,40,0.5089584770000001,F,7,640 +202001,13,103,40,0.832579579,B,8,640 +202001,13,103,40,54.70285714,C,4,640 +202001,13,103,49,1.635246638,F,7,0.0 +202001,13,103,49,1.725354675,B,8,0.0 +202001,13,103,49,143.7564782,C,4,0.0 +202001,13,104,40,0.5089584770000001,F,3,0.0 +202001,13,104,40,0.832579579,B,5,0.0 +202001,13,104,40,54.70285714,C,5,0.0 +202001,13,104,49,1.635246638,F,3,880 +202001,13,104,49,1.725354675,B,5,880 +202001,13,104,49,143.7564782,C,5,880 +202001,13,105,40,0.5089584770000001,F,3,80 +202001,13,105,40,0.832579579,B,7,80 +202001,13,105,40,54.70285714,C,6,80 +202002,12,100,40,0.5089584770000001,F,1,650 +202002,12,100,40,0.832579579,B,0,650 +202002,12,100,40,54.70285714,C,2,650 +202002,12,100,49,0.5016253708279402,F,1,100 +202002,12,100,49,0.812637487,B,0,100 +202002,12,100,49,36.63728374,C,2,100 +202002,12,101,40,0.5089584770000001,F,2,360 +202002,12,101,40,0.832579579,B,5,360 +202002,12,101,40,54.70285714,C,3,360 +202002,12,102,49,0.5016253708279402,F,1,940 +202002,12,102,49,0.812637487,B,4,940 +202002,12,102,49,36.63728374,C,4,940 +202002,13,103,40,0.529851921,F,0,590 +202002,13,103,40,1.09453303,B,2,590 +202002,13,103,40,106.79005520000001,C,5,590 +202002,13,103,49,1.201086389,F,0,870 +202002,13,103,49,0.705052735,B,2,870 +202002,13,103,49,98.26519337,C,5,870 +202002,13,104,49,1.201086389,F,3,1270 +202002,13,104,49,0.705052735,B,6,1270 +202002,13,104,49,98.26519337,C,6,1270 diff --git a/tests/test_pivot_imputation_value.py b/tests/test_pivot_imputation_value.py index 4f14d95c..fe4b7f34 100755 --- a/tests/test_pivot_imputation_value.py +++ 
b/tests/test_pivot_imputation_value.py @@ -32,7 +32,7 @@ def pivot_imputation_value_output(filepath): class TestMergeCounts: def test_merge_counts(self, count_data_input, merge_counts_output): - input_df = merge_counts_output.drop(columns=["f_count", "b_count"]) + input_df = merge_counts_output.drop(columns=["f_count", "b_count", "c_count"]) actual_output = merge_counts( input_df, count_data_input, "cell", "group", "date", "period", "identifier" @@ -55,6 +55,7 @@ def test_pivot_imputation_value_filter( input_data, "identifier", "date", + "sic", "cell", "forward", "backward", @@ -63,6 +64,7 @@ def test_pivot_imputation_value_filter( "imputed_value", "f_count", "b_count", + "c_count", [202001], ) @@ -81,6 +83,7 @@ def test_pivot_imputation_value_no_filter( input_data, "identifier", "date", + "sic", "cell", "forward", "backward", @@ -89,6 +92,7 @@ def test_pivot_imputation_value_no_filter( "imputed_value", "f_count", "b_count", + "c_count", ) assert_frame_equal(actual_output, expected_output) From 1badbdfd0e22d94a428945d335d973fb122c4526 Mon Sep 17 00:00:00 2001 From: hubbal Date: Wed, 14 Aug 2024 11:42:38 +0100 Subject: [PATCH 8/8] Changing arguments in pivot_imputation_value to lists --- mbs_results/pivot_imputation_value.py | 74 +++++++++------------------ tests/test_pivot_imputation_value.py | 26 +++------- 2 files changed, 31 insertions(+), 69 deletions(-) diff --git a/mbs_results/pivot_imputation_value.py b/mbs_results/pivot_imputation_value.py index 71290b69..91e08a0c 100755 --- a/mbs_results/pivot_imputation_value.py +++ b/mbs_results/pivot_imputation_value.py @@ -50,17 +50,10 @@ def merge_counts( def pivot_imputation_value( df: pd.DataFrame, identifier: str, - date: str, - sic: str, - cell: str, - forward: str, - backward: str, - construction: str, - question: str, + groups: list, + link_columns: list, + count_columns: list, imputed_value: str, - f_count: str, - b_count: str, - c_count: str, selected_periods: list = None, ) -> pd.DataFrame: @@ -75,29 +68,15 
@@ def pivot_imputation_value( by identifier, cell, date, and question identifier : str name of column in dataframe containing identifier variable - date : str - name of column in dataframe containing date variable - sic : str - name of column in dataframe containing sic variable - cell : str - name of column in dataframe containing cell variable - forward : str - name of column in dataframe containing forward link variable - backward : str - name of column in dataframe containing backward link variable - construction : str - name of column in dataframe containing construction link variable - question : str - name of column in dataframe containing question code variable + groups : list + + link_columns : list + + count_columns : list + imputed_value: str name of column in dataframe containing imputed_value variable - f_count: str, - name of column in dataframe containing f_count variable - b_count: str, - name of column in dataframe containing b_count variable - c_count: str, - name of column in dataframe containing c_count variable - selected_periods: list, + selected_periods: list list containing periods to include in output Returns @@ -107,58 +86,55 @@ def pivot_imputation_value( """ if selected_periods is not None: - df = df.query("{} in {}".format(date, selected_periods)) + df = df.query("{} in {}".format(groups[0], selected_periods)) links_df = df.melt( - id_vars=[date, sic, cell, question, imputed_value], - value_vars=[forward, backward, construction], + id_vars=groups + [imputed_value], + value_vars=link_columns, var_name="link_type", value_name="imputation_link", ) - link_type_map = {forward: "F", backward: "B", construction: "C"} + link_type_map = dict(zip(link_columns, ["F", "B", "C"])) links_df["link_type"] = links_df["link_type"].map(link_type_map) counts_df = df.melt( - id_vars=[date, sic, cell, question], - value_vars=[f_count, b_count, c_count], + id_vars=groups, + value_vars=count_columns, var_name="link_type_count", value_name="count", ) - 
link_type_map_count = {f_count: "F", b_count: "B", c_count: "C"} + link_type_map_count = dict(zip(count_columns, ["F", "B", "C"])) counts_df["link_type_count"] = counts_df["link_type_count"].map(link_type_map_count) merged_df = pd.merge( links_df, counts_df, how="outer", - left_on=[date, sic, cell, question, "link_type"], - right_on=[date, sic, cell, question, "link_type_count"], + left_on=groups + ["link_type"], + right_on=groups + ["link_type_count"], ) merged_df.drop_duplicates(inplace=True) merged_df.drop(["link_type_count"], axis=1, inplace=True) - merged_df = merged_df.groupby( - [date, sic, cell, question, "link_type"], as_index=False - ).agg({imputed_value: "sum", "count": "first", "imputation_link": "first"}) + merged_df = merged_df.groupby(groups + ["link_type"], as_index=False).agg( + {imputed_value: "sum", "count": "first", "imputation_link": "first"} + ) sorting_order = {"F": 1, "B": 2, "C": 3} merged_df["sort_column"] = merged_df["link_type"].map(sorting_order) - merged_df = merged_df.sort_values([date, sic, cell, question, "sort_column"]) + merged_df = merged_df.sort_values(groups + ["sort_column"]) merged_df.drop("sort_column", axis=1, inplace=True) merged_df.reset_index(drop=True, inplace=True) merged_df = merged_df[ - [ - date, - sic, - cell, - question, + groups + + [ "imputation_link", "link_type", "count", diff --git a/tests/test_pivot_imputation_value.py b/tests/test_pivot_imputation_value.py index fe4b7f34..4101e1bd 100755 --- a/tests/test_pivot_imputation_value.py +++ b/tests/test_pivot_imputation_value.py @@ -54,17 +54,10 @@ def test_pivot_imputation_value_filter( actual_output = pivot_imputation_value( input_data, "identifier", - "date", - "sic", - "cell", - "forward", - "backward", - "construction", - "question", + ["date", "sic", "cell", "question"], + ["forward", "backward", "construction"], + ["f_count", "b_count", "c_count"], "imputed_value", - "f_count", - "b_count", - "c_count", [202001], ) @@ -82,17 +75,10 @@ def 
test_pivot_imputation_value_no_filter( actual_output = pivot_imputation_value( input_data, "identifier", - "date", - "sic", - "cell", - "forward", - "backward", - "construction", - "question", + ["date", "sic", "cell", "question"], + ["forward", "backward", "construction"], + ["f_count", "b_count", "c_count"], "imputed_value", - "f_count", - "b_count", - "c_count", ) assert_frame_equal(actual_output, expected_output)