-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #63 from ONSdigital/465-imputation-links
465 imputation links
- Loading branch information
Showing
5 changed files
with
331 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,145 @@ | ||
import pandas as pd | ||
|
||
|
||
def merge_counts(
    input_df: pd.DataFrame,
    count_df: pd.DataFrame,
    input_cell: str,
    count_cell: str,
    input_date: str,
    count_date: str,
    identifier: str,
) -> pd.DataFrame:
    """
    Return the input data with the columns of count_df merged on.

    Parameters
    ----------
    input_df : pd.DataFrame
        Reference dataframe with identifier, date, sic, cell, forward, backward,
        construction, question, imputed_value
    count_df : pd.DataFrame
        DataFrame with group, period and the count columns
        (e.g. f_count, b_count)
    input_cell : str
        name of column in input_df dataframe containing cell variable
    count_cell : str
        name of column in count_df dataframe containing cell variable
    input_date : str
        name of column in input_df dataframe containing date variable
    count_date : str
        name of column in count_df dataframe containing date variable
    identifier : str
        name of column in input_df containing identifier variable; the column
        is cast to int in the result

    Returns
    -------
    pd.DataFrame
        Dataframe resulting from the left-join of input_df and count_df on the
        cell and date columns. The count_df key columns are removed from the
        result, the input_df key columns are always kept.
    """
    input_keys = [input_cell, input_date]
    count_keys = [count_cell, count_date]

    df_merge = pd.merge(
        input_df,
        count_df,
        how="left",
        left_on=input_keys,
        right_on=count_keys,
    ).astype({identifier: "int"})

    # When a count key shares its name with the matching input key, pandas
    # keeps a single merged column; dropping it would delete the input key,
    # so only count keys with a different name are removed.
    redundant_keys = [
        count_key
        for input_key, count_key in zip(input_keys, count_keys)
        if count_key != input_key
    ]

    return df_merge.drop(columns=redundant_keys)
|
||
|
||
def pivot_imputation_value(
    df: pd.DataFrame,
    identifier: str,
    groups: list,
    link_columns: list,
    count_columns: list,
    imputed_value: str,
    selected_periods: list = None,
) -> pd.DataFrame:
    """
    Return the imputed value totals in long format, one row per group and
    link type.

    Parameters
    ----------
    df : pd.DataFrame
        Reference dataframe containing links, count values, and imputed values
        by the grouping columns
    identifier : str
        name of column in dataframe containing identifier variable; not used
        by this function, kept so existing callers keep working
    groups : list
        names of the columns to group by; the first entry is the column that
        selected_periods is matched against
    link_columns : list
        names of the link columns, in the order forward, backward,
        construction (labelled "F", "B", "C" in the output)
    count_columns : list
        names of the count columns, in the same order as link_columns
    imputed_value : str
        name of column in dataframe containing imputed_value variable
    selected_periods : list, optional
        list containing periods to include in output; when None no filtering
        is applied

    Returns
    -------
    pd.DataFrame
        One row per group and link type with the columns
        groups + ["imputation_link", "link_type", "count", imputed_value],
        sorted by groups and then by link type in the order F, B, C.
        imputed_value is the sum over all input rows of the group, while
        imputation_link and count are the first non-missing value in the group.
    """
    if selected_periods is not None:
        # Boolean mask instead of a string-built query, so column names that
        # are not valid identifiers still work.
        df = df[df[groups[0]].isin(selected_periods)]

    link_labels = ["F", "B", "C"]
    link_type_map = dict(zip(link_columns, link_labels))
    count_type_map = dict(zip(count_columns, link_labels))

    # Aggregate to one row per group BEFORE reshaping. Merging the melted
    # link and count frames row-by-row is many-to-many, and de-duplicating
    # that product collapses distinct input rows that happen to share the
    # same imputed value, which understates the total.
    aggregations = {imputed_value: "sum"}
    aggregations.update({column: "first" for column in link_columns})
    aggregations.update({column: "first" for column in count_columns})
    summary = df.groupby(groups, as_index=False).agg(aggregations)

    links_df = summary.melt(
        id_vars=groups + [imputed_value],
        value_vars=link_columns,
        var_name="link_type",
        value_name="imputation_link",
    )
    links_df["link_type"] = links_df["link_type"].map(link_type_map)
    # Columns beyond the three known link types have no label and are left out.
    links_df = links_df.dropna(subset=["link_type"])

    counts_df = summary.melt(
        id_vars=groups,
        value_vars=count_columns,
        var_name="link_type",
        value_name="count",
    )
    counts_df["link_type"] = counts_df["link_type"].map(count_type_map)
    counts_df = counts_df.dropna(subset=["link_type"])

    # One-to-one join: every (group, link type) pair occurs at most once on
    # each side after the aggregation above.
    merged_df = links_df.merge(counts_df, how="left", on=groups + ["link_type"])

    sorting_order = {"F": 1, "B": 2, "C": 3}
    merged_df["sort_column"] = merged_df["link_type"].map(sorting_order)
    merged_df = (
        merged_df.sort_values(groups + ["sort_column"])
        .drop(columns="sort_column")
        .reset_index(drop=True)
    )

    return merged_df[
        groups
        + [
            "imputation_link",
            "link_type",
            "count",
            imputed_value,
        ]
    ]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
group,period,f_count,b_count,c_count | ||
100,202001,2,3,1 | ||
100,202002,1,0,2 | ||
100,202003,0,2,3 | ||
101,202001,5,2,2 | ||
101,202002,2,5,3 | ||
101,202003,3,3,4 | ||
102,202001,6,4,3 | ||
102,202002,1,4,4 | ||
102,202003,0,1,5 | ||
103,202001,7,8,4 | ||
103,202002,0,2,5 | ||
103,202003,2,1,6 | ||
104,202001,3,5,5 | ||
104,202002,3,6,6 | ||
104,202003,2,3,7 | ||
105,202001,3,7,6 | ||
105,202002,4,2,7 | ||
105,202003,3,5,8 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
identifier,date,sic,cell,forward,backward,construction,question,imputed_value,f_count,b_count,c_count | ||
70001,202001,12,100,1.0,1.964796824,107.48,40,500,2,3,1 | ||
70001,202002,12,100,0.5089584770000001,0.832579579,54.70285714,40,,1,0,2 | ||
70001,202002,12,100,0.5016253708279402,0.812637487,36.63728374,49,100,1,0,2 | ||
70001,202003,12,100,1.201086389,0.705052735,98.26519337,49,200,0,2,3 | ||
70002,202001,12,100,1.0,1.964796824,107.48,40,150,2,3,1 | ||
70002,202001,12,100,1.326666382,0.599374673,92.06463773,49,250,2,3,1 | ||
70002,202002,12,100,0.5089584770000001,0.832579579,54.70285714,40,650,1,0,2 | ||
70002,202003,12,100,0.529851921,1.09453303,106.79005520000001,40,800,0,2,3 | ||
70003,202001,12,101,1.0,1.964796824,107.48,40,30,5,2,2 | ||
70003,202002,12,101,0.5089584770000001,0.832579579,54.70285714,40,50,2,5,3 | ||
70003,202003,12,101,1.201086389,0.705052735,98.26519337,49,170,3,3,4 | ||
70004,202001,12,101,1.0,1.964796824,107.48,40,,5,2,2 | ||
70004,202002,12,101,0.5089584770000001,0.832579579,54.70285714,40,310,2,5,3 | ||
70004,202003,12,101,0.529851921,1.09453303,106.79005520000001,40,350,3,3,4 | ||
70004,202003,12,101,1.201086389,0.705052735,98.26519337,49,170,3,3,4 | ||
70005,202001,12,102,1.326666382,0.599374673,92.06463773,49,750,6,4,3 | ||
70005,202002,12,102,0.5016253708279402,0.812637487,36.63728374,49,940,1,4,4 | ||
70005,202003,12,102,1.201086389,0.705052735,98.26519337,49,520,0,1,5 | ||
70006,202001,13,103,0.5089584770000001,0.832579579,54.70285714,40,350,7,8,4 | ||
70006,202002,13,103,1.201086389,0.705052735,98.26519337,49,160,0,2,5 | ||
70006,202003,13,103,1.418333623,0.642249527,93.18857143,49,380,2,1,6 | ||
70007,202001,13,103,0.5089584770000001,0.832579579,54.70285714,40,290,7,8,4 | ||
70007,202001,13,103,1.635246638,1.725354675,143.7564782,49,,7,8,4 | ||
70007,202002,13,103,0.529851921,1.09453303,106.79005520000001,40,590,0,2,5 | ||
70007,202002,13,103,1.201086389,0.705052735,98.26519337,49,710,0,2,5 | ||
70007,202003,13,103,0.913631634,1.0,70.24,40,280,2,1,6 | ||
70008,202001,13,104,0.5089584770000001,0.832579579,54.70285714,40,,3,5,5 | ||
70008,202002,13,104,1.201086389,0.705052735,98.26519337,49,660,3,6,6 | ||
70008,202003,13,104,1.418333623,0.642249527,93.18857143,49,220,2,3,7 | ||
70009,202001,13,104,1.635246638,1.725354675,143.7564782,49,880,3,5,5 | ||
70009,202002,13,104,1.201086389,0.705052735,98.26519337,49,610,3,6,6 | ||
70009,202003,13,104,1.418333623,0.642249527,93.18857143,49,90,2,3,7 | ||
70010,202001,13,105,0.5089584770000001,0.832579579,54.70285714,40,80,3,7,6 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
date,sic,cell,question,imputation_link,link_type,count,imputed_value | ||
202001,12,100,40,1.0,F,2,650 | ||
202001,12,100,40,1.964796824,B,3,650 | ||
202001,12,100,40,107.48,C,1,650 | ||
202001,12,100,49,1.326666382,F,2,250 | ||
202001,12,100,49,0.599374673,B,3,250 | ||
202001,12,100,49,92.06463773,C,1,250 | ||
202001,12,101,40,1.0,F,5,30 | ||
202001,12,101,40,1.964796824,B,2,30 | ||
202001,12,101,40,107.48,C,2,30 | ||
202001,12,102,49,1.326666382,F,6,750 | ||
202001,12,102,49,0.599374673,B,4,750 | ||
202001,12,102,49,92.06463773,C,3,750 | ||
202001,13,103,40,0.5089584770000001,F,7,640 | ||
202001,13,103,40,0.832579579,B,8,640 | ||
202001,13,103,40,54.70285714,C,4,640 | ||
202001,13,103,49,1.635246638,F,7,0.0 | ||
202001,13,103,49,1.725354675,B,8,0.0 | ||
202001,13,103,49,143.7564782,C,4,0.0 | ||
202001,13,104,40,0.5089584770000001,F,3,0.0 | ||
202001,13,104,40,0.832579579,B,5,0.0 | ||
202001,13,104,40,54.70285714,C,5,0.0 | ||
202001,13,104,49,1.635246638,F,3,880 | ||
202001,13,104,49,1.725354675,B,5,880 | ||
202001,13,104,49,143.7564782,C,5,880 | ||
202001,13,105,40,0.5089584770000001,F,3,80 | ||
202001,13,105,40,0.832579579,B,7,80 | ||
202001,13,105,40,54.70285714,C,6,80 | ||
202002,12,100,40,0.5089584770000001,F,1,650 | ||
202002,12,100,40,0.832579579,B,0,650 | ||
202002,12,100,40,54.70285714,C,2,650 | ||
202002,12,100,49,0.5016253708279402,F,1,100 | ||
202002,12,100,49,0.812637487,B,0,100 | ||
202002,12,100,49,36.63728374,C,2,100 | ||
202002,12,101,40,0.5089584770000001,F,2,360 | ||
202002,12,101,40,0.832579579,B,5,360 | ||
202002,12,101,40,54.70285714,C,3,360 | ||
202002,12,102,49,0.5016253708279402,F,1,940 | ||
202002,12,102,49,0.812637487,B,4,940 | ||
202002,12,102,49,36.63728374,C,4,940 | ||
202002,13,103,40,0.529851921,F,0,590 | ||
202002,13,103,40,1.09453303,B,2,590 | ||
202002,13,103,40,106.79005520000001,C,5,590 | ||
202002,13,103,49,1.201086389,F,0,870 | ||
202002,13,103,49,0.705052735,B,2,870 | ||
202002,13,103,49,98.26519337,C,5,870 | ||
202002,13,104,49,1.201086389,F,3,1270 | ||
202002,13,104,49,0.705052735,B,6,1270 | ||
202002,13,104,49,98.26519337,C,6,1270 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
from pathlib import Path | ||
|
||
import pandas as pd | ||
import pytest | ||
from pandas.testing import assert_frame_equal | ||
|
||
from mbs_results.pivot_imputation_value import merge_counts, pivot_imputation_value | ||
|
||
|
||
@pytest.fixture(scope="class")
def filepath():
    """Return the tests directory as a path relative to the working directory."""
    tests_root = Path("tests")
    return tests_root
|
||
|
||
@pytest.fixture(scope="class")
def count_data_input(filepath):
    """Load the count data CSV used as the right-hand side of the merge."""
    csv_path = filepath.joinpath("data", "count_data_input.csv")
    return pd.read_csv(csv_path, index_col=False)
|
||
|
||
@pytest.fixture(scope="class")
def merge_counts_output(filepath):
    """Load the CSV holding the expected result of merge_counts."""
    csv_path = filepath.joinpath("data", "merge_counts_output.csv")
    return pd.read_csv(csv_path, index_col=False)
|
||
|
||
@pytest.fixture(scope="class")
def pivot_imputation_value_output(filepath):
    """Load the CSV holding the expected result of pivot_imputation_value."""
    csv_path = filepath.joinpath("data", "pivot_imputation_value_output.csv")
    return pd.read_csv(csv_path, index_col=False)
|
||
|
||
class TestMergeCounts:
    """Tests for merge_counts."""

    def test_merge_counts(self, count_data_input, merge_counts_output):
        """Merging the counts back onto the count-free input restores the expected frame."""
        # The input is the expected output with the count columns stripped off.
        count_columns = ["f_count", "b_count", "c_count"]
        input_df = merge_counts_output.drop(columns=count_columns)

        actual_output = merge_counts(
            input_df=input_df,
            count_df=count_data_input,
            input_cell="cell",
            count_cell="group",
            input_date="date",
            count_date="period",
            identifier="identifier",
        )

        assert_frame_equal(actual_output, merge_counts_output)
|
||
|
||
class TestPivotImputationValue:
    """Tests for pivot_imputation_value, with and without a period filter."""

    @staticmethod
    def _pivot(input_data, selected_periods=None):
        """Call pivot_imputation_value with the column names shared by both tests."""
        return pivot_imputation_value(
            input_data,
            "identifier",
            ["date", "sic", "cell", "question"],
            ["forward", "backward", "construction"],
            ["f_count", "b_count", "c_count"],
            "imputed_value",
            selected_periods,
        )

    def test_pivot_imputation_value_filter(
        self, pivot_imputation_value_output, merge_counts_output
    ):
        """Only the selected period appears in the output when a filter is given."""
        input_data = merge_counts_output.drop(columns=["identifier"])
        expected_output = pivot_imputation_value_output.query("date == 202001")

        actual_output = self._pivot(input_data, [202001])

        assert_frame_equal(actual_output, expected_output)

    def test_pivot_imputation_value_no_filter(
        self, pivot_imputation_value_output, merge_counts_output
    ):
        """Every period present in the input is returned when no filter is given."""
        # The input is pre-filtered to the periods present in the expected CSV.
        input_data = merge_counts_output.drop(columns=["identifier"]).query(
            "date in [202001, 202002]"
        )

        actual_output = self._pivot(input_data)

        assert_frame_equal(actual_output, pivot_imputation_value_output)