Skip to content

Commit

Permalink
Merge pull request #63 from ONSdigital/465-imputation-links
Browse files Browse the repository at this point in the history
465 imputation links
  • Loading branch information
KateyMatthews authored Aug 19, 2024
2 parents 36b8fd4 + 1badbdf commit 60efe2a
Show file tree
Hide file tree
Showing 5 changed files with 331 additions and 0 deletions.
145 changes: 145 additions & 0 deletions mbs_results/pivot_imputation_value.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
import pandas as pd


def merge_counts(
    input_df: pd.DataFrame,
    count_df: pd.DataFrame,
    input_cell: str,
    count_cell: str,
    input_date: str,
    count_date: str,
    identifier: str,
) -> pd.DataFrame:
    """
    Return the input data with the count columns of count_df merged on.

    Parameters
    ----------
    input_df : pd.DataFrame
        Reference dataframe with identifier, date, sic, cell, forward, backward,
        construction, question, imputed_value
    count_df : pd.DataFrame
        DataFrame with a cell column, a date column and the count columns
        (e.g. f_count and b_count)
    input_cell : str
        name of column in input_df dataframe containing cell variable
    count_cell : str
        name of column in count_df dataframe containing cell variable
    input_date : str
        name of column in input_df dataframe containing date variable
    count_date : str
        name of column in count_df dataframe containing date variable
    identifier : str
        name of column in input_df containing identifier variable; it is cast
        to int after the merge

    Returns
    -------
    pd.DataFrame
        Dataframe resulting from the left-join of input_df and count_df on the
        cell and date columns. The key columns of input_df are always kept; key
        columns contributed only by count_df are removed.
    """
    left_keys = [input_cell, input_date]
    right_keys = [count_cell, count_date]

    df_merge = pd.merge(
        input_df,
        count_df,
        how="left",
        left_on=left_keys,
        right_on=right_keys,
    ).astype({identifier: "int"})

    # When a key has the same name on both sides pandas keeps a single shared
    # column, so dropping it would remove input_df's own key. Only the
    # right-hand keys that appear under a different name are redundant.
    redundant_keys = [
        right for left, right in zip(left_keys, right_keys) if right != left
    ]

    return df_merge.drop(columns=redundant_keys)


def pivot_imputation_value(
    df: pd.DataFrame,
    identifier: str,
    groups: list,
    link_columns: list,
    count_columns: list,
    imputed_value: str,
    selected_periods: list = None,
) -> pd.DataFrame:
    """
    Return the summed imputed value per group, pivoted by imputation link type.

    The output has one row per group and link type, holding the link value,
    the matching count and the group's total imputed value.

    Parameters
    ----------
    df : pd.DataFrame
        Reference dataframe containing links, count values, and imputed values
        by identifier, cell, date, and question
    identifier : str
        name of column in dataframe containing identifier variable; not used
        by the calculation, kept so existing callers keep working
    groups : list
        names of the columns to group by; the first entry must be the period
        column because selected_periods is filtered against it
    link_columns : list
        names of the link columns; by position they are labelled "F", "B"
        and "C" in the link_type output column
    count_columns : list
        names of the count columns, matched by position to the same "F", "B"
        and "C" labels; a link type without a count column gets a missing count
    imputed_value: str
        name of column in dataframe containing imputed_value variable
    selected_periods: list
        list containing periods to include in output; all periods are kept
        when None

    Returns
    -------
    pd.DataFrame
        Dataframe with the columns groups, imputation_link, link_type, count
        and imputed_value, sorted by groups and then by link type in the order
        F, B, C.
    """
    if selected_periods is not None:
        df = df[df[groups[0]].isin(selected_periods)]

    labels = ["F", "B", "C"]
    labelled_links = list(link_columns)[: len(labels)]
    labelled_counts = list(count_columns)[: len(labels)]

    # Aggregate to one row per group BEFORE reshaping. Merging the un-aggregated
    # links and counts is many-to-many within a group, and de-duplicating that
    # product silently collapses rows that share the same imputed value, which
    # under-counts the group total.
    aggregations = {imputed_value: "sum"}
    aggregations.update(dict.fromkeys(labelled_links + labelled_counts, "first"))
    per_group = df.groupby(groups, as_index=False).agg(aggregations)

    links_df = per_group.melt(
        id_vars=groups + [imputed_value],
        value_vars=labelled_links,
        var_name="link_type",
        value_name="imputation_link",
    )
    links_df["link_type"] = links_df["link_type"].map(
        dict(zip(labelled_links, labels))
    )

    counts_df = per_group.melt(
        id_vars=groups,
        value_vars=labelled_counts,
        var_name="link_type",
        value_name="count",
    )
    counts_df["link_type"] = counts_df["link_type"].map(
        dict(zip(labelled_counts, labels))
    )

    # Both sides now hold at most one row per group and link type, so this
    # join cannot multiply rows.
    merged_df = pd.merge(
        links_df,
        counts_df,
        how="left",
        on=groups + ["link_type"],
    )

    # Order link types as F, B, C rather than alphabetically.
    sorting_order = {label: position for position, label in enumerate(labels)}
    merged_df["sort_column"] = merged_df["link_type"].map(sorting_order)
    merged_df = merged_df.sort_values(groups + ["sort_column"]).reset_index(
        drop=True
    )

    return merged_df[
        groups
        + [
            "imputation_link",
            "link_type",
            "count",
            imputed_value,
        ]
    ]
19 changes: 19 additions & 0 deletions tests/data/count_data_input.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
group,period,f_count,b_count,c_count
100,202001,2,3,1
100,202002,1,0,2
100,202003,0,2,3
101,202001,5,2,2
101,202002,2,5,3
101,202003,3,3,4
102,202001,6,4,3
102,202002,1,4,4
102,202003,0,1,5
103,202001,7,8,4
103,202002,0,2,5
103,202003,2,1,6
104,202001,3,5,5
104,202002,3,6,6
104,202003,2,3,7
105,202001,3,7,6
105,202002,4,2,7
105,202003,3,5,8
34 changes: 34 additions & 0 deletions tests/data/merge_counts_output.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
identifier,date,sic,cell,forward,backward,construction,question,imputed_value,f_count,b_count,c_count
70001,202001,12,100,1.0,1.964796824,107.48,40,500,2,3,1
70001,202002,12,100,0.5089584770000001,0.832579579,54.70285714,40,,1,0,2
70001,202002,12,100,0.5016253708279402,0.812637487,36.63728374,49,100,1,0,2
70001,202003,12,100,1.201086389,0.705052735,98.26519337,49,200,0,2,3
70002,202001,12,100,1.0,1.964796824,107.48,40,150,2,3,1
70002,202001,12,100,1.326666382,0.599374673,92.06463773,49,250,2,3,1
70002,202002,12,100,0.5089584770000001,0.832579579,54.70285714,40,650,1,0,2
70002,202003,12,100,0.529851921,1.09453303,106.79005520000001,40,800,0,2,3
70003,202001,12,101,1.0,1.964796824,107.48,40,30,5,2,2
70003,202002,12,101,0.5089584770000001,0.832579579,54.70285714,40,50,2,5,3
70003,202003,12,101,1.201086389,0.705052735,98.26519337,49,170,3,3,4
70004,202001,12,101,1.0,1.964796824,107.48,40,,5,2,2
70004,202002,12,101,0.5089584770000001,0.832579579,54.70285714,40,310,2,5,3
70004,202003,12,101,0.529851921,1.09453303,106.79005520000001,40,350,3,3,4
70004,202003,12,101,1.201086389,0.705052735,98.26519337,49,170,3,3,4
70005,202001,12,102,1.326666382,0.599374673,92.06463773,49,750,6,4,3
70005,202002,12,102,0.5016253708279402,0.812637487,36.63728374,49,940,1,4,4
70005,202003,12,102,1.201086389,0.705052735,98.26519337,49,520,0,1,5
70006,202001,13,103,0.5089584770000001,0.832579579,54.70285714,40,350,7,8,4
70006,202002,13,103,1.201086389,0.705052735,98.26519337,49,160,0,2,5
70006,202003,13,103,1.418333623,0.642249527,93.18857143,49,380,2,1,6
70007,202001,13,103,0.5089584770000001,0.832579579,54.70285714,40,290,7,8,4
70007,202001,13,103,1.635246638,1.725354675,143.7564782,49,,7,8,4
70007,202002,13,103,0.529851921,1.09453303,106.79005520000001,40,590,0,2,5
70007,202002,13,103,1.201086389,0.705052735,98.26519337,49,710,0,2,5
70007,202003,13,103,0.913631634,1.0,70.24,40,280,2,1,6
70008,202001,13,104,0.5089584770000001,0.832579579,54.70285714,40,,3,5,5
70008,202002,13,104,1.201086389,0.705052735,98.26519337,49,660,3,6,6
70008,202003,13,104,1.418333623,0.642249527,93.18857143,49,220,2,3,7
70009,202001,13,104,1.635246638,1.725354675,143.7564782,49,880,3,5,5
70009,202002,13,104,1.201086389,0.705052735,98.26519337,49,610,3,6,6
70009,202003,13,104,1.418333623,0.642249527,93.18857143,49,90,2,3,7
70010,202001,13,105,0.5089584770000001,0.832579579,54.70285714,40,80,3,7,6
49 changes: 49 additions & 0 deletions tests/data/pivot_imputation_value_output.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
date,sic,cell,question,imputation_link,link_type,count,imputed_value
202001,12,100,40,1.0,F,2,650
202001,12,100,40,1.964796824,B,3,650
202001,12,100,40,107.48,C,1,650
202001,12,100,49,1.326666382,F,2,250
202001,12,100,49,0.599374673,B,3,250
202001,12,100,49,92.06463773,C,1,250
202001,12,101,40,1.0,F,5,30
202001,12,101,40,1.964796824,B,2,30
202001,12,101,40,107.48,C,2,30
202001,12,102,49,1.326666382,F,6,750
202001,12,102,49,0.599374673,B,4,750
202001,12,102,49,92.06463773,C,3,750
202001,13,103,40,0.5089584770000001,F,7,640
202001,13,103,40,0.832579579,B,8,640
202001,13,103,40,54.70285714,C,4,640
202001,13,103,49,1.635246638,F,7,0.0
202001,13,103,49,1.725354675,B,8,0.0
202001,13,103,49,143.7564782,C,4,0.0
202001,13,104,40,0.5089584770000001,F,3,0.0
202001,13,104,40,0.832579579,B,5,0.0
202001,13,104,40,54.70285714,C,5,0.0
202001,13,104,49,1.635246638,F,3,880
202001,13,104,49,1.725354675,B,5,880
202001,13,104,49,143.7564782,C,5,880
202001,13,105,40,0.5089584770000001,F,3,80
202001,13,105,40,0.832579579,B,7,80
202001,13,105,40,54.70285714,C,6,80
202002,12,100,40,0.5089584770000001,F,1,650
202002,12,100,40,0.832579579,B,0,650
202002,12,100,40,54.70285714,C,2,650
202002,12,100,49,0.5016253708279402,F,1,100
202002,12,100,49,0.812637487,B,0,100
202002,12,100,49,36.63728374,C,2,100
202002,12,101,40,0.5089584770000001,F,2,360
202002,12,101,40,0.832579579,B,5,360
202002,12,101,40,54.70285714,C,3,360
202002,12,102,49,0.5016253708279402,F,1,940
202002,12,102,49,0.812637487,B,4,940
202002,12,102,49,36.63728374,C,4,940
202002,13,103,40,0.529851921,F,0,590
202002,13,103,40,1.09453303,B,2,590
202002,13,103,40,106.79005520000001,C,5,590
202002,13,103,49,1.201086389,F,0,870
202002,13,103,49,0.705052735,B,2,870
202002,13,103,49,98.26519337,C,5,870
202002,13,104,49,1.201086389,F,3,1270
202002,13,104,49,0.705052735,B,6,1270
202002,13,104,49,98.26519337,C,6,1270
84 changes: 84 additions & 0 deletions tests/test_pivot_imputation_value.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
from pathlib import Path

import pandas as pd
import pytest
from pandas.testing import assert_frame_equal

from mbs_results.pivot_imputation_value import merge_counts, pivot_imputation_value


@pytest.fixture(scope="class")
def filepath():
    """Return the directory containing this test module (the tests folder)."""
    # Anchor to this file rather than the working directory so the data files
    # are found no matter where pytest is invoked from.
    return Path(__file__).parent


@pytest.fixture(scope="class")
def count_data_input(filepath):
    """Load the count table (group, period, f_count, b_count, c_count)."""
    csv_path = filepath / "data" / "count_data_input.csv"
    return pd.read_csv(csv_path, index_col=False)


@pytest.fixture(scope="class")
def merge_counts_output(filepath):
    """Load the reference data with the count columns already merged on."""
    csv_path = filepath / "data" / "merge_counts_output.csv"
    return pd.read_csv(csv_path, index_col=False)


@pytest.fixture(scope="class")
def pivot_imputation_value_output(filepath):
    """Load the expected pivoted output for periods 202001 and 202002."""
    csv_path = filepath / "data" / "pivot_imputation_value_output.csv"
    return pd.read_csv(csv_path, index_col=False)


class TestMergeCounts:
    """Tests for merge_counts."""

    def test_merge_counts(self, count_data_input, merge_counts_output):
        """Merging the counts back onto the count-less data restores the fixture."""
        count_columns = ["f_count", "b_count", "c_count"]
        without_counts = merge_counts_output.drop(columns=count_columns)

        result = merge_counts(
            input_df=without_counts,
            count_df=count_data_input,
            input_cell="cell",
            count_cell="group",
            input_date="date",
            count_date="period",
            identifier="identifier",
        )

        assert_frame_equal(result, merge_counts_output)


class TestPivotImputationValue:
    """Tests for pivot_imputation_value, with and without a period filter."""

    def test_pivot_imputation_value_filter(
        self, pivot_imputation_value_output, merge_counts_output
    ):
        """Only the selected period (202001) is present in the pivoted output."""
        without_identifier = merge_counts_output.drop(columns=["identifier"])
        expected = pivot_imputation_value_output.query("date == 202001")

        result = pivot_imputation_value(
            df=without_identifier,
            identifier="identifier",
            groups=["date", "sic", "cell", "question"],
            link_columns=["forward", "backward", "construction"],
            count_columns=["f_count", "b_count", "c_count"],
            imputed_value="imputed_value",
            selected_periods=[202001],
        )

        assert_frame_equal(result, expected)

    def test_pivot_imputation_value_no_filter(
        self, pivot_imputation_value_output, merge_counts_output
    ):
        """Without selected_periods every period in the input is pivoted."""
        without_identifier = merge_counts_output.drop(columns=["identifier"])
        # The expected fixture only covers these two periods.
        two_periods = without_identifier.query("date in [202001, 202002]")

        result = pivot_imputation_value(
            df=two_periods,
            identifier="identifier",
            groups=["date", "sic", "cell", "question"],
            link_columns=["forward", "backward", "construction"],
            count_columns=["f_count", "b_count", "c_count"],
            imputed_value="imputed_value",
        )

        assert_frame_equal(result, pivot_imputation_value_output)

0 comments on commit 60efe2a

Please sign in to comment.