Skip to content

Commit

Permalink
Tidying up and adding period arguments to config
Browse files Browse the repository at this point in the history
  • Loading branch information
lhubbardONS committed Jul 19, 2024
1 parent b43b97b commit 2a08df4
Show file tree
Hide file tree
Showing 10 changed files with 186 additions and 22 deletions.
4 changes: 3 additions & 1 deletion config.json
Original file line number Diff line number Diff line change
Expand Up @@ -17,5 +17,7 @@
"frozenemployees" : "int",
"frozenturnover" : "float",
"cellnumber" : "int"},
"temporarily_remove_cols": []
"temporarily_remove_cols": [],
"current_period" : 202401,
"previous_period" : 202312
}
5 changes: 1 addition & 4 deletions mbs_results/aggregate_aweights_by_class.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@

def aggregate_aweights_by_class(
dataframe: pd.DataFrame,
reference: str,
imp_class: str,
period: str,
a_weight: str,
Expand All @@ -17,8 +16,6 @@ def aggregate_aweights_by_class(
----------
dataframe : pd.DataFrame
Reference dataframe with imp_class and a_weights
reference : str
name of column in dataframe containing reference variable
imp_class : str
name of column in dataframe containing imp_class variable
period : str
Expand All @@ -36,6 +33,6 @@ def aggregate_aweights_by_class(
"""
current_df = dataframe[dataframe[period] == current_period]

aggregate_df = current_df[[imp_class, a_weight]].drop_duplicates()
aggregate_df = current_df[[period, imp_class, a_weight]].drop_duplicates()

return aggregate_df.reset_index(drop=True)
88 changes: 88 additions & 0 deletions mbs_results/selective_editing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
import pandas as pd


def create_standardising_factor(
    dataframe: pd.DataFrame,
    reference: str,
    period: str,
    domain: str,
    question_code: str,
    predicted_value: str,
    imputation_marker: str,
    a_weight: str,
    o_weight: str,
    g_weight: str,
    auxiliary_value: str,
    previous_period: int,
) -> pd.DataFrame:
    """
    Return the standardising factor for questions 40 and 49.

    For every row of ``previous_period`` whose question code is 40 or 49 the
    product ``predicted_value * a_weight * o_weight * g_weight`` is computed.
    That product is then summed within each (domain, question code) group and
    the group total is attached to every row of the group.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Reference dataframe with domain, a_weights, o_weights, and g_weights
    reference : str
        name of column in dataframe containing reference variable
    period : str
        name of column in dataframe containing period variable
    domain : str
        name of column in dataframe containing domain variable
    question_code : str
        name of column in dataframe containing question code variable
    predicted_value : str
        name of column in dataframe containing predicted value variable
    imputation_marker : str
        name of column in dataframe containing imputation marker variable
    a_weight : str
        name of column in dataframe containing a_weight variable
    o_weight : str
        name of column in dataframe containing o_weight variable
    g_weight : str
        name of column in dataframe containing g_weight variable
    auxiliary_value : str
        name of column in dataframe containing auxiliary value variable
    previous_period : int
        previous period to take the weights for estimation of standardising
        factor in the format yyyymm

    Returns
    -------
    pd.DataFrame
        One row per selected input row, index reset, with the columns
        period, reference, question code, ``standardising_factor``,
        predicted value, imputation marker and auxiliary value (in that
        order). The input dataframe is not modified.
    """
    in_scope = (dataframe[period] == previous_period) & dataframe[
        question_code
    ].isin([40, 49])

    # Take an explicit copy: assigning a new column onto a filtered selection
    # otherwise triggers pandas' SettingWithCopyWarning.
    previous_df = dataframe[in_scope].copy()

    previous_df["standardising_factor"] = (
        previous_df[predicted_value]
        * previous_df[a_weight]
        * previous_df[o_weight]
        * previous_df[g_weight]
    )

    # Sum only the column we need. Transforming the whole frame would also try
    # to sum unrelated columns (strings, dates, ...), which is wasteful and
    # fails for dtypes that do not support addition.
    previous_df["standardising_factor"] = (
        previous_df.groupby([domain, question_code])["standardising_factor"]
        .transform("sum")
        .astype(float)
    )

    output_df = previous_df[
        [
            period,
            reference,
            question_code,
            "standardising_factor",
            predicted_value,
            imputation_marker,
            auxiliary_value,
        ]
    ]

    return output_df.reset_index(drop=True)
4 changes: 2 additions & 2 deletions mbs_results/selective_editing_question.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import pandas as pd


def create_selective_editing_question(
def create_standardising_factor(
dataframe: pd.DataFrame,
reference: str,
period: str,
Expand Down Expand Up @@ -54,7 +54,7 @@ def create_selective_editing_question(
each reference.
"""
previous_df = dataframe[dataframe[period] == previous_period]
previous_df = dataframe[(dataframe[period] == previous_period)]
previous_df = previous_df[previous_df[question_code].isin([40, 49])]

previous_df["standardising_factor"] = (
Expand Down
6 changes: 3 additions & 3 deletions tests/data/estimation/aggregate_aweights_by_class_output.csv
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
imp_class,a_weight
2,2.0
3,4.0
period,imp_class,a_weight
202402,2,2.0
202402,3,4.0
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ reference,period,domain,question_code,predicted_value,imputation_marker,a_weight
10001,202402,0,40,604,R,5.0,1.0,2.0,39,
10002,202312,6,49,233,FIR,6.0,1.0,1.0,15,
10002,202401,6,49,432,FIR,1.0,3.0,2.0,45,2592.0
10002,202402,9,49,,,2.0,2.0,2.0,73,
10002,202402,6,49,,,2.0,2.0,2.0,73,
10003,202401,9,49,150,C,4.0,5.0,1.0,21,3000.0
10003,202402,9,49,837,R,3.0,7.0,4.0,74,
10004,202401,15,20,111,FIC,1.0,4.0,6.0,11,
Expand All @@ -12,3 +12,4 @@ reference,period,domain,question_code,predicted_value,imputation_marker,a_weight
10006,202401,19,40,336,C,2.0,1.0,4.0,14,2688.0
10006,202401,19,49,461,C,3.0,1.0,4.0,58,9084.0
10007,202401,19,49,222,BIR,4.0,2.0,2.0,67,9084.0
10008,202401,17,40,,,4.0,1.0,5.0,48,12012.0
15 changes: 15 additions & 0 deletions tests/data/estimation/create_standardising_factor_data.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
reference,period,domain,question_code,predicted_value,imputation_marker,a_weight,o_weight,g_weight,auxiliary_value,standardising_factor
10001,202401,0,40,589,R,2.0,4.0,1.0,23,4712.0
10001,202402,0,40,604,R,5.0,1.0,2.0,39,
10002,202312,6,49,233,FIR,6.0,1.0,1.0,15,
10002,202401,6,49,432,FIR,1.0,3.0,2.0,45,2592.0
10002,202402,6,49,,,2.0,2.0,2.0,73,
10003,202401,9,49,150,C,4.0,5.0,1.0,21,3000.0
10003,202402,9,49,837,R,3.0,7.0,4.0,74,
10004,202401,15,20,111,FIC,1.0,4.0,6.0,11,
10005,202401,17,40,1001,FIR,2.0,2.0,3.0,95,12012.0
10005,202402,19,40,532,FIR,7.0,3.0,3.0,29,
10006,202401,19,40,336,C,2.0,1.0,4.0,14,2688.0
10006,202401,19,49,461,C,3.0,1.0,4.0,58,9084.0
10007,202401,19,49,222,BIR,4.0,2.0,2.0,67,9084.0
10008,202401,17,40,,,4.0,1.0,5.0,48,12012.0
1 change: 0 additions & 1 deletion tests/test_aggregate_aweights_by_class.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@ def test_aggregate_aweights_by_class(

actual_output = aggregate_aweights_by_class(
input_data,
"reference",
"imp_class",
"period",
"a_weight",
Expand Down
20 changes: 10 additions & 10 deletions tests/test_create_selective_editing_question.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import pytest
from pandas.testing import assert_frame_equal

from mbs_results.selective_editing_question import create_selective_editing_question
from mbs_results.selective_editing import create_standardising_factor


@pytest.fixture(scope="class")
Expand All @@ -13,19 +13,19 @@ def filepath():


@pytest.fixture(scope="class")
def create_selective_editing_question_input(filepath):
def create_standardising_factor_data(filepath):
return pd.read_csv(
filepath / "create_selective_editing_question_input.csv", index_col=False
filepath / "create_standardising_factor_data.csv", index_col=False
)


class TestCreateSelectiveEditingQuestion:
def test_create_selective_editing_question(
class TestCreateStandardisingFactor:
def test_create_standardising_factor(
self,
create_selective_editing_question_input,
create_standardising_factor_data,
):
expected_output = create_selective_editing_question_input[
create_selective_editing_question_input["standardising_factor"].notna()
expected_output = create_standardising_factor_data[
create_standardising_factor_data["standardising_factor"].notna()
]
expected_output = expected_output[
[
Expand All @@ -40,11 +40,11 @@ def test_create_selective_editing_question(
]
expected_output = expected_output.reset_index(drop=True)

input_data = create_selective_editing_question_input.drop(
input_data = create_standardising_factor_data.drop(
columns="standardising_factor"
)

actual_output = create_selective_editing_question(
actual_output = create_standardising_factor(
input_data,
"reference",
"period",
Expand Down
62 changes: 62 additions & 0 deletions tests/test_create_standardising_factor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
from pathlib import Path

import pandas as pd
import pytest
from pandas.testing import assert_frame_equal

from mbs_results.selective_editing import create_standardising_factor


@pytest.fixture(scope="class")
def filepath():
    """Directory holding the estimation test data, relative to the repo root."""
    data_dir = Path("tests/data/estimation")
    return data_dir


@pytest.fixture(scope="class")
def create_standardising_factor_data(filepath):
    """Load the CSV used as both input and expected output for the test."""
    csv_file = filepath / "create_standardising_factor_data.csv"
    return pd.read_csv(csv_file, index_col=False)


class TestCreateStandardisingFactor:
    """Compare create_standardising_factor with the values stored in the CSV."""

    def test_create_standardising_factor(
        self,
        create_standardising_factor_data,
    ):
        """Rows with a stored standardising factor must be reproduced exactly."""
        output_columns = [
            "period",
            "reference",
            "question_code",
            "standardising_factor",
            "predicted_value",
            "imputation_marker",
            "auxiliary_value",
        ]

        # Expected result: the fixture rows that carry a standardising factor,
        # restricted to the output columns.
        has_factor = create_standardising_factor_data["standardising_factor"].notna()
        expected_output = create_standardising_factor_data.loc[
            has_factor, output_columns
        ].reset_index(drop=True)

        # The function must derive the factor itself, so strip it from the input.
        input_data = create_standardising_factor_data.drop(
            columns=["standardising_factor"]
        )

        actual_output = create_standardising_factor(
            dataframe=input_data,
            reference="reference",
            period="period",
            domain="domain",
            question_code="question_code",
            predicted_value="predicted_value",
            imputation_marker="imputation_marker",
            a_weight="a_weight",
            o_weight="o_weight",
            g_weight="g_weight",
            auxiliary_value="auxiliary_value",
            previous_period=202401,
        )

        assert_frame_equal(actual_output, expected_output)

0 comments on commit 2a08df4

Please sign in to comment.