from typing import List, Optional, Union

import numpy as np
import pandas as pd

# States accepted by run_live_or_frozen.
_VALID_STATES = ("live", "frozen")

# Error-marker values treated as "in error" when the caller supplies none.
_DEFAULT_ERROR_VALUES = ("E", "W")


def run_live_or_frozen(
    df: pd.DataFrame,
    target: Union[str, List[str]],
    error_marker: str,
    state: str = "live",
    error_values: Optional[List[str]] = None,
) -> pd.DataFrame:
    """
    Treat responses in error as non-response when running in frozen state.

    For a live run the dataframe is returned untouched. For a frozen run,
    the target value(s) of every row whose error marker is one of
    `error_values` are set to null.

    Parameters
    ----------
    df : pd.DataFrame
        Original dataframe. It is modified in place for a frozen run.
    target : str or list[str]
        Column(s) to treat as non-response.
    error_marker : str
        Column name with error values.
    state : str, optional
        Function config parameter, either "live" or "frozen".
        The default is "live".
    error_values : list[str], optional
        Values of `error_marker` which flag a response as being in error.
        The default is ['E', 'W'].

    Returns
    -------
    df : pd.DataFrame
        The dataframe passed in (same object), with targets nulled where
        applicable.

    Raises
    ------
    ValueError
        If `state` is neither "live" nor "frozen".
    """
    if state not in _VALID_STATES:
        raise ValueError(
            f"state must be one of {_VALID_STATES}, got {state!r} instead"
        )

    if state == "live":
        return df

    flagged_values = list(
        _DEFAULT_ERROR_VALUES if error_values is None else error_values
    )
    in_error = df[error_marker].isin(flagged_values)

    target_columns = [target] if isinstance(target, str) else list(target)
    for column in target_columns:
        # Build the nulled column with mask() and assign it back as a whole,
        # so the column dtype is free to change (e.g. int -> float) instead
        # of writing NaN into individual cells of an integer column.
        df[column] = df[column].mask(in_error, np.nan)

    return df


def convert_annual_thousands(df: pd.DataFrame, col: str) -> pd.DataFrame:
    """Convert values from annual £000s to monthly £.

    Parameters
    ----------
    df : pd.DataFrame
        Original dataframe. It is modified in place.
    col : str
        Col name of df holding the values to convert.

    Returns
    -------
    df : pd.DataFrame
        The dataframe passed in, with `col` multiplied by 1000 and divided
        by 12.

    """
    df[col] = df[col] * 1000 / 12

    return df


def _cell_no_to_imputation_class(cell_no: str) -> str:
    """Drop the last character and swap a leading '7' for '5'."""
    trimmed = cell_no[:-1]
    if cell_no.startswith("7"):
        return "5" + trimmed[1:]
    return trimmed


def create_imputation_class(
    df: pd.DataFrame, cell_no_col: str, new_col: str
) -> pd.DataFrame:
    """
    Derive the imputation class from the cell number.

    Each value is read as a string, a first character '7' is replaced
    with '5', the last character is removed and the result is cast to int,
    e.g. 7777 -> 577, 1234 -> 123, 7000 -> 500.

    Parameters
    ----------
    df : pd.DataFrame
        Original dataframe. It is modified in place.
    cell_no_col : str
        Column name of df holding the cell numbers.
    new_col : str
        Column name to save the results.

    Returns
    -------
    df : pd.DataFrame
        Original dataframe with new_col.
    """
    df[new_col] = (
        df[cell_no_col].astype(str).map(_cell_no_to_imputation_class).astype(int)
    )

    return df
def test_create_imputation_class():
    """Check create_imputation_class against the CSV fixture.

    The fixture has a `cell_no` column and an `expected` column; the
    function is asked to rebuild `expected` from `cell_no`.
    """
    expected_output = pd.read_csv(Path("tests") / "test_create_imputation_class.csv")

    df_in = expected_output.drop(columns=["expected"])

    actual_output = create_imputation_class(df_in, "cell_no", "expected")

    assert_frame_equal(actual_output, expected_output)


def test_run_live_or_frozen():
    """Check both states of run_live_or_frozen against the CSV fixture.

    The fixture has columns `target`, `error`, `live` and `frozen`; the
    `frozen` column holds the target values expected after a frozen run.
    """
    df = pd.read_csv(Path("tests") / "test_run_live_or_frozen.csv")

    df_in = df.drop(columns=["frozen"])

    # run_live_or_frozen writes into the dataframe it is given and returns
    # that same object, so each call gets its own copy. Without the copies
    # both outputs and df_in would be one object and the live assertion
    # below would compare a dataframe with itself.
    live_output = run_live_or_frozen(df_in.copy(), "target", "error", "live")
    frozen_output = run_live_or_frozen(df_in.copy(), "target", "error", "frozen")

    expected_output_frozen = df_in.copy()
    expected_output_frozen["target"] = df["frozen"]

    assert_frame_equal(frozen_output, expected_output_frozen)
    assert_frame_equal(live_output, df_in)