Skip to content

Commit

Permalink
Add cleaning fun for imputation prep
Browse files Browse the repository at this point in the history
* Convert cell_no for imputation class, plus test and test data
* Run frozen or live, plus test and test data
* Convert annual to monthly basic transformation
  • Loading branch information
AntonZogk committed Jul 18, 2024
1 parent e84ad97 commit d73fbe0
Show file tree
Hide file tree
Showing 4 changed files with 155 additions and 1 deletion.
96 changes: 96 additions & 0 deletions mbs_results/data_cleaning.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
from typing import List

import numpy as np
import pandas as pd

from mbs_results.utils import convert_column_to_datetime
Expand Down Expand Up @@ -177,3 +180,96 @@ def load_manual_constructions(
return df.merge(
manual_constructions, on=[reference, period], how="outer", suffixes=("", "_man")
)


def run_live_or_frozen(
    df: pd.DataFrame,
    target: "str | List[str]",
    error_marker: str,
    state: str = "live",
    error_values: "List[str] | None" = None,
) -> pd.DataFrame:
    """
    Treat responses in error as non-response when running in frozen state.

    When ``state`` is "frozen", the target value(s) of every row whose
    ``error_marker`` value is listed in ``error_values`` are set to null.
    When ``state`` is "live" the dataframe is returned untouched.

    Parameters
    ----------
    df : pd.DataFrame
        Original dataframe. It is modified in place when state is "frozen".
    target : str or list[str]
        Column(s) to treat as non-response.
    error_marker : str
        Column name with error values.
    state : str, optional
        Function config parameter, either "live" or "frozen".
        The default is "live".
    error_values : list[str], optional
        Values of ``error_marker`` that flag a response as being in error.
        The default is ['E', 'W'].

    Returns
    -------
    pd.DataFrame
        The same dataframe object that was passed in.

    Raises
    ------
    ValueError
        If ``state`` is neither "live" nor "frozen".
    """
    valid_states = ("live", "frozen")
    if state not in valid_states:
        raise ValueError(
            f"state must be one of {valid_states}, got {state!r} instead"
        )

    # Build the default per call so no list object is shared between calls.
    if error_values is None:
        error_values = ["E", "W"]

    if state == "frozen":
        in_error = df[error_marker].isin(error_values)
        columns = list(target) if pd.api.types.is_list_like(target) else [target]

        # Series.mask upcasts the column when needed (e.g. int -> float),
        # whereas assigning NaN through .loc into an integer column is an
        # incompatible-dtype setitem that recent pandas versions deprecate.
        for column in columns:
            df[column] = df[column].mask(in_error)

    return df


def convert_annual_thousands(df: pd.DataFrame, col: str) -> pd.DataFrame:
    """Rescale a column from annual £000s to monthly £.

    The column is multiplied by 1000 and then divided by 12, and the result
    is written back to the same column of the dataframe that was passed in.

    Parameters
    ----------
    df : pd.DataFrame
        Original dataframe.
    col : str
        Name of the column in df to convert.

    Returns
    -------
    df : pd.DataFrame
        Original dataframe with the converted column.
    """
    annual_thousands = df[col]
    # Thousands -> units first, then annual -> monthly.
    df[col] = annual_thousands.mul(1000).div(12)
    return df


def create_imputation_class(
    df: pd.DataFrame, cell_no_col: str, new_col: str
) -> pd.DataFrame:
    """
    Derive an imputation class from a cell number column.

    Each value is read as text, its last character is dropped and, when its
    first character is '7', that character is replaced with '5'. The result
    is converted to integer and stored in new_col.

    Parameters
    ----------
    df : pd.DataFrame
        Original dataframe.
    cell_no_col : str
        Column name of df holding the cell numbers.
    new_col : str
        Column name to save the results.

    Returns
    -------
    df : pd.DataFrame
        Original dataframe with new_col.
    """

    def _to_imputation_class(cell_no: str) -> str:
        # Drop the trailing character, then swap a leading '7' for '5'.
        without_last = cell_no[:-1]
        if cell_no[0] == "7":
            return "5" + without_last[1:]
        return without_last

    cell_no_as_text = df[cell_no_col].astype(str)
    df[new_col] = cell_no_as_text.map(_to_imputation_class).astype(int)

    return df
18 changes: 18 additions & 0 deletions tests/test_create_imputation_class.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
cell_no,expected
1111,111
2222,222
3333,333
4444,444
5555,555
6666,666
7777,577
8888,888
9999,999
1234,123
4321,432
6789,678
9876,987
7895,589
8975,897
8957,895
7000,500
34 changes: 33 additions & 1 deletion tests/test_data_cleaning.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,12 @@
import pandas as pd
from pandas.testing import assert_frame_equal

from mbs_results.data_cleaning import clean_and_merge, enforce_datatypes
from mbs_results.data_cleaning import (
clean_and_merge,
create_imputation_class,
enforce_datatypes,
run_live_or_frozen,
)


def correct_types(df):
Expand Down Expand Up @@ -75,3 +80,30 @@ def test_clean_and_merge():
["reference", "period"]
)
assert_frame_equal(actual_output, expected_output)


def test_create_imputation_class():
    """Check create_imputation_class against the expected values in the CSV."""
    fixture_path = Path("tests") / "test_create_imputation_class.csv"
    expected = pd.read_csv(fixture_path)

    # The input is the fixture without the column the function has to create.
    cell_numbers = expected.drop("expected", axis=1)

    result = create_imputation_class(cell_numbers, "cell_no", "expected")

    assert_frame_equal(result, expected)


def test_run_live_or_frozen():
    """Check run_live_or_frozen in both the live and the frozen state."""
    df = pd.read_csv(Path("tests") / "test_run_live_or_frozen.csv")

    df_in = df.drop(columns=["frozen"])

    # run_live_or_frozen returns the frame it was given, so each call gets its
    # own copy; otherwise the live result, the frozen result and df_in would
    # all be the same object and the live assertion could never fail.
    live_output = run_live_or_frozen(df_in.copy(), "target", "error", "live")
    frozen_output = run_live_or_frozen(df_in.copy(), "target", "error", "frozen")

    expected_output_frozen = df_in.copy()
    expected_output_frozen["target"] = df["frozen"]

    assert_frame_equal(frozen_output, expected_output_frozen)
    assert_frame_equal(live_output, df_in)
8 changes: 8 additions & 0 deletions tests/test_run_live_or_frozen.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
target,error,live,frozen
1,C,1,1
2,E,2,
3,O,3,3
4,W,4,
5,C,5,5
6,E,6,
7,W,7,

0 comments on commit d73fbe0

Please sign in to comment.