From d73fbe00002ff12e1cac7d51548475a83eafa1e0 Mon Sep 17 00:00:00 2001
From: zogkoa <Anton.Zogkolli@ons.gov.uk>
Date: Thu, 18 Jul 2024 16:57:06 +0100
Subject: [PATCH] Add cleaning functions for imputation prep

* Convert cell_no for imputation class, plus test and test data
* Run frozen or live, plus test and test data
* Convert annual to monthly basic transformation
---
 mbs_results/data_cleaning.py           | 96 ++++++++++++++++++++++++++
 tests/test_create_imputation_class.csv | 18 +++++
 tests/test_data_cleaning.py            | 34 ++++++++-
 tests/test_run_live_or_frozen.csv      |  8 +++
 4 files changed, 155 insertions(+), 1 deletion(-)
 create mode 100644 tests/test_create_imputation_class.csv
 create mode 100644 tests/test_run_live_or_frozen.csv

diff --git a/mbs_results/data_cleaning.py b/mbs_results/data_cleaning.py
index 92fceca5..5997e095 100755
--- a/mbs_results/data_cleaning.py
+++ b/mbs_results/data_cleaning.py
@@ -1,3 +1,6 @@
+from typing import List, Union
+
+import numpy as np
 import pandas as pd
 
 from mbs_results.utils import convert_column_to_datetime
@@ -177,3 +180,96 @@ def load_manual_constructions(
     return df.merge(
         manual_constructions, on=[reference, period], how="outer", suffixes=("", "_man")
     )
+
+
+def run_live_or_frozen(
+    df: pd.DataFrame,
+    target: Union[str, List[str]],
+    error_marker: str,
+    state: str = "live",
+    error_values: List[str] = ["E", "W"],
+) -> pd.DataFrame:
+    """
+    Null `target` in place where `error_marker` is in `error_values` when
+    state is "frozen" (errors become non-response); "live" changes nothing.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        Original dataframe.
+    target : str or list[str]
+        Column(s) to treat as non-response.
+    error_marker : str
+        Column name with error values.
+    state : str, optional
+        Either "live" or "frozen". The default is "live".
+    error_values : list[str], optional
+        Error markers to null out. The default is ['E', 'W'].
+
+    Returns
+    -------
+    Original dataframe (the same object, modified in place when frozen).
+
+    Raises
+    ------
+    ValueError
+        If state is neither "live" nor "frozen".
+    """
+    if state not in ("live", "frozen"):
+        raise ValueError(f'state must be "live" or "frozen", got {state!r}')
+    if state == "frozen":
+        df.loc[df[error_marker].isin(error_values), target] = np.nan
+    return df
+
+
+def convert_annual_thousands(df: pd.DataFrame, col: str) -> pd.DataFrame:
+    """Rescale a column from annual £000s to monthly £, in place.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        Dataframe holding the column; it is modified and returned.
+    col : str
+        Name of the column to rescale.
+
+    Returns
+    -------
+    df : pd.DataFrame
+        The dataframe passed in, with `col` overwritten.
+
+    """
+    # Thousands -> units (x 1000), then a year -> one month (/ 12).
+    monthly_pounds = df[col] * 1000 / 12
+    df[col] = monthly_pounds
+    return df
+
+
+def create_imputation_class(
+    df: pd.DataFrame, cell_no_col: str, new_col: str
+) -> pd.DataFrame:
+    """
+    Derive an imputation class from a cell number: drop the last character
+    and, when the first character is '7', replace it with '5'.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        Original dataframe.
+    cell_no_col : str
+        Column name of df holding the cell numbers.
+    new_col : str
+        Column name to save the results.
+
+    Returns
+    -------
+    df : pd.DataFrame
+        Original dataframe with new_col added as integers.
+    """
+    def _to_class(cell_no: str) -> str:
+        if cell_no[0] == "7":
+            return "5" + cell_no[1:-1]
+        return cell_no[:-1]
+
+    cell_no_text = df[cell_no_col].astype(str)
+    df[new_col] = cell_no_text.map(_to_class).astype(int)
+    return df
diff --git a/tests/test_create_imputation_class.csv b/tests/test_create_imputation_class.csv
new file mode 100644
index 00000000..e80afb28
--- /dev/null
+++ b/tests/test_create_imputation_class.csv
@@ -0,0 +1,18 @@
+cell_no,expected
+1111,111
+2222,222
+3333,333
+4444,444
+5555,555
+6666,666
+7777,577
+8888,888
+9999,999
+1234,123
+4321,432
+6789,678
+9876,987
+7895,589
+8975,897
+8957,895
+7000,500
diff --git a/tests/test_data_cleaning.py b/tests/test_data_cleaning.py
index e4a5277d..cf1f9b65 100644
--- a/tests/test_data_cleaning.py
+++ b/tests/test_data_cleaning.py
@@ -3,7 +3,12 @@
 import pandas as pd
 from pandas.testing import assert_frame_equal
 
-from mbs_results.data_cleaning import clean_and_merge, enforce_datatypes
+from mbs_results.data_cleaning import (
+    clean_and_merge,
+    create_imputation_class,
+    enforce_datatypes,
+    run_live_or_frozen,
+)
 
 
 def correct_types(df):
@@ -75,3 +80,30 @@ def test_clean_and_merge():
         ["reference", "period"]
     )
     assert_frame_equal(actual_output, expected_output)
+
+
+def test_create_imputation_class():
+    """Check create_imputation_class against the expected column in the CSV."""
+    fixture = pd.read_csv(Path("tests") / "test_create_imputation_class.csv")
+
+    # Only the inputs are passed in; the function must rebuild "expected".
+    cell_numbers = fixture.drop(columns=["expected"])
+    result = create_imputation_class(cell_numbers, "cell_no", "expected")
+
+    assert_frame_equal(result, fixture)
+
+
+def test_run_live_or_frozen():
+    df = pd.read_csv(Path("tests") / "test_run_live_or_frozen.csv")
+    df_in = df.drop(columns=["frozen"])
+
+    # run_live_or_frozen edits its input in place and returns it, so each
+    # call gets its own copy; otherwise both results would alias df_in.
+    live_output = run_live_or_frozen(df_in.copy(), "target", "error", "live")
+    frozen_output = run_live_or_frozen(df_in.copy(), "target", "error", "frozen")
+
+    expected_output_frozen = df_in.copy()
+    expected_output_frozen["target"] = df["frozen"]
+
+    assert_frame_equal(frozen_output, expected_output_frozen)
+    assert_frame_equal(live_output, df_in)
diff --git a/tests/test_run_live_or_frozen.csv b/tests/test_run_live_or_frozen.csv
new file mode 100644
index 00000000..c68ed0e5
--- /dev/null
+++ b/tests/test_run_live_or_frozen.csv
@@ -0,0 +1,8 @@
+target,error,live,frozen
+1,C,1,1
+2,E,2,
+3,O,3,3
+4,W,4,
+5,C,5,5
+6,E,6,
+7,W,7,