From 41b4ca280e1b1e0d6deeb380125179d7fa747804 Mon Sep 17 00:00:00 2001 From: rowanhemsi Date: Thu, 4 Jul 2024 13:30:51 +0100 Subject: [PATCH 1/3] Add missing value validation --- mbs_results/validate_imputation.py | 39 +++++++++++++++++++ .../target_missing_values.csv | 4 ++ tests/helper_functions.py | 8 ++++ tests/test_validate_imputation.py | 36 +++++++++++++++++ 4 files changed, 87 insertions(+) create mode 100644 mbs_results/validate_imputation.py create mode 100644 tests/data/validate_imputation/target_missing_values.csv create mode 100644 tests/test_validate_imputation.py diff --git a/mbs_results/validate_imputation.py b/mbs_results/validate_imputation.py new file mode 100644 index 00000000..1a95eb5e --- /dev/null +++ b/mbs_results/validate_imputation.py @@ -0,0 +1,39 @@ +import pandas as pd + + +def validate_imputation(df: pd.DataFrame, target: str) -> None: + """_summary_ + + Parameters + ---------- + df : pd.DataFrame + data with imputed values + target : str + name of column containing target variable + + Raises + ------ + """ + if column_missing_values(df[target]): + raise ValueError( + f""" + Target column should have no missing values following imputation: + missing values found in column {target} + """ + ) + + +def column_missing_values(target_column: pd.Series) -> bool: + """_summary_ + + Parameters + ---------- + target_column : pd.Series + dataframe column to search for missing values + + Returns + ------- + bool + True if missing values found, otherwise False + """ + return target_column.isna().any() diff --git a/tests/data/validate_imputation/target_missing_values.csv b/tests/data/validate_imputation/target_missing_values.csv new file mode 100644 index 00000000..da53453b --- /dev/null +++ b/tests/data/validate_imputation/target_missing_values.csv @@ -0,0 +1,4 @@ +no_missing,one_missing,all_missing +11,14,, +12,15,, +13,, \ No newline at end of file diff --git a/tests/helper_functions.py b/tests/helper_functions.py index cba874f3..aa7a41f7 100644 --- a/tests/helper_functions.py +++ b/tests/helper_functions.py @@ -1,3 +1,4 @@ +from contextlib import contextmanager from pathlib import Path import pandas as pd @@ -22,3 +23,10 @@ def load_filter(filter_path): df["date"] = pd.to_datetime(df["date"], format="%Y%m") return df + + +# when updating to python>=3.7 this can be replaced by importing +# contextlib.nullcontext as does_not_raise +@contextmanager +def does_not_raise(): + yield diff --git a/tests/test_validate_imputation.py b/tests/test_validate_imputation.py new file mode 100644 index 00000000..f8c2ded1 --- /dev/null +++ b/tests/test_validate_imputation.py @@ -0,0 +1,36 @@ +from pathlib import Path + +import pandas as pd +import pytest +from helper_functions import does_not_raise + +from mbs_results.validate_imputation import validate_imputation + + +@pytest.fixture(scope="class") +def filepath(): + return Path("tests/data/validate_imputation") + + +@pytest.fixture(scope="class") +def missing_target_values_data(filepath): + return pd.read_csv(filepath / "target_missing_values.csv", index_col=False) + + +class TestValidateImputation: + @pytest.mark.parametrize( + "target_column_name,expectation", + [ + ("no_missing", does_not_raise()), + ("one_missing", pytest.raises(ValueError)), + ("all_missing", pytest.raises(ValueError)), + ], + ) + def test_target_missing_values_validation( + self, + missing_target_values_data, + target_column_name, + expectation, + ): + with expectation: + validate_imputation(missing_target_values_data, target_column_name) From 057b9283818171eb8f6defe5f0a64b03949192ad Mon Sep 17 00:00:00 2001 From: hemsir Date: Thu, 11 Jul 2024 10:29:55 +0100 Subject: [PATCH 2/3] Add newline at end of file --- tests/data/validate_imputation/target_missing_values.csv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/data/validate_imputation/target_missing_values.csv b/tests/data/validate_imputation/target_missing_values.csv index da53453b..34e7f214 100644 --- a/tests/data/validate_imputation/target_missing_values.csv +++ b/tests/data/validate_imputation/target_missing_values.csv @@ -1,4 +1,4 @@ no_missing,one_missing,all_missing 11,14,, 12,15,, -13,, \ No newline at end of file +13,, From 4f4cb3f1d1cc6970b97852340f4c0d7756224760 Mon Sep 17 00:00:00 2001 From: hemsir Date: Thu, 11 Jul 2024 15:52:35 +0100 Subject: [PATCH 3/3] Simplify code and add documentation summary --- mbs_results/validate_imputation.py | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/mbs_results/validate_imputation.py b/mbs_results/validate_imputation.py index 1a95eb5e..1de65f25 100644 --- a/mbs_results/validate_imputation.py +++ b/mbs_results/validate_imputation.py @@ -2,7 +2,9 @@ def validate_imputation(df: pd.DataFrame, target: str) -> None: - """_summary_ + """ + Validation for the imputation, including: + - no missing values in target column Parameters ---------- @@ -14,26 +16,10 @@ def validate_imputation(df: pd.DataFrame, target: str) -> None: Raises ------ """ - if column_missing_values(df[target]): + if df[target].isna().any(): raise ValueError( f""" Target column should have no missing values following imputation: missing values found in column {target} """ ) - - -def column_missing_values(target_column: pd.Series) -> bool: - """_summary_ - - Parameters - ---------- - target_column : pd.Series - dataframe column to search for missing values - - Returns - ------- - bool - True if missing values found, otherwise False - """ - return target_column.isna().any()