Skip to content

Commit

Permalink
432 further config validation (#40)
Browse files Browse the repository at this point in the history
* Added validation function to check period and response given

* Added tests for clean_and_merge function

* Remove code which is not needed

* Created unit tests for validation cases

* Added summary and updated names for loop elements

* Pre-commit hook changes
  • Loading branch information
Jday7879 authored and lhubbardONS committed Jul 4, 2024
1 parent be222fd commit 42dd43f
Show file tree
Hide file tree
Showing 3 changed files with 198 additions and 15 deletions.
68 changes: 56 additions & 12 deletions mbs_results/validation_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,15 @@ def validate_config(config):

if colnames_clash(**config):
raise ValueError(
"""Overlapping column names in responses_keep_cols and and
"""Overlapping column names in responses_keep_cols and
contributors_keep_cols (main config)."""
)
if period_and_reference_not_given(**config):
raise ValueError(
"""Period and/or Reference is not given in responses_keep_cols
and/or contributors_keep_cols (main config). """
)

validate_config_datatype_input(**config)
validate_config_repeated_datatypes(**config)

Expand All @@ -39,16 +45,54 @@ def colnames_clash(
main pipeline configuration. Can be used to input the entire config dictionary
Returns
-------
list
list containing column names which are in both contributors and responses,
bool
Returns true if any column names are in both contributors and responses,
excluding period and reference.
False otherwise
"""

return any(
[
column in contributors_keep_cols and column not in [reference, period]
for column in responses_keep_cols
]
)


def period_and_reference_not_given(
    reference, period, responses_keep_cols, contributors_keep_cols, **config
):
    """
    Check whether the reference or period column is missing from the kept columns.

    Both the reference and the period column names must be present in
    responses_keep_cols and in contributors_keep_cols.

    Parameters
    ----------
    reference : str
        the name of the reference column
    period : str
        the name of the period column
    responses_keep_cols : dict
        the columns to keep from the responses data (column name -> datatype)
    contributors_keep_cols : dict
        the columns to keep from the contributors data (column name -> datatype)
    **config : dict
        main pipeline configuration. Can be used to input the entire config
        dictionary

    Returns
    -------
    bool
        True if reference or period is missing from either responses_keep_cols
        or contributors_keep_cols, False otherwise
    """

    # Evaluate the membership tests themselves rather than collecting the
    # column names: a missing name that happens to be falsy (None, "", 0)
    # must still be reported as missing.
    return any(
        (column not in responses_keep_cols)
        or (column not in contributors_keep_cols)
        for column in (reference, period)
    )

Expand Down Expand Up @@ -101,14 +145,14 @@ def validate_config_datatype_input(
joint_dict = {**responses_keep_cols, **contributors_keep_cols}
accepted_types = ["str", "float", "int", "date", "bool", "category"]
incorrect_datatype = [
x for x in list(joint_dict) if joint_dict.get(x) not in accepted_types
item for item in list(joint_dict) if joint_dict.get(item) not in accepted_types
]

if incorrect_datatype:
given_types = [joint_dict.get(key) for key in incorrect_datatype]
raise ValueError(
"Check the inputted datatype(s) for column(s) {}:{},\
only the following datatypes are accepted: {}".format(
"""Check the inputted datatype(s) for column(s) {}:{},
only the following datatypes are accepted: {}""".format(
incorrect_datatype, given_types, accepted_types
)
)
Expand All @@ -135,10 +179,10 @@ def validate_config_repeated_datatypes(
"""

mismatched_types = [
x
for x in responses_keep_cols
if (x in contributors_keep_cols)
and (responses_keep_cols[x] != contributors_keep_cols[x])
key
for key in responses_keep_cols
if (key in contributors_keep_cols)
and (responses_keep_cols[key] != contributors_keep_cols[key])
]
if mismatched_types:
# Warning to catch if the same column name has different types
Expand Down
46 changes: 43 additions & 3 deletions tests/test_data_cleaning.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,7 @@
import pandas as pd
from pandas.testing import assert_frame_equal

from mbs_results.data_cleaning import enforce_datatypes
from mbs_results.validation_checks import validate_config_datatype_input
from mbs_results.data_cleaning import clean_and_merge, enforce_datatypes


def correct_types(df):
Expand All @@ -31,7 +30,48 @@ def test_enforce_datatypes():
"contributors_keep_cols": {"reference": "int", "target_variable": "float"},
"temporarily_remove_cols": [],
}
validate_config_datatype_input(**test_setup_config)
actual_output = enforce_datatypes(df_subset, **test_setup_config)

assert_frame_equal(actual_output, expected_output)


def test_clean_and_merge():
    """Compare clean_and_merge output for a one-row snapshot with a fixed frame."""
    responses_cols = {
        "period": "date",
        "reference": "int",
        "target_variable": "float",
    }
    contributors_cols = {
        "reference": "int",
        "period": "date",
        "strata": "str",
    }
    config = dict(
        period="period",
        reference="reference",
        responses_keep_cols=responses_cols,
        contributors_keep_cols=contributors_cols,
        temporarily_remove_cols=[],
    )

    # The snapshot carries columns ("lastupdateddate", "survey") that are not
    # listed in either *_keep_cols entry and do not appear in the expected frame.
    response_row = {
        "reference": "1",
        "period": "202212",
        "target_variable": "20",
        "lastupdateddate": "2024-06-28 00:00:01.999999+00",
    }
    contributor_row = {
        "reference": "1",
        "period": "202212",
        "survey": "202",
        "strata": "101",
    }
    snapshot = {"responses": [response_row], "contributors": [contributor_row]}

    merged = clean_and_merge(snapshot, **config)

    expected = pd.DataFrame(
        data={
            "reference": ["1"],
            "period": ["202212"],
            "target_variable": ["20"],
            "strata": ["101"],
        }
    ).set_index(["reference", "period"])
    assert_frame_equal(merged, expected)
99 changes: 99 additions & 0 deletions tests/test_validation_checks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
import pandas as pd
import pytest

from mbs_results.validation_checks import (
colnames_clash,
period_and_reference_not_given,
validate_config_datatype_input,
validate_config_repeated_datatypes,
validate_indices,
)


def test_colnames_clash():
    """colnames_clash is True when 'strata' is kept from both datasets."""
    responses_cols = {
        "period": "date",
        "reference": "int",
        "strata": "str",
    }
    contributors_cols = {
        "reference": "int",
        "period": "date",
        "strata": "str",
    }
    config = dict(
        period="period",
        reference="reference",
        responses_keep_cols=responses_cols,
        contributors_keep_cols=contributors_cols,
        temporarily_remove_cols=[],
    )

    clash_found = colnames_clash(**config)

    assert clash_found is True


def test_period_and_reference_not_given():
    """The check is True when 'period' is absent from responses_keep_cols."""
    # Responses deliberately omit the period column.
    responses_cols = {
        "reference": "int",
        "strata": "str",
    }
    contributors_cols = {
        "reference": "int",
        "period": "date",
        "strata": "str",
    }
    config = dict(
        period="period",
        reference="reference",
        responses_keep_cols=responses_cols,
        contributors_keep_cols=contributors_cols,
        temporarily_remove_cols=[],
    )

    missing = period_and_reference_not_given(**config)

    assert missing is True


def test_validate_indices():
    """validate_indices raises ValueError for frames whose reference values differ."""
    index_cols = ["reference", "period"]
    response_data = {
        "reference": ["1"],
        "period": ["202212"],
        "target_variable": ["20"],
        "strata": ["101"],
    }
    # Same record, but with a different reference value.
    contributor_data = {**response_data, "reference": ["2"]}

    responses = pd.DataFrame(data=response_data).set_index(index_cols)
    contributors = pd.DataFrame(data=contributor_data).set_index(index_cols)

    with pytest.raises(ValueError):
        validate_indices(responses, contributors)


def test_validate_config_datatype_input():
    """A ValueError is expected when a column is given the datatype 'string'."""
    responses_cols = {
        "period": "date",
        "reference": "int",
    }
    contributors_cols = {
        "reference": "int",
        "period": "date",
        "strata": "string",
    }
    config = dict(
        period="period",
        reference="reference",
        responses_keep_cols=responses_cols,
        contributors_keep_cols=contributors_cols,
        temporarily_remove_cols=[],
    )

    with pytest.raises(ValueError):
        validate_config_datatype_input(**config)


def test_validate_config_repeated_datatypes():
    """A ValueError is expected when 'period' has different datatypes per dataset."""
    responses_cols = {
        "period": "date",
        "reference": "int",
    }
    # "period" is typed "str" here but "date" in the responses columns.
    contributors_cols = {
        "reference": "int",
        "period": "str",
        "strata": "str",
    }
    config = dict(
        period="period",
        reference="reference",
        responses_keep_cols=responses_cols,
        contributors_keep_cols=contributors_cols,
        temporarily_remove_cols=[],
    )

    with pytest.raises(ValueError):
        validate_config_repeated_datatypes(**config)

0 comments on commit 42dd43f

Please sign in to comment.