From 89a51dfaaab7289bf5d56febba518d8efb140023 Mon Sep 17 00:00:00 2001 From: mboudet Date: Thu, 3 Nov 2022 12:06:06 +0100 Subject: [PATCH] Add empty_ok_if and empty_ok_unless (#17) --- CHANGELOG.md | 16 ++++ README.md | 7 ++ checkcel/checkcel.py | 2 +- checkcel/validators.py | 172 +++++++++++++++++++++++++++++++++++------ tests/test_validate.py | 80 +++++++++++++++++++ 5 files changed, 252 insertions(+), 25 deletions(-) create mode 100644 CHANGELOG.md diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..3dae381 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,16 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +This changelog was started for release 0.0.3. + +## [0.0.3] - Unreleased + +### Added + +- empty_ok_if key for validator +- empty_ok_unless key for validator +- readme key for validator diff --git a/README.md b/README.md index a3446a0..a741f94 100644 --- a/README.md +++ b/README.md @@ -105,8 +105,15 @@ In all cases, you will need to at least include a list of validators and associa * *metadata*: A list of column names. This will create a metadata sheet with these columns, without validation on them * *empty_ok* (Default False): Whether to accept empty values as valid +* *empty_ok_if* (Default None): Accept empty value as valid if **another column** value is set + * Accept either a string (column name), a list (list of column names), or a dict + * The dict keys must be column names, and the values lists of 'accepted values'. 
The current column will accept empty values if the related column's value is in the list of accepted values +* *empty_ok_unless* (Default None): Accept empty value as valid *unless* **another column** value is set + * Accept either a string (column name), a list (list of column names), or a dict + * The dict keys must be column names, and the values lists of 'rejected values'. The current column will accept empty values if the related column's value is **not** in the list of rejected values * *ignore_space* (Default False): whether to trim the values for spaces before checking validity * *ignore_case* (Default False): whether to ignore the case +* *readme* (Default None): Additional information to include on the readme page The last 3 parameters will affect all the validators (when relevant), but can be overriden at the validator level (eg, you can set 'empty_ok' to True for all, but set it to False for a specific validator). diff --git a/checkcel/checkcel.py b/checkcel/checkcel.py index 35957d3..4520d11 100644 --- a/checkcel/checkcel.py +++ b/checkcel/checkcel.py @@ -144,7 +144,7 @@ def _validate(self, row): if column in self.validators: validator = self.validators[column] try: - validator.validate(row[column], self.line_count, row=row) + validator.validate(row[column], self.line_count, row) except ValidationException as e: self.failures[column][self.line_count].append(e) validator.fail_count += 1 diff --git a/checkcel/validators.py b/checkcel/validators.py index 535617c..586a923 100644 --- a/checkcel/validators.py +++ b/checkcel/validators.py @@ -18,13 +18,68 @@ class Validator(object): """ Generic Validator class """ - def __init__(self, empty_ok=None, ignore_case=None, ignore_space=None): + def __init__(self, empty_ok=None, ignore_case=None, ignore_space=None, empty_ok_if=None, empty_ok_unless=None, readme=None): self.logger = logs.logger self.invalid_dict = defaultdict(set) self.fail_count = 0 self.empty_ok = empty_ok self.ignore_case = ignore_case 
self.ignore_space = ignore_space + self.empty_ok_if = empty_ok_if + self.empty_ok_unless = empty_ok_unless + self.empty_check = True if not (empty_ok_if or empty_ok_unless) else False + self.readme = readme + + if empty_ok_if: + if not (isinstance(empty_ok_if, dict) or isinstance(empty_ok_if, list) or isinstance(empty_ok_if, str)): + raise BadValidatorException("empty_ok_if must be a dict, a list, or a string") + + if empty_ok_unless: + if not (isinstance(empty_ok_unless, dict) or isinstance(empty_ok_unless, list) or isinstance(empty_ok_unless, str)): + raise BadValidatorException("empty_ok_unless must be a dict, a list, or a string") + + if empty_ok_if and empty_ok_unless: + raise BadValidatorException("Cannot use both empty_ok_if and empty_ok_unless") + + def _precheck_empty_ok_if(self, row): + if self.empty_ok_if: + if isinstance(self.empty_ok_if, dict): + linked_columns = self.empty_ok_if.keys() + elif isinstance(self.empty_ok_if, str): + linked_columns = [self.empty_ok_if] + elif isinstance(self.empty_ok_if, list): + linked_columns = self.empty_ok_if + if not all([col in row for col in linked_columns]): + raise BadValidatorException("One of more linked column for empty_ok_if {} is not in file columns".format(linked_columns)) + if self.empty_ok_unless: + if isinstance(self.empty_ok_unless, dict): + linked_columns = self.empty_ok_unless.keys() + elif isinstance(self.empty_ok_unless, str): + linked_columns = [self.empty_ok_unless] + elif isinstance(self.empty_ok_unless, list): + linked_columns = self.empty_ok_unless + if not all([col in row for col in linked_columns]): + raise BadValidatorException("One of more linked column for empty_ok_unless {} is not in file columns".format(linked_columns)) + self.empty_check = True + + def _can_be_empty(self, row): + if self.empty_ok_if: + if isinstance(self.empty_ok_if, dict): + return all([row[key] in values for key, values in self.empty_ok_if.items()]) + elif isinstance(self.empty_ok_if, str): + return 
row[self.empty_ok_if] != "" + elif isinstance(self.empty_ok_if, list): + return all([row[col] != "" for col in self.empty_ok_if]) + + if self.empty_ok_unless: + if isinstance(self.empty_ok_unless, dict): + return all([row[key] not in values for key, values in self.empty_ok_unless.items()]) + elif isinstance(self.empty_ok_unless, str): + return row[self.empty_ok_unless] == "" + elif isinstance(self.empty_ok_unless, list): + return all([row[col] == "" for col in self.empty_ok_unless]) + + return self.empty_ok @property def bad(self): @@ -65,7 +120,9 @@ def generate(self, column): return None def describe(self, column_name): - return "{} : Free value".format(column_name) + if self.readme: + column_name += " ({})".format(self.readme) + return "{}: Free value".format(column_name) @property def bad(self): @@ -78,8 +135,11 @@ class TextValidator(Validator): def __init__(self, **kwargs): super(TextValidator, self).__init__(**kwargs) - def validate(self, field, row_number, row={}): - if not field and not self.empty_ok: + def validate(self, field, row_number, row): + if not self.empty_check: + self._precheck_empty_ok_if(row) + + if not field and not self._can_be_empty(row): raise ValidationException( "Field cannot be empty" ) @@ -92,6 +152,8 @@ def generate(self, column): return None def describe(self, column_name): + if self.readme: + column_name += " ({})".format(self.readme) return "{} : Free text {}".format(column_name, "(required)" if not self.empty_ok else "") @@ -103,12 +165,15 @@ def __init__(self, min=None, max=None, **kwargs): self.min = min self.max = max - def validate(self, field, row_number, row={}): + def validate(self, field, row_number, row): + if not self.empty_check: + self._precheck_empty_ok_if(row) + if self.ignore_space: field = field.strip() try: - if field or not self.empty_ok: + if field or not self._can_be_empty(row): field = self.cast(field) if self.min is not None and field < self.min: self.invalid_dict["invalid_set"].add(field) @@ -145,6 +210,9 
@@ def generate(self, column): return dv def describe(self, column_name): + if self.readme: + column_name += " ({})".format(self.readme) + text = "{} : {} number".format(column_name, self.type.capitalize()) if (self.min is not None and self.max is not None): text += " ({} - {})".format(self.min, self.max) @@ -186,12 +254,22 @@ def __init__(self, valid_values=set(), **kwargs): if self.empty_ok: self.valid_values.add("") - def validate(self, field, row_number, row={}): + def validate(self, field, row_number, row): + if not self.empty_check: + self._precheck_empty_ok_if(row) + if self.ignore_case: field = field.lower() if self.ignore_space: field = field.strip() + if not (field or self._can_be_empty(row)): + self.invalid_dict["invalid_set"].add(field) + self.invalid_dict["invalid_rows"].add(row_number) + raise ValidationException( + "'{}' is invalid".format(field) + ) + if field not in self.valid_values: self.invalid_dict["invalid_set"].add(field) self.invalid_dict["invalid_rows"].add(row_number) @@ -238,6 +316,8 @@ def generate(self, column, column_name="", additional_column=None, additional_wo return dv def describe(self, column_name): + if self.readme: + column_name += " ({})".format(self.readme) return "{} : (Allowed values : {}) {}".format(column_name, ", ".join(self.ordered_values), "(required)" if not self.empty_ok else "") @@ -263,8 +343,10 @@ def validate(self, field, row_number, row): if not self.column_check: self._precheck_unique_with(row) - if field == "" and self.empty_ok: + + if not field and self.empty_ok: return + related_column_value = row[self.linked_column] if not related_column_value: self.invalid_dict["invalid_rows"].add(row_number) @@ -308,6 +390,8 @@ def generate(self, column, set_columns, column_name, additional_column, addition return dv def describe(self, column_name): + if self.readme: + column_name += " ({})".format(self.readme) return "{} : Linked values to column {} {}".format(column_name, self.linked_column, "(required)" if not 
self.empty_ok else "") @@ -333,12 +417,15 @@ def __init__(self, day_first=True, before=None, after=None, **kwargs): self.before = before self.after = after - def validate(self, field, row_number, row={}): + def validate(self, field, row_number, row): + if not self.empty_check: + self._precheck_empty_ok_if(row) + if self.ignore_space: field = field.strip() try: - if field or not self.empty_ok: + if field or not self._can_be_empty(row): # Pandas auto convert fields into dates (ignoring the parse_dates=False) field = str(field) date = parser.parse(field, dayfirst=self.day_first).date() @@ -381,6 +468,8 @@ def generate(self, column, additional_column=None, additional_worksheet=None): return dv def describe(self, column_name): + if self.readme: + column_name += " ({})".format(self.readme) text = "{} : Date".format(column_name) if (self.after is not None and self.before is not None): text += " ({} - {})".format(self.after, self.before) @@ -416,11 +505,14 @@ def __init__(self, before=None, after=None, **kwargs): self.before = before self.after = after - def validate(self, field, row_number, row={}): + def validate(self, field, row_number, row): + if not self.empty_check: + self._precheck_empty_ok_if(row) + if self.ignore_space: field = field.strip() try: - if field or not self.empty_ok: + if field or not self._can_be_empty(row): # Pandas auto convert fields into dates (ignoring the parse_dates=False) field = str(field) time = parser.parse(field).time() @@ -464,6 +556,8 @@ def generate(self, column, additional_column=None, additional_worksheet=None): return dv def describe(self, column_name): + if self.readme: + column_name += " ({})".format(self.readme) text = "{} : Time".format(column_name) if (self.after is not None and self.before is not None): text += " ({} - {})".format(self.after, self.before) @@ -484,10 +578,13 @@ class EmailValidator(Validator): def __init__(self, **kwargs): super(EmailValidator, self).__init__(**kwargs) - def validate(self, field, row_number, 
row={}): + def validate(self, field, row_number, row): + if not self.empty_check: + self._precheck_empty_ok_if(row) + if self.ignore_space: field = field.strip() - if field or not self.empty_ok: + if field or not self._can_be_empty(row): try: validate_email(field) except EmailNotValidError as e: @@ -508,6 +605,8 @@ def generate(self, column, ontology_column=None): return dv def describe(self, column_name): + if self.readme: + column_name += " ({})".format(self.readme) return "{} : Email {}".format(column_name, "(required)" if not self.empty_ok else "") @@ -527,11 +626,14 @@ def __init__(self, ontology, root_term="", **kwargs): if self.root_term and not self.root_term_iri: raise BadValidatorException("'{}' is not a valid root term for ontology {}".format(self.root_term, self.ontology)) - def validate(self, field, row_number, row={}): + def validate(self, field, row_number, row): + if not self.empty_check: + self._precheck_empty_ok_if(row) + if self.ignore_space: field = field.strip() - if field == "" and self.empty_ok: + if field == "" and self._can_be_empty(row): return if field in self.invalid_dict["invalid_set"]: @@ -567,6 +669,8 @@ def generate(self, column, additional_column, additional_worksheet): return dv def describe(self, column_name): + if self.readme: + column_name += " ({})".format(self.readme) text = "{} : Ontological term from {} ontology.".format(column_name, self.ontology) if self.root_term: text += " Root term is : {}".format(self.root_term) @@ -643,12 +747,15 @@ def _precheck_unique_with(self, row): raise BadValidatorException(extra) self.unique_check = True - def validate(self, field, row_number, row={}): + def validate(self, field, row_number, row): + if not self.empty_check: + self._precheck_empty_ok_if(row) + if self.ignore_space: field = field.strip() if not field: - if self.empty_ok: + if self._can_be_empty(row): return else: raise ValidationException( @@ -693,6 +800,8 @@ def generate(self, column, column_dict): return dv def describe(self, 
column_name): + if self.readme: + column_name += " ({})".format(self.readme) text = "{} : Unique value".format(column_name) if self.unique_with: text += " Must be unique with column(s) {}".format(", ".join(self.unique_with)) @@ -723,11 +832,14 @@ def __init__(self, root_term="", lang="en", labellang="en", vocab="thesaurus-inr if not exists: raise BadValidatorException("'{}' is not a valid root term. Make sure it is a concept, and not a microthesaurus or group".format(self.root_term)) - def validate(self, field, row_number, row={}): + def validate(self, field, row_number, row): + if not self.empty_check: + self._precheck_empty_ok_if(row) + if self.ignore_space: field = field.strip() - if field == "" and self.empty_ok: + if field == "" and self._can_be_empty(row): return if field in self.invalid_dict["invalid_set"]: @@ -778,6 +890,8 @@ def generate(self, column, additional_column, additional_worksheet): return dv def describe(self, column_name): + if self.readme: + column_name += " ({})".format(self.readme) text = "{} : Ontological term from Vocabulaires ouverts.".format(column_name) if self.root_term: text += " Root term is : {}".format(self.root_term) @@ -846,11 +960,14 @@ def __init__(self, regex, excel_formula="", **kwargs): except re.error: raise BadValidatorException("'{}' is not a valid regular expression".format(self.regex)) - def validate(self, field, row_number, row={}): + def validate(self, field, row_number, row): + if not self.empty_check: + self._precheck_empty_ok_if(row) + if self.ignore_space: field = field.strip() - if field == "" and self.empty_ok: + if field == "" and self._can_be_empty(row): return matches = re.findall(self.regex, field) @@ -879,6 +996,8 @@ def generate(self, column): return dv def describe(self, column_name): + if self.readme: + column_name += " ({})".format(self.readme) text = "{} : Term matching the regex {}.".format(column_name, self.regex) if not self.empty_ok: text += " (required)" @@ -901,11 +1020,14 @@ def __init__(self, 
format="DD", only_long=False, only_lat=False, **kwargs): self.only_long = only_long self.only_lat = only_lat - def validate(self, field, row_number, row={}): + def validate(self, field, row_number, row): + if not self.empty_check: + self._precheck_empty_ok_if(row) + if self.ignore_space: field = field.strip() - if field == "" and self.empty_ok: + if field == "" and self._can_be_empty(row): return if self.format == "DD": @@ -941,6 +1063,8 @@ def generate(self, column): return None def describe(self, column_name): + if self.readme: + column_name += " ({})".format(self.readme) text = "{} : GPS coordinate".format(column_name) if not self.empty_ok: text += " (required)" diff --git a/tests/test_validate.py b/tests/test_validate.py index d007808..89a0b7a 100644 --- a/tests/test_validate.py +++ b/tests/test_validate.py @@ -499,3 +499,83 @@ def test_valid(self): df = pd.DataFrame.from_dict(data) val = Checkcel(data=df, validators=validators) assert val.validate() + + +class TestCheckcelValidateEmpty_if(): + + def test_invalid_string(self): + data = {'my_column': ["", "not_empty"], "another_column": ["", ""]} + validators = { + 'my_column': TextValidator(empty_ok=True), + 'another_column': TextValidator(empty_ok_if="my_column") + } + df = pd.DataFrame.from_dict(data) + validation = Checkcel(data=df, empty_ok=False, validators=validators) + val = validation.validate() + assert val is False + assert len(validation.failures['another_column']) == 1 + + def test_invalid_list(self): + data = {'my_column': ["", "", "not_empty", "not_empty"], 'my_column2': ["", "not_empty", "", "not_empty"], "another_column": ["", "", "", ""]} + validators = { + 'my_column': TextValidator(empty_ok=True), + 'my_column2': TextValidator(empty_ok=True), + 'another_column': TextValidator(empty_ok_if=["my_column", "my_column2"]) + } + df = pd.DataFrame.from_dict(data) + validation = Checkcel(data=df, empty_ok=False, validators=validators) + val = validation.validate() + assert val is False + assert 
len(validation.failures['another_column']) == 3 + + def test_invalid_dict(self): + data = data = {'my_column': ["", "invalid_value", "valid_value"], "another_column": ["", "", ""]} + validators = { + 'my_column': TextValidator(empty_ok=True), + 'another_column': TextValidator(empty_ok_if={"my_column": ["valid_value"]}) + } + df = pd.DataFrame.from_dict(data) + validation = Checkcel(data=df, empty_ok=False, validators=validators) + val = validation.validate() + assert val is False + assert len(validation.failures['another_column']) == 2 + + +class TestCheckcelValidateEmpty_unless(): + + def test_invalid_string(self): + data = {'my_column': ["", "not_empty"], "another_column": ["", ""]} + validators = { + 'my_column': TextValidator(empty_ok=True), + 'another_column': TextValidator(empty_ok_unless="my_column") + } + df = pd.DataFrame.from_dict(data) + validation = Checkcel(data=df, empty_ok=False, validators=validators) + val = validation.validate() + assert val is False + assert len(validation.failures['another_column']) == 1 + + def test_invalid_list(self): + data = {'my_column': ["", "", "not_empty", "not_empty"], 'my_column2': ["", "not_empty", "", "not_empty"], "another_column": ["", "", "", ""]} + validators = { + 'my_column': TextValidator(empty_ok=True), + 'my_column2': TextValidator(empty_ok=True), + 'another_column': TextValidator(empty_ok_unless=["my_column", "my_column2"]) + } + df = pd.DataFrame.from_dict(data) + validation = Checkcel(data=df, empty_ok=False, validators=validators) + val = validation.validate() + assert val is False + assert len(validation.failures['another_column']) == 3 + + def test_invalid_dict(self): + data = data = {'my_column': ["", "invalid_value", "valid_value"], "another_column": ["", "", ""]} + validators = { + 'my_column': TextValidator(empty_ok=True), + 'another_column': TextValidator(empty_ok_unless={"my_column": ["invalid_value"]}) + } + df = pd.DataFrame.from_dict(data) + validation = Checkcel(data=df, empty_ok=False, 
validators=validators) + val = validation.validate() + assert val is False + assert len(validation.failures['another_column']) == 1