From 5c8d942e22e6a02de13386b66be26390556a337a Mon Sep 17 00:00:00 2001 From: mboudet Date: Tue, 18 Oct 2022 09:43:24 +0200 Subject: [PATCH] Added tests, before/after for time and date, and better regex for GPS (#15) --- README.md | 16 +++--- checkcel/validators.py | 113 ++++++++++++++++++++++++++++++++++++++--- tests/test_validate.py | 36 +++++++++++++ 3 files changed, 151 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 062a0d8..a3446a0 100644 --- a/README.md +++ b/README.md @@ -108,7 +108,7 @@ In all cases, you will need to at least include a list of validators and associa * *ignore_space* (Default False): whether to trim the values for spaces before checking validity * *ignore_case* (Default False): whether to ignore the case -The last 3 parameters will affect all the validators (when relevant), but can be overriden at the validator level (eg, you can set 'empty_ok' to True for all, but set it to False for a specific validator). +The last 3 parameters will affect all the validators (when relevant), but can be overriden at the validator level (eg, you can set 'empty_ok' to True for all, but set it to False for a specific validator). ## Python format @@ -165,13 +165,17 @@ All validators (except NoValidator) have the 'empty_ok' option, which will consi * *valid_values*: Dict with the *linked_column* values as keys, and list of valid values as values * Ex: {"Test": ['1', '2'], "Test2": ['3', '4']} * EmailValidator(empty_ok=False) -* DateValidator(day_first=True, empty_ok=False) - * Validate that a value is a date. +* DateValidator(day_first=True, empty_ok=False, before=None, after=None) + * Validate that a value is a date. * *day_first* (Default True): Whether to consider the day as the first part of the date for ambiguous values. -* TimeValidator(empty_ok=False) + * *before* Latest date allowed + * *after*: Earliest date allowed +* TimeValidator(empty_ok=False, before=None, after=None) * Validate that a value is a time of the day + * *before* Latest value allowed + * *after*: Earliest value allowed * UniqueValidator(unique_with=[], empty_ok=False) - * Validate that a column has only unique values. + * Validate that a column has only unique values. * *unique_with*: List of column names if you need a tuple of column values to be unique. * Ex: *I want the tuple (value of column A, value of column B) to be unique* * OntologyValidator(ontology, root_term="", empty_ok=False) @@ -195,5 +199,5 @@ All validators (except NoValidator) have the 'empty_ok' option, which will consi * RegexValidator(regex, excel_formulat="", empty_ok=False) * Validate that a term match a specific regex * **No in-file validation generated** *unless using excel_formula* - * *excel_formula*: Custom rules for in-file validation. [Examples here](http://www.contextures.com/xlDataVal07.html). + * *excel_formula*: Custom rules for in-file validation. [Examples here](http://www.contextures.com/xlDataVal07.html). * "{CNAME}" will be replaced by the appropriate column name diff --git a/checkcel/validators.py b/checkcel/validators.py index 9930e26..535617c 100644 --- a/checkcel/validators.py +++ b/checkcel/validators.py @@ -314,10 +314,25 @@ def describe(self, column_name): class DateValidator(Validator): """ Validates that a field is a Date """ - def __init__(self, day_first=True, **kwargs): + def __init__(self, day_first=True, before=None, after=None, **kwargs): super(DateValidator, self).__init__(**kwargs) self.day_first = day_first + if before: + try: + parser.parse(before).date() + except parser.ParserError as e: + raise BadValidatorException(e) + + if after: + try: + parser.parse(after).date() + except parser.ParserError as e: + raise BadValidatorException(e) + + self.before = before + self.after = after + def validate(self, field, row_number, row={}): if self.ignore_space: field = field.strip() @@ -326,7 +341,17 @@ def validate(self, field, row_number, row={}): if field or not self.empty_ok: # Pandas auto convert fields into dates (ignoring the parse_dates=False) field = str(field) - parser.parse(field, dayfirst=self.day_first).date() + date = parser.parse(field, dayfirst=self.day_first).date() + + if self.before and not date < parser.parse(self.before, dayfirst=self.day_first).date(): + self.invalid_dict["invalid_set"].add(field) + self.invalid_dict["invalid_rows"].add(row_number) + raise ValidationException("Value {} is not before {}".format(field, self.before)) + + if self.after and not date > parser.parse(self.after, dayfirst=self.day_first).date(): + self.invalid_dict["invalid_set"].add(field) + self.invalid_dict["invalid_rows"].add(row_number) + raise ValidationException("Value {} is not after {}".format(field, self.after)) except parser.ParserError as e: self.invalid_dict["invalid_set"].add(field) @@ -339,20 +364,58 @@ def bad(self): def generate(self, column, additional_column=None, additional_worksheet=None): # GreaterThanOrEqual for validity with ODS. - dv = DataValidation(type="date", formula1='01/01/1900', operator='greaterThanOrEqual') + params = {"type": "date"} + if (self.before is not None and self.after is not None): + params["formula1"] = parser.parse(self.after).strftime("%Y/%m/%d") + params["formula2"] = parser.parse(self.before).strftime("%Y/%m/%d") + params["operator"] = "between" + elif self.before is not None: + params["formula1"] = parser.parse(self.before).strftime("%Y/%m/%d") + params["operator"] = "lessThanOrEqual" + elif self.after is not None: + params["formula1"] = parser.parse(self.after).strftime("%Y/%m/%d") + params["operator"] = "greaterThanOrEqual" + + dv = DataValidation(**params) dv.add("{}2:{}1048576".format(column, column)) return dv def describe(self, column_name): - return "{} : Date {}".format(column_name, "(required)" if not self.empty_ok else "") + text = "{} : Date".format(column_name) + if (self.after is not None and self.before is not None): + text += " ({} - {})".format(self.after, self.before) + elif self.after is not None: + text += " >= {}".format(self.after) + elif self.before is not None: + text += " <= {}".format(self.before) + + if not self.empty_ok: + text += " (required)" + + return text class TimeValidator(Validator): """ Validates that a field is a Time """ - def __init__(self, **kwargs): + def __init__(self, before=None, after=None, **kwargs): super(TimeValidator, self).__init__(**kwargs) + if before: + try: + parser.parse(before).time() + except parser.ParserError as e: + raise BadValidatorException(e) + + if after: + try: + parser.parse(after).time() + except parser.ParserError as e: + raise BadValidatorException(e) + + self.before = before + self.after = after + def validate(self, field, row_number, row={}): if self.ignore_space: field = field.strip() @@ -360,7 +423,17 @@ def validate(self, field, row_number, row={}): if field or not self.empty_ok: # Pandas auto convert fields into dates (ignoring the parse_dates=False) field = str(field) - parser.parse(field).time() + time = parser.parse(field).time() + + if self.before and not time < parser.parse(self.before).time(): + self.invalid_dict["invalid_set"].add(field) + self.invalid_dict["invalid_rows"].add(row_number) + raise ValidationException("Value {} is not before {}".format(field, self.before)) + + if self.after and not time > parser.parse(self.after).time(): + self.invalid_dict["invalid_set"].add(field) + self.invalid_dict["invalid_rows"].add(row_number) + raise ValidationException("Value {} is not after {}".format(field, self.after)) except parser.ParserError as e: self.invalid_dict["invalid_set"].add(field) @@ -373,12 +446,36 @@ def bad(self): def generate(self, column, additional_column=None, additional_worksheet=None): # GreaterThanOrEqual for validity with ODS. - dv = DataValidation(type="time") + + params = {"type": "time"} + if (self.before is not None and self.after is not None): + params["formula1"] = parser.parse(self.after).strftime("%H:%M:%S") + params["formula2"] = parser.parse(self.before).strftime("%H:%M:%S") + params["operator"] = "between" + elif self.before is not None: + params["formula1"] = parser.parse(self.before).strftime("%H:%M:%S") + params["operator"] = "lessThanOrEqual" + elif self.after is not None: + params["formula1"] = parser.parse(self.after).strftime("%H:%M:%S") + params["operator"] = "greaterThanOrEqual" + + dv = DataValidation(**params) dv.add("{}2:{}1048576".format(column, column)) return dv def describe(self, column_name): - return "{} : Time {}".format(column_name, "(required)" if not self.empty_ok else "") + text = "{} : Time".format(column_name) + if (self.after is not None and self.before is not None): + text += " ({} - {})".format(self.after, self.before) + elif self.after is not None: + text += " >= {}".format(self.after) + elif self.before is not None: + text += " <= {}".format(self.before) + + if not self.empty_ok: + text += " (required)" + + return text class EmailValidator(Validator): diff --git a/tests/test_validate.py b/tests/test_validate.py index 36c94f1..d007808 100644 --- a/tests/test_validate.py +++ b/tests/test_validate.py @@ -209,6 +209,24 @@ def test_invalid(self): assert val is False assert len(validation.failures['my_column']) == 2 + def test_invalid_before(self): + data = {'my_column': ['01/01/2000', '10/10/2010']} + validators = {'my_column': DateValidator(before="05/05/2005")} + df = pd.DataFrame.from_dict(data) + validation = Checkcel(data=df, empty_ok=False, validators=validators) + val = validation.validate() + assert val is False + assert len(validation.failures['my_column']) == 1 + + def test_invalid_after(self): + data = {'my_column': ['01/01/2000', '10/10/2010']} + validators = {'my_column': DateValidator(after="05/05/2005")} + df = pd.DataFrame.from_dict(data) + validation = Checkcel(data=df, empty_ok=False, validators=validators) + val = validation.validate() + assert val is False + assert len(validation.failures['my_column']) == 1 + def test_invalid_empty(self): data = {'my_column': ['01/01/1970', '']} validators = {'my_column': DateValidator()} @@ -244,6 +262,24 @@ def test_invalid(self): assert val is False assert len(validation.failures['my_column']) == 2 + def test_invalid_before(self): + data = {'my_column': ['14h23', '16h30']} + validators = {'my_column': TimeValidator(before="15h00")} + df = pd.DataFrame.from_dict(data) + validation = Checkcel(data=df, empty_ok=False, validators=validators) + val = validation.validate() + assert val is False + assert len(validation.failures['my_column']) == 1 + + def test_invalid_after(self): + data = {'my_column': ['14h23', '16h30']} + validators = {'my_column': TimeValidator(after="15h00")} + df = pd.DataFrame.from_dict(data) + validation = Checkcel(data=df, empty_ok=False, validators=validators) + val = validation.validate() + assert val is False + assert len(validation.failures['my_column']) == 1 + def test_invalid_empty(self): data = {'my_column': ['13h10', '']} validators = {'my_column': TimeValidator()}