diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3dae381..dedf557 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -14,3 +14,14 @@ This changelog was started for release 0.0.3.
 - empty_ok_if key for validator
 - empty_ok_unless key for validator
 - readme key for validator
+- unique key for validator
+- expected_rows key for templates
+- logs attribute for templates
+
+### Fixed
+
+- Bug for SetValidator when using number values
+
+### Changed
+
+- Better validation for integers
diff --git a/README.md b/README.md
index a741f94..7305b61 100644
--- a/README.md
+++ b/README.md
@@ -64,6 +64,8 @@ Invalid fields: [''] in rows: [3]
 IntValidator failed 5 time(s) (100.0%) on field: 'Pierraille surface (25)'
 ```
 
+When calling validate() (from Python), you can access a list of logs with the 'logs' attribute of the Checkcel/Checkxtractor/Checkerator class
+
 # Python library
 
 ```python
@@ -104,20 +106,13 @@ Validation templates can use three formats: json/yaml, and python files.
 In all cases, you will need to at least include a list of validators and associated column names.
 Several optional parameters are also available :
 
 * *metadata*: A list of column names. This will create a metadata sheet with these columns, without validation on them
+* *expected_rows* (Default 0): Number of *data* rows expected
 * *empty_ok* (Default False): Whether to accept empty values as valid
-* *empty_ok_if* (Default None): Accept empty value as valid if **another column** value is set
-  * Accept either a string (column name), a list (list of column names), or a dict
-  * The dict keys must be column names, and the values lists of 'accepted values'. The current column will accept empty values if the related column's value is in the list of accepted values
-* *empty_ok_unless* (Default None): Accept empty value as valid *unless* **another column** value is set
-  * Accept either a string (column name), a list (list of column names), or a dict
-  * The dict keys must be column names, and the values lists of 'rejected values'. The current column will accept empty values if the related column's value is **not** in the list of reject values
 * *ignore_space* (Default False): whether to trim the values for spaces before checking validity
 * *ignore_case* (Default False): whether to ignore the case
-* *readme* (Default None): Additional information to include on the readme page
 
 The last 3 parameters will affect all the validators (when relevant), but can be overriden at the validator level (eg, you can set 'empty_ok' to True for all, but set it to False for a specific validator).
-
 ## Python format
 
 A template needs to contain a class inheriting the Checkplate class.
@@ -147,13 +142,29 @@ If needed, these dictionnaries can include an 'options' key, containing a dictio
 
 ## Validators
 
-All validators (except NoValidator) have the 'empty_ok' option, which will consider empty values as valid.
-*As in-file validation for non-empty values is unreliable, the non-emptyness is not checked in-file*
+### Global options
+
+All validators (except NoValidator) have these options available. When relevant, these options override the ones set at the template level
+
+* *empty_ok* (Default False): Whether to accept empty values as valid (Not enforced in excel)
+* *empty_ok_if* (Default None): Accept empty value as valid if **another column** value is set
+  * Accept either a string (column name), a list (list of column names), or a dict (Not enforced in excel)
+  * The dict keys must be column names, and the values lists of 'accepted values'. The current column will accept empty values if the related column's value is in the list of accepted values
+* *empty_ok_unless* (Default None): Accept empty value as valid *unless* **another column** value is set. (Not enforced in excel)
+  * Accept either a string (column name), a list (list of column names), or a dict
+  * The dict keys must be column names, and the values lists of 'rejected values'. The current column will accept empty values if the related column's value is **not** in the list of rejected values
+* *ignore_space* (Default False): whether to trim the values for spaces before checking validity
+* *ignore_case* (Default False): whether to ignore the case
+* *unique* (Default False): whether to enforce uniqueness for this column. (Not enforced in excel yet, except when there is no other validation, i.e. for TextValidator, and for RegexValidator in some cases)
+
+*As excel validation for non-empty values is unreliable, the non-emptiness cannot be properly enforced in excel files*
+
+### Validator-specific options
 
 * NoValidator (always True)
   * **No in-file validation generated**
 * TextValidator(empty_ok=False)
-  * **No in-file validation generated**
+  * **No in-file validation generated** (unless *unique* is set)
 * IntValidator(min="", max="", empty_ok=False)
   * Validate that a value is an integer
   * *min*: Minimal value allowed
diff --git a/checkcel/checkcel.py b/checkcel/checkcel.py
index 4520d11..97e9f1d 100644
--- a/checkcel/checkcel.py
+++ b/checkcel/checkcel.py
@@ -41,16 +41,16 @@ def __init__(
 
     def _log_debug_failures(self):
         for field_name, field_failure in self.failures.items():
-            self.logger.debug('\nFailure on field: "{}":'.format(field_name))
+            self.debug('\nFailure on field: "{}":'.format(field_name))
             for i, (row, errors) in enumerate(field_failure.items()):
-                self.logger.debug(" {}:{}".format(self.source, row))
+                self.debug(" {}:{}".format(self.source, row))
                 for error in errors:
-                    self.logger.debug(" {}".format(error))
+                    self.debug(" {}".format(error))
 
     def _log_validator_failures(self):
         for field_name, validator in self.validators.items():
             if validator.bad:
-                self.logger.error(
+                self.error(
                     " {} failed {} time(s) ({:.1%}) on field: '{}'".format(
                         validator.__class__.__name__,
                         validator.fail_count,
@@ -64,22 +64,22 @@ def _log_validator_failures(self):
                     data = validator.bad
                     wrong_terms = ", ".join(["'{}'".format(val) for val in data["invalid_set"]])
                     wrong_rows = ", ".join([str(val) for val in data["invalid_rows"]])
-                    self.logger.error(
+                    self.error(
                         " Invalid fields: [{}] in rows: [{}]".format(wrong_terms, wrong_rows)
                     )
                 except TypeError as e:
                     raise e
 
     def _log_missing_validators(self):
-        self.logger.error(" Missing validators for:")
+        self.error(" Missing validators for:")
         self._log_missing(self.missing_validators)
 
     def _log_missing_fields(self):
-        self.logger.error(" Missing expected fields:")
+        self.error(" Missing expected fields:")
         self._log_missing(self.missing_fields)
 
     def _log_missing(self, missing_items):
-        self.logger.error(
+        self.error(
             "{}".format(
                 "\n".join(
                     [" '{}': [],".format(field) for field in sorted(missing_items)]
                 )
             )
@@ -88,7 +88,7 @@ def validate(self):
-        self.logger.info(
+        self.info(
            "\nValidating {}{}".format(self.__class__.__name__, "(source={})".format(self.source) if self.source else "")
        )
@@ -101,7 +101,7 @@ def validate(self):
             df = pandas.read_csv(self.source, sep=self.delimiter, skiprows=self.row)
 
         if len(df) == 0:
-            self.logger.info(
+            self.info(
                 "\033[1;33m" + "Source file has no data" + "\033[0m"
             )
             return False
@@
-115,7 +115,7 @@ def validate(self): validator_set = set(self.validators) self.missing_validators = self.column_set - validator_set if self.missing_validators: - self.logger.info("\033[1;33m" + "Missing..." + "\033[0m") + self.info("\033[1;33m" + "Missing..." + "\033[0m") self._log_missing_validators() if not self.ignore_missing_validators: @@ -123,20 +123,25 @@ def validate(self): self.missing_fields = validator_set - self.column_set if self.missing_fields: - self.logger.info("\033[1;33m" + "Missing..." + "\033[0m") + self.info("\033[1;33m" + "Missing..." + "\033[0m") self._log_missing_fields() return False + if self.expected_rows: + if not self.expected_rows == len(df.index): + self.error("Length issue: Expecting {} row(s), found {}".format(self.expected_rows, len(df.index))) + return False + # Might be a way to do it more efficiently.. df.apply(lambda row: self._validate(row), axis=1) if self.failures: - self.logger.info("\033[0;31m" + "Failed" + "\033[0m") + self.info("\033[0;31m" + "Failed" + "\033[0m") self._log_debug_failures() self._log_validator_failures() return False else: - self.logger.info("\033[0;32m" + "Passed" + "\033[0m") + self.info("\033[0;32m" + "Passed" + "\033[0m") return True def _validate(self, row): diff --git a/checkcel/checkplate.py b/checkcel/checkplate.py index 40a3396..6985166 100644 --- a/checkcel/checkplate.py +++ b/checkcel/checkplate.py @@ -15,18 +15,36 @@ class Checkplate(object): """ Base class for templates """ - def __init__(self, validators={}, empty_ok=False, ignore_case=False, ignore_space=False, metadata=[]): + def __init__(self, validators={}, empty_ok=False, ignore_case=False, ignore_space=False, metadata=[], expected_rows=None): self.metadata = metadata self.logger = logs.logger self.validators = validators or getattr(self, "validators", {}) + self.logs = [] # Will be overriden by validators config self.empty_ok = empty_ok self.ignore_case = ignore_case self.ignore_space = ignore_space + self.expected_rows = expected_rows # self.trim_values = False for validator in self.validators.values(): validator._set_attributes(self.empty_ok, self.ignore_case, self.ignore_space) + def debug(self, message): + self.logger.debug(message) + self.logs.append("Debug: {}".format(message)) + + def info(self, message): + self.logger.info(message) + self.logs.append("Info: {}".format(message)) + + def warn(self, message): + self.logger.warn(message) + self.logs.append("Warning: {}".format(message)) + + def error(self, message): + self.logger.error(message) + self.logs.append("Error: {}".format(message)) + def load_from_python_file(self, file_path): # Limit conflicts in file name with tempfile.TemporaryDirectory() as dirpath: @@ -44,7 +62,7 @@ def load_from_python_file(self, file_path): custom_class = list(filtered_classes.values())[0] if not custom_class: - self.logger.error( + self.error( "Could not find a subclass of Checkplate in the provided file." 
) return exits.UNAVAILABLE @@ -53,13 +71,21 @@ def load_from_python_file(self, file_path): self.empty_ok = getattr(custom_class, 'empty_ok', False) self.ignore_case = getattr(custom_class, 'ignore_case', False) self.ignore_space = getattr(custom_class, 'ignore_space', False) + self.expected_rows = getattr(custom_class, 'expected_rows', 0) + try: + self.expected_rows = int(self.expected_rows) + except ValueError: + self.error( + "Malformed Checkcel template: expected_rows is not an integer" + ) + for key, validator in self.validators.items(): validator._set_attributes(self.empty_ok, self.ignore_case, self.ignore_space) return self def load_from_json_file(self, file_path): if not os.path.isfile(file_path): - self.logger.error( + self.error( "Could not find a file at path {}".format(file_path) ) return exits.NOINPUT @@ -71,7 +97,7 @@ def load_from_json_file(self, file_path): def load_from_yaml_file(self, file_path): if not os.path.isfile(file_path): - self.logger.error( + self.error( "Could not find a file at path {}".format(file_path) ) return exits.NOINPUT @@ -80,7 +106,7 @@ def load_from_yaml_file(self, file_path): try: data = yaml.safe_load(f) except yaml.YAMLError: - self.logger.error( + self.error( "File {} is not a valid yaml file".format(file_path) ) return exits.UNAVAILABLE @@ -104,7 +130,7 @@ def _is_valid_template(self, tup): def _load_from_dict(self, data): if 'validators' not in data or not isinstance(data['validators'], list): - self.logger.error( + self.error( "Could not find a list of validators in data" ) return exits.UNAVAILABLE @@ -112,13 +138,21 @@ def _load_from_dict(self, data): self.empty_ok = data.get("empty_ok", False) self.ignore_case = data.get('ignore_case', False) self.ignore_space = data.get('ignore_space', False) + self.expected_rows = data.get('expected_rows', 0) + try: + self.expected_rows = int(self.expected_rows) + except ValueError: + self.error( + "Malformed Checkcel template: expected_rows is not an integer" + ) + validators_list = [] self.validators = {} self.metadata = data.get('metadata', []) for validator in data['validators']: if 'type' not in validator or 'name' not in validator: - self.logger.error( + self.error( "Malformed Checkcel Validator. 
Require both 'type' and 'name' key" ) return exits.UNAVAILABLE @@ -129,7 +163,7 @@ def _load_from_dict(self, data): val = validator_class(**options) val._set_attributes(self.empty_ok, self.ignore_case, self.ignore_space) except AttributeError: - self.logger.error( + self.error( "{} is not a valid Checkcel Validator".format(validator['type']) ) return exits.UNAVAILABLE diff --git a/checkcel/checkxtractor.py b/checkcel/checkxtractor.py index 7e242ae..604d311 100644 --- a/checkcel/checkxtractor.py +++ b/checkcel/checkxtractor.py @@ -1,4 +1,3 @@ -from checkcel import logs from openpyxl import load_workbook from openpyxl.worksheet.cell_range import CellRange from openpyxl.utils import get_column_letter @@ -10,7 +9,6 @@ class Checkxtractor(object): """ Extract validation value from xlsx file (only) """ def __init__(self, source, output, sheet=0, row=0, template_type="python"): - self.logger = logs.logger self.source = source self.output = output self.sheet = int(sheet) diff --git a/checkcel/validators.py b/checkcel/validators.py index 586a923..f9c8e5c 100644 --- a/checkcel/validators.py +++ b/checkcel/validators.py @@ -18,7 +18,7 @@ class Validator(object): """ Generic Validator class """ - def __init__(self, empty_ok=None, ignore_case=None, ignore_space=None, empty_ok_if=None, empty_ok_unless=None, readme=None): + def __init__(self, empty_ok=None, ignore_case=None, ignore_space=None, empty_ok_if=None, empty_ok_unless=None, readme=None, unique=False): self.logger = logs.logger self.invalid_dict = defaultdict(set) self.fail_count = 0 @@ -29,6 +29,8 @@ def __init__(self, empty_ok=None, ignore_case=None, ignore_space=None, empty_ok_ self.empty_ok_unless = empty_ok_unless self.empty_check = True if not (empty_ok_if or empty_ok_unless) else False self.readme = readme + self.unique = unique + self.unique_values = set() if empty_ok_if: if not (isinstance(empty_ok_if, dict) or isinstance(empty_ok_if, list) or isinstance(empty_ok_if, str)): @@ -144,17 +146,29 @@ def validate(self, field, row_number, row): "Field cannot be empty" ) + if field and self.unique: + if field in self.unique_values: + raise ValidationException("'{}' is already in the column".format(field)) + self.unique_values.add(field) + @property def bad(self): return self.invalid_dict def generate(self, column): - return None + if self.unique: + params = {"type": "custom", "allow_blank": self.empty_ok} + internal_value = "${0}:${0},{0}2".format(column) + params["formula1"] = '=COUNTIF({})<2'.format(internal_value) + dv = DataValidation(**params) + dv.error = 'Value must be unique' + dv.add("{}2:{}1048576".format(column, column)) + return dv def describe(self, column_name): if self.readme: column_name += " ({})".format(self.readme) - return "{} : Free text {}".format(column_name, "(required)" if not self.empty_ok else "") + return "{} : Free text {}{}".format(column_name, "(required)" if not self.empty_ok else "", " (unique)" if self.unique else "") class CastValidator(Validator): @@ -174,7 +188,10 @@ def validate(self, field, row_number, row): try: if field or not self._can_be_empty(row): - field = self.cast(field) + field = float(field) + if self.type == "whole" and not (field).is_integer(): + raise ValueError + if self.min is not None and field < self.min: self.invalid_dict["invalid_set"].add(field) self.invalid_dict["invalid_rows"].add(row_number) @@ -184,6 +201,11 @@ def validate(self, field, row_number, row): self.invalid_dict["invalid_rows"].add(row_number) raise ValidationException("{} is above max value {}".format(field, self.max)) + 
if field and self.unique: + if field in self.unique_values: + raise ValidationException("'{}' is already in the column".format(field)) + self.unique_values.add(field) + except ValueError as e: self.invalid_dict["invalid_set"].add(field) self.invalid_dict["invalid_rows"].add(row_number) @@ -194,7 +216,7 @@ def bad(self): return self.invalid_dict def generate(self, column): - params = {"type": self.type} + params = {"type": self.type, "allow_blank": self.empty_ok} if (self.min is not None and self.max is not None): params["formula1"] = self.min params["formula2"] = self.max @@ -223,6 +245,9 @@ def describe(self, column_name): if not self.empty_ok: text += " (required)" + if self.unique: + text += " (unique)" + return text @@ -231,7 +256,6 @@ class FloatValidator(CastValidator): def __init__(self, **kwargs): super(FloatValidator, self).__init__(**kwargs) - self.cast = float self.type = "decimal" @@ -240,7 +264,6 @@ class IntValidator(CastValidator): def __init__(self, **kwargs): super(IntValidator, self).__init__(**kwargs) - self.cast = int self.type = "whole" @@ -249,8 +272,8 @@ class SetValidator(Validator): def __init__(self, valid_values=set(), **kwargs): super(SetValidator, self).__init__(**kwargs) - self.ordered_values = valid_values - self.valid_values = set(valid_values) + self.ordered_values = [str(val) for val in valid_values] + self.valid_values = set([str(val) for val in valid_values]) if self.empty_ok: self.valid_values.add("") @@ -276,6 +299,10 @@ def validate(self, field, row_number, row): raise ValidationException( "'{}' is invalid".format(field) ) + if field and self.unique: + if field in self.unique_values: + raise ValidationException("'{}' is already in the column".format(field)) + self.unique_values.add(field) def _set_attributes(self, empty_ok_template, ignore_case_template, ignore_space_template): # Override with template value if it was not set (default to None) @@ -299,7 +326,7 @@ def bad(self): def generate(self, column, column_name="", additional_column=None, additional_worksheet=None): # If total length > 256 : need to use cells on another sheet if additional_column and additional_worksheet: - params = {"type": "list"} + params = {"type": "list", "allow_blank": self.empty_ok} cell = additional_worksheet.cell(column=column_index_from_string(additional_column), row=1, value=column_name) cell.font = Font(color="FF0000", bold=True) row = 2 @@ -308,7 +335,7 @@ def generate(self, column, column_name="", additional_column=None, additional_wo row += 1 params["formula1"] = "{}!${}$2:${}${}".format(quote_sheetname(additional_worksheet.title), additional_column, additional_column, row - 1) else: - params = {"type": "list"} + params = {"type": "list", "allow_blank": self.empty_ok} values = ",".join(self.ordered_values) params["formula1"] = '"{}"'.format(values) dv = DataValidation(**params) @@ -318,7 +345,7 @@ def generate(self, column, column_name="", additional_column=None, additional_wo def describe(self, column_name): if self.readme: column_name += " ({})".format(self.readme) - return "{} : (Allowed values : {}) {}".format(column_name, ", ".join(self.ordered_values), "(required)" if not self.empty_ok else "") + return "{} : (Allowed values : {}) {}{}".format(column_name, ", ".join(self.ordered_values), "(required)" if not self.empty_ok else "", "(unique)" if self.unique else "") class LinkedSetValidator(Validator): @@ -361,6 +388,11 @@ def validate(self, field, row_number, row): self.invalid_dict["invalid_rows"].add(row_number) raise ValidationException("Value {} is not in 
allowed values".format(field)) + if field and self.unique: + if field in self.unique_values: + raise ValidationException("'{}' is already in the column".format(field)) + self.unique_values.add(field) + @property def bad(self): return self.invalid_dict @@ -369,7 +401,7 @@ def generate(self, column, set_columns, column_name, additional_column, addition if self.linked_column not in set_columns: # TODO raise warning return None - params = {"type": "list"} + params = {"type": "list", "allow_blank": self.empty_ok} additional_worksheet.cell(column=column_index_from_string(additional_column), row=1, value=column_name).font = Font(color="FF0000", bold=True) row = 2 row_dict = {} @@ -392,7 +424,7 @@ def generate(self, column, set_columns, column_name, additional_column, addition def describe(self, column_name): if self.readme: column_name += " ({})".format(self.readme) - return "{} : Linked values to column {} {}".format(column_name, self.linked_column, "(required)" if not self.empty_ok else "") + return "{} : Linked values to column {} {}{}".format(column_name, self.linked_column, "(required)" if not self.empty_ok else "", "(unique)" if self.unique else "") class DateValidator(Validator): @@ -440,6 +472,11 @@ def validate(self, field, row_number, row): self.invalid_dict["invalid_rows"].add(row_number) raise ValidationException("Value {} is not after {}".format(field, self.after)) + if field and self.unique: + if field in self.unique_values: + raise ValidationException("'{}' is already in the column".format(field)) + self.unique_values.add(field) + except parser.ParserError as e: self.invalid_dict["invalid_set"].add(field) self.invalid_dict["invalid_rows"].add(row_number) @@ -451,7 +488,7 @@ def bad(self): def generate(self, column, additional_column=None, additional_worksheet=None): # GreaterThanOrEqual for validity with ODS. - params = {"type": "date"} + params = {"type": "date", "allow_blank": self.empty_ok} if (self.before is not None and self.after is not None): params["formula1"] = parser.parse(self.after).strftime("%Y/%m/%d") params["formula2"] = parser.parse(self.before).strftime("%Y/%m/%d") @@ -462,6 +499,9 @@ def generate(self, column, additional_column=None, additional_worksheet=None): elif self.after is not None: params["formula1"] = parser.parse(self.after).strftime("%Y/%m/%d") params["operator"] = "greaterThanOrEqual" + else: + params["formula1"] = "01/01/1900" + params["operator"] = "greaterThanOrEqual" dv = DataValidation(**params) dv.add("{}2:{}1048576".format(column, column)) @@ -480,6 +520,8 @@ def describe(self, column_name): if not self.empty_ok: text += " (required)" + if self.unique: + text += " (unique)" return text @@ -527,6 +569,11 @@ def validate(self, field, row_number, row): self.invalid_dict["invalid_rows"].add(row_number) raise ValidationException("Value {} is not after {}".format(field, self.after)) + if field and self.unique: + if field in self.unique_values: + raise ValidationException("'{}' is already in the column".format(field)) + self.unique_values.add(field) + except parser.ParserError as e: self.invalid_dict["invalid_set"].add(field) self.invalid_dict["invalid_rows"].add(row_number) @@ -539,7 +586,7 @@ def bad(self): def generate(self, column, additional_column=None, additional_worksheet=None): # GreaterThanOrEqual for validity with ODS. 
- params = {"type": "time"} + params = {"type": "time", "allow_blank": self.empty_ok} if (self.before is not None and self.after is not None): params["formula1"] = parser.parse(self.after).strftime("%H:%M:%S") params["formula2"] = parser.parse(self.before).strftime("%H:%M:%S") @@ -568,6 +615,8 @@ def describe(self, column_name): if not self.empty_ok: text += " (required)" + if self.unique: + text += " (unique)" return text @@ -591,13 +640,17 @@ def validate(self, field, row_number, row): self.invalid_dict["invalid_set"].add(field) self.invalid_dict["invalid_rows"].add(row_number) raise ValidationException(e) + if self.unique: + if field in self.unique_values: + raise ValidationException("'{}' is already in the column".format(field)) + self.unique_values.add(field) @property def bad(self): return self.invalid_dict def generate(self, column, ontology_column=None): - params = {"type": "custom"} + params = {"type": "custom", "allow_blank": self.empty_ok} params["formula1"] = '=ISNUMBER(MATCH("*@*.?*",{}2,0))'.format(column) dv = DataValidation(**params) dv.error = 'Value must be an email' @@ -607,7 +660,7 @@ def generate(self, column, ontology_column=None): def describe(self, column_name): if self.readme: column_name += " ({})".format(self.readme) - return "{} : Email {}".format(column_name, "(required)" if not self.empty_ok else "") + return "{} : Email {}{}".format(column_name, "(required)" if not self.empty_ok else "", "(unique)" if self.unique else "") class OntologyValidator(Validator): @@ -647,6 +700,10 @@ def validate(self, field, row_number, row): self.invalid_dict["invalid_rows"].add(row_number) raise ValidationException("{} is not an ontological term".format(field)) self.validated_terms.add(field) + if field and self.unique: + if field in self.unique_values: + raise ValidationException("'{}' is already in the column".format(field)) + self.unique_values.add(field) @property def bad(self): @@ -661,7 +718,7 @@ def generate(self, column, additional_column, additional_worksheet): additional_worksheet.cell(column=column_index_from_string(additional_column), row=row, value=term) row += 1 - params = {"type": "list"} + params = {"type": "list", "allow_blank": self.empty_ok} params["formula1"] = "{}!${}$2:${}${}".format(quote_sheetname(additional_worksheet.title), additional_column, additional_column, row - 1) dv = DataValidation(**params) dv.error = 'Value must be an ontological term' @@ -676,6 +733,8 @@ def describe(self, column_name): text += " Root term is : {}".format(self.root_term) if not self.empty_ok: text += " (required)" + if self.unique: + text += " (unique)" return text def _validate_ontological_term(self, term, return_uri=False): @@ -788,7 +847,7 @@ def generate(self, column, column_dict): if self.unique_with and not all([val in column_dict for val in self.unique_with]): raise BadValidatorException("Using unique_with, but the related column was not defined before") - params = {"type": "custom"} + params = {"type": "custom", "allow_blank": self.empty_ok} internal_value = "${0}:${0},{0}2".format(column) if self.unique_with: for col in self.unique_with: @@ -854,6 +913,11 @@ def validate(self, field, row_number, row): raise ValidationException("{} is not an ontological term".format(field)) self.validated_terms.add(field) + if field and self.unique: + if field in self.unique_values: + raise ValidationException("'{}' is already in the column".format(field)) + self.unique_values.add(field) + @property def bad(self): return self.invalid_dict @@ -882,7 +946,7 @@ def generate(self, column, 
additional_column, additional_worksheet): additional_worksheet.cell(column=column_index_from_string(additional_column), row=row, value=term) row += 1 - params = {"type": "list"} + params = {"type": "list", "allow_blank": self.empty_ok} params["formula1"] = "{}!${}$2:${}${}".format(quote_sheetname(additional_worksheet.title), additional_column, additional_column, row - 1) dv = DataValidation(**params) dv.error = 'Value must be from Vocabulaires ouverts' @@ -897,6 +961,8 @@ def describe(self, column_name): text += " Root term is : {}".format(self.root_term) if not self.empty_ok: text += " (required)" + if self.unique: + text += " (unique)" return text def _validate_vo_term(self, field, return_uri=False): @@ -976,6 +1042,11 @@ def validate(self, field, row_number, row): self.invalid_dict["invalid_rows"].add(row_number) raise ValidationException("{} does not match regex {}".format(field, self.regex)) + if field and self.unique: + if field in self.unique_values: + raise ValidationException("'{}' is already in the column".format(field)) + self.unique_values.add(field) + @property def bad(self): return self.invalid_dict @@ -983,6 +1054,14 @@ def bad(self): def generate(self, column): # Difficult to use regex in Excel without a VBA macro if not self.excel_formula: + if self.unique: + params = {"type": "custom", "allow_blank": self.empty_ok} + internal_value = "${0}:${0},{0}2".format(column) + params["formula1"] = '=COUNTIF({})<2'.format(internal_value) + dv = DataValidation(**params) + dv.error = 'Value must be unique' + dv.add("{}2:{}1048576".format(column, column)) + return dv self.logger.warning( "Warning: RegexValidator does not generate a validated column" ) @@ -1001,6 +1080,8 @@ def describe(self, column_name): text = "{} : Term matching the regex {}.".format(column_name, self.regex) if not self.empty_ok: text += " (required)" + if self.unique: + text += " (unique)" return text @@ -1050,6 +1131,10 @@ def validate(self, field, row_number, row): self.invalid_dict["invalid_set"].add(field) self.invalid_dict["invalid_rows"].add(row_number) raise ValidationException("{} is not a valid GPS coordinate") + if field and self.unique: + if field in self.unique_values: + raise ValidationException("'{}' is already in the column".format(field)) + self.unique_values.add(field) @property def bad(self): @@ -1057,6 +1142,14 @@ def bad(self): def generate(self, column): # Difficult to use regex in Excel without a VBA macro + if self.unique: + params = {"type": "custom", "allow_blank": self.empty_ok} + internal_value = "${0}:${0},{0}2".format(column) + params["formula1"] = '=COUNTIF({})<2'.format(internal_value) + dv = DataValidation(**params) + dv.error = 'Value must be unique' + dv.add("{}2:{}1048576".format(column, column)) + return dv self.logger.warning( "Warning: GPSValidator does not generate a validated column" ) @@ -1068,4 +1161,6 @@ def describe(self, column_name): text = "{} : GPS coordinate".format(column_name) if not self.empty_ok: text += " (required)" + if self.unique: + text += " (unique)" return text diff --git a/tests/test_validate.py b/tests/test_validate.py index 89a0b7a..68a1fa3 100644 --- a/tests/test_validate.py +++ b/tests/test_validate.py @@ -1,7 +1,30 @@ import pandas as pd from checkcel import Checkcel -from checkcel.validators import TextValidator, DateValidator, UniqueValidator, SetValidator, LinkedSetValidator, IntValidator, FloatValidator, GPSValidator, EmailValidator, TimeValidator, NoValidator +from checkcel.validators import TextValidator, DateValidator, UniqueValidator, 
SetValidator, LinkedSetValidator, IntValidator, FloatValidator, GPSValidator, EmailValidator, TimeValidator, NoValidator, RegexValidator + + +class TestCheckcelClass(): + + def test_invalid_rows_below(self): + data = {'my_column': ['myvalue', 'my_value2']} + validators = {'my_column': TextValidator()} + df = pd.DataFrame.from_dict(data) + validation = Checkcel(data=df, expected_rows=1, validators=validators) + val = validation.validate() + assert val is False + assert len(validation.logs) == 2 + assert validation.logs[1] == "Error: Length issue: Expecting 1 row(s), found 2" + + def test_invalid_rows_above(self): + data = {'my_column': ['myvalue']} + validators = {'my_column': TextValidator()} + df = pd.DataFrame.from_dict(data) + validation = Checkcel(data=df, expected_rows=2, validators=validators) + val = validation.validate() + assert val is False + assert len(validation.logs) == 2 + assert validation.logs[1] == "Error: Length issue: Expecting 2 row(s), found 1" class TestCheckcelValidateText(): @@ -15,9 +38,18 @@ def test_invalid_empty(self): assert val is False assert len(validation.failures['my_column']) == 1 + def test_invalid_unique(self): + data = {'my_column': ['myvalue', 'myvalue']} + validators = {'my_column': TextValidator(unique=True)} + df = pd.DataFrame.from_dict(data) + validation = Checkcel(data=df, validators=validators) + val = validation.validate() + assert val is False + assert len(validation.failures['my_column']) == 1 + def test_valid_empty(self): - data = {'my_column': ['', 'myvalue']} - validators = {'my_column': TextValidator()} + data = {'my_column': ['', 'myvalue', '']} + validators = {'my_column': TextValidator(unique=True)} df = pd.DataFrame.from_dict(data) val = Checkcel(data=df, empty_ok=True, validators=validators) assert val.validate() @@ -50,6 +82,15 @@ def test_invalid_empty(self): assert val is False assert len(validation.failures['my_column']) == 1 + def test_invalid_unique(self): + data = {'my_column': [1, 1]} + validators = {'my_column': FloatValidator(unique=True)} + df = pd.DataFrame.from_dict(data) + validation = Checkcel(data=df, validators=validators) + val = validation.validate() + assert val is False + assert len(validation.failures['my_column']) == 1 + def invalid_min(self): data = {'my_column': [6, 4]} validators = {'my_column': FloatValidator(min=5)} @@ -78,8 +119,8 @@ def invalid_both(self): assert len(validation.failures['my_column']) == 2 def test_valid_empty(self): - data = {'my_column': ['', 6]} - validators = {'my_column': FloatValidator()} + data = {'my_column': ['', 6, '']} + validators = {'my_column': FloatValidator(unique=True)} df = pd.DataFrame.from_dict(data) val = Checkcel(data=df, empty_ok=True, validators=validators) assert val.validate() @@ -112,6 +153,15 @@ def test_invalid_float(self): assert val is False assert len(validation.failures['my_column']) == 1 + def test_invalid_unique(self): + data = {'my_column': [1, 1]} + validators = {'my_column': IntValidator(unique=True)} + df = pd.DataFrame.from_dict(data) + validation = Checkcel(data=df, validators=validators) + val = validation.validate() + assert val is False + assert len(validation.failures['my_column']) == 1 + def test_invalid_empty(self): data = {'my_column': ['', 6]} validators = {'my_column': IntValidator()} @@ -149,8 +199,8 @@ def invalid_both(self): assert len(validation.failures['my_column']) == 2 def test_valid_empty(self): - data = {'my_column': ['', 6]} - validators = {'my_column': IntValidator()} + data = {'my_column': ['', 6, '']} + validators = 
{'my_column': IntValidator(unique=True)} df = pd.DataFrame.from_dict(data) val = Checkcel(data=df, empty_ok=True, validators=validators) assert val.validate() @@ -183,9 +233,18 @@ def test_invalid_empty(self): assert val is False assert len(validation.failures['my_column']) == 1 + def test_invalid_unique(self): + data = {'my_column': ['validemail@emailprovider.com', 'validemail@emailprovider.com']} + validators = {'my_column': EmailValidator(unique=True)} + df = pd.DataFrame.from_dict(data) + validation = Checkcel(data=df, empty_ok=False, validators=validators) + val = validation.validate() + assert val is False + assert len(validation.failures['my_column']) == 1 + def test_valid_empty(self): - data = {'my_column': ['', 'validemail@emailprovider.com']} - validators = {'my_column': EmailValidator()} + data = {'my_column': ['', 'validemail@emailprovider.com', '']} + validators = {'my_column': EmailValidator(unique=True)} df = pd.DataFrame.from_dict(data) val = Checkcel(data=df, empty_ok=True, validators=validators) assert val.validate() @@ -236,9 +295,18 @@ def test_invalid_empty(self): assert val is False assert len(validation.failures['my_column']) == 1 + def test_invalid_unique(self): + data = {'my_column': ['01/01/1970', '01/01/1970']} + validators = {'my_column': DateValidator(unique=True)} + df = pd.DataFrame.from_dict(data) + validation = Checkcel(data=df, empty_ok=False, validators=validators) + val = validation.validate() + assert val is False + assert len(validation.failures['my_column']) == 1 + def test_valid_empty(self): - data = {'my_column': ['', '01/01/1970']} - validators = {'my_column': DateValidator()} + data = {'my_column': ['', '01/01/1970', '']} + validators = {'my_column': DateValidator(unique=True)} df = pd.DataFrame.from_dict(data) val = Checkcel(data=df, empty_ok=True, validators=validators) assert val.validate() @@ -289,9 +357,18 @@ def test_invalid_empty(self): assert val is False assert len(validation.failures['my_column']) == 1 + def test_invalid_unique(self): + data = {'my_column': ['13h10', '13h10']} + validators = {'my_column': TimeValidator(unique=True)} + df = pd.DataFrame.from_dict(data) + validation = Checkcel(data=df, empty_ok=False, validators=validators) + val = validation.validate() + assert val is False + assert len(validation.failures['my_column']) == 1 + def test_valid_empty(self): - data = {'my_column': ['', '13h10']} - validators = {'my_column': TimeValidator()} + data = {'my_column': ['', '13h10', '']} + validators = {'my_column': TimeValidator(unique=True)} df = pd.DataFrame.from_dict(data) val = Checkcel(data=df, empty_ok=True, validators=validators) assert val.validate() @@ -375,9 +452,18 @@ def test_invalid_empty(self): assert val is False assert len(validation.failures['my_column']) == 1 + def test_invalid_unique(self): + data = {'my_column': ['valid_value', 'valid_value']} + validators = {'my_column': SetValidator(unique=True, valid_values=["valid_value"])} + df = pd.DataFrame.from_dict(data) + validation = Checkcel(data=df, empty_ok=False, validators=validators) + val = validation.validate() + assert val is False + assert len(validation.failures['my_column']) == 1 + def test_valid_empty(self): - data = {'my_column': ['', 'valid_value']} - validators = {'my_column': SetValidator(valid_values=["valid_value"])} + data = {'my_column': ['', 'valid_value', '']} + validators = {'my_column': SetValidator(unique=True, valid_values=["valid_value"])} df = pd.DataFrame.from_dict(data) val = Checkcel(data=df, empty_ok=True, validators=validators) 
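
The surrounding tests exercise the new expected_rows, unique and logs behaviour through pytest. As a compact usage sketch outside the test suite, based on the same calls used in tests/test_validate.py (column names and values are invented):

```python
import pandas as pd

from checkcel import Checkcel
from checkcel.validators import IntValidator, TextValidator

# Invented data: two rows, with a duplicated identifier.
df = pd.DataFrame({"sample": ["A1", "A1"], "count": [3, 7]})

validation = Checkcel(
    data=df,
    expected_rows=2,
    validators={
        "sample": TextValidator(unique=True),
        "count": IntValidator(min=0),
    },
)

if not validation.validate():
    # Messages are now kept in the logs list as well as sent to the logger.
    for line in validation.logs:
        print(line)
    # Per-column failure details, as asserted on in the tests.
    print(validation.failures)
```
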
assert val.validate() @@ -416,11 +502,23 @@ def test_invalid_empty(self): assert val is False assert len(validation.failures['another_column']) == 1 + def test_invalid_unique(self): + data = {'my_column': ['value_1', 'value_2', 'value2'], "another_column": ["valid_value", "another_valid_value", "another_valid_value"]} + validators = { + 'my_column': SetValidator(valid_values=['value_1', 'value_2']), + 'another_column': LinkedSetValidator(unique=True, linked_column="my_column", valid_values={"value_1": ["valid_value"], "value_2": ["another_valid_value"]}) + } + df = pd.DataFrame.from_dict(data) + validation = Checkcel(data=df, empty_ok=False, validators=validators) + val = validation.validate() + assert val is False + assert len(validation.failures['another_column']) == 1 + def test_valid_empty(self): - data = {'my_column': ['value_1', 'value_2', 'value_2'], "another_column": ["valid_value", "another_valid_value", ""]} + data = {'my_column': ['value_1', 'value_2', 'value_2', 'value_2'], "another_column": ["valid_value", "another_valid_value", "", ""]} validators = { 'my_column': SetValidator(valid_values=['value_1', 'value_2']), - 'another_column': LinkedSetValidator(linked_column="my_column", valid_values={"value_1": ["valid_value"], "value_2": ["another_valid_value"]}) + 'another_column': LinkedSetValidator(unique=True, linked_column="my_column", valid_values={"value_1": ["valid_value"], "value_2": ["another_valid_value"]}) } df = pd.DataFrame.from_dict(data) val = Checkcel(data=df, empty_ok=True, validators=validators) @@ -486,9 +584,18 @@ def test_invalid_empty(self): assert val is False assert len(validation.failures['my_column']) == 1 + def test_invalid_unique(self): + data = {'my_column': ['46.174181N 14.801100E', '46.174181N 14.801100E']} + validators = {'my_column': GPSValidator(unique=True)} + df = pd.DataFrame.from_dict(data) + validation = Checkcel(data=df, empty_ok=False, validators=validators) + val = validation.validate() + assert val is False + assert len(validation.failures['my_column']) == 1 + def test_valid_empty(self): - data = {'my_column': ['', '46.174181N 14.801100E']} - validators = {'my_column': GPSValidator()} + data = {'my_column': ['', '46.174181N 14.801100E', '']} + validators = {'my_column': GPSValidator(unique=True)} df = pd.DataFrame.from_dict(data) val = Checkcel(data=df, empty_ok=True, validators=validators) assert val.validate() @@ -501,6 +608,50 @@ def test_valid(self): assert val.validate() +class TestCheckcelValidateRegex(): + + def test_invalid(self): + data = {'my_column': ['ABC', 'AFX123']} + validators = {'my_column': RegexValidator(regex="AFX.*")} + df = pd.DataFrame.from_dict(data) + validation = Checkcel(data=df, empty_ok=False, validators=validators) + val = validation.validate() + assert val is False + assert len(validation.failures['my_column']) == 1 + + def test_invalid_empty(self): + data = {'my_column': ['', 'AFX123']} + validators = {'my_column': RegexValidator(regex="AFX.*")} + df = pd.DataFrame.from_dict(data) + validation = Checkcel(data=df, empty_ok=False, validators=validators) + val = validation.validate() + assert val is False + assert len(validation.failures['my_column']) == 1 + + def test_invalid_unique(self): + data = {'my_column': ['AFX123', 'AFX123']} + validators = {'my_column': RegexValidator(unique=True, regex="AFX.*")} + df = pd.DataFrame.from_dict(data) + validation = Checkcel(data=df, validators=validators) + val = validation.validate() + assert val is False + assert len(validation.failures['my_column']) == 1 + + def 
test_valid_empty(self): + data = {'my_column': ['', 'AFX123', '']} + validators = {'my_column': RegexValidator(unique=True, regex="AFX.*")} + df = pd.DataFrame.from_dict(data) + val = Checkcel(data=df, empty_ok=True, validators=validators) + assert val.validate() + + def test_valid(self): + data = {'my_column': ['AFX123', 'AFX456']} + validators = {'my_column': RegexValidator(regex="AFX.*")} + df = pd.DataFrame.from_dict(data) + val = Checkcel(data=df, validators=validators) + assert val.validate() + + class TestCheckcelValidateEmpty_if(): def test_invalid_string(self):
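
To tie the template-level changes together, here is a sketch of a python-format template using the new expected_rows key together with the unique option. The class name and columns are invented, and the import path for Checkplate is assumed from the checkcel/checkplate.py layout in this diff; json/yaml templates expose the same settings through the top-level expected_rows key and each validator's options entry.

```python
from checkcel.checkplate import Checkplate
from checkcel.validators import DateValidator, IntValidator, TextValidator


class MySamplesTemplate(Checkplate):
    # New template-level key: the file must contain exactly 3 data rows.
    expected_rows = 3
    empty_ok = False
    validators = {
        "Sample ID": TextValidator(unique=True),  # new unique option
        "Count": IntValidator(min=0),
        "Collection date": DateValidator(),
    }
```

Loading such a file with load_from_python_file() picks up expected_rows through getattr, just as the json/yaml loader reads the expected_rows key from the parsed data.
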