Various (#23)

genouest · Nov 29, 2022 · 1add948 · 1add948
1 parent 89a51df
commit 1add948
Show file tree

Hide file tree

Showing 7 changed files with 380 additions and 75 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -14,3 +14,14 @@ This changelog was started for release 0.0.3.
 - empty_ok_if key for validator
 - empty_ok_unless key for validator
 - readme key for validator
+- unique key for validator
+- expected_rows key for templates
+- logs parameters for templates
+
+### Fixed
+
+- Bug for setValidator when using number values
+
+### Changed
+
+- Better validation for integers
diff --git a/README.md b/README.md
@@ -64,6 +64,8 @@ Invalid fields: [''] in rows: [3]
 IntValidator failed 5 time(s) (100.0%) on field: 'Pierraille surface (25)'
 ```
 
+When calling validate() (from Python), you can access the list of log messages through the 'logs' attribute of the Checkcel/Checkxtractor/Checkerator class
+
 # Python library
 
 ```python
@@ -104,20 +106,13 @@ Validation templates can use three formats: json/yaml, and python files.
 In all cases, you will need to at least include a list of validators and associated column names. Several optional parameters are also available :
 
 * *metadata*: A list of column names. This will create a metadata sheet with these columns, without validation on them
+* *expected_rows* (Default 0): Number of *data* rows expected (0 disables the check)
 * *empty_ok* (Default False): Whether to accept empty values as valid
-* *empty_ok_if* (Default None): Accept empty value as valid if **another column** value is set
-    * Accept either a string (column name), a list (list of column names), or a dict
-      * The dict keys must be column names, and the values lists of 'accepted values'. The current column will accept empty values if the related column's value is in the list of accepted values
-* *empty_ok_unless* (Default None): Accept empty value as valid *unless* **another column** value is set
-    * Accept either a string (column name), a list (list of column names), or a dict
-      * The dict keys must be column names, and the values lists of 'rejected values'. The current column will accept empty values if the related column's value is **not** in the list of reject values
 * *ignore_space* (Default False): whether to trim the values for spaces before checking validity
 * *ignore_case* (Default False): whether to ignore the case
-* *readme* (Default None): Additional information to include on the readme page
 
 The last 3 parameters will affect all the validators (when relevant), but can be overridden at the validator level (e.g., you can set 'empty_ok' to True for all, but set it to False for a specific validator).
 
-
 ## Python format
 
 A template needs to contain a class inheriting the Checkplate class.  
@@ -147,13 +142,29 @@ If needed, these dictionnaries can include an 'options' key, containing a dictio
 
 ## Validators
 
-All validators (except NoValidator) have the 'empty_ok' option, which will consider empty values as valid.
-*As in-file validation for non-empty values is unreliable, the non-emptyness is not checked in-file*
+### Global options
+
+All validators (except NoValidator) have these options available. If relevant, these options will override the ones set at the template-level
+
+* *empty_ok* (Default False): Whether to accept empty values as valid (Not enforced in excel)
+* *empty_ok_if* (Default None): Accept empty value as valid if **another column** value is set
+    * Accept either a string (column name), a list (list of column names), or a dict (Not enforced in excel)
+      * The dict keys must be column names, and the values lists of 'accepted values'. The current column will accept empty values if the related column's value is in the list of accepted values
+* *empty_ok_unless* (Default None): Accept empty value as valid *unless* **another column** value is set. (Not enforced in excel)
+    * Accept either a string (column name), a list (list of column names), or a dict
+      * The dict keys must be column names, and the values lists of 'rejected values'. The current column will accept empty values if the related column's value is **not** in the list of rejected values
+* *ignore_space* (Default False): whether to trim the values for spaces before checking validity
+* *ignore_case* (Default False): whether to ignore the case
+* *unique* (Default False): whether to enforce uniqueness for this column. (Not enforced in excel yet, except when there is no other validation (i.e. TextValidator, and RegexValidator in some cases))
+
+*As excel validation for non-empty values is unreliable, the non-emptiness cannot be properly enforced in excel files*
+
+### Validator-specific options
 
 * NoValidator (always True)
   * **No in-file validation generated**
 * TextValidator(empty_ok=False)
-  * **No in-file validation generated**
+  * **No in-file validation generated** (unless *unique* is set)
 * IntValidator(min="", max="", empty_ok=False)
   * Validate that a value is an integer
   * *min*: Minimal value allowed

diff --git a/checkcel/checkcel.py b/checkcel/checkcel.py
@@ -41,16 +41,16 @@ def __init__(
 
     def _log_debug_failures(self):
         for field_name, field_failure in self.failures.items():
-            self.logger.debug('\nFailure on field: "{}":'.format(field_name))
+            self.debug('\nFailure on field: "{}":'.format(field_name))
             for i, (row, errors) in enumerate(field_failure.items()):
-                self.logger.debug("  {}:{}".format(self.source, row))
+                self.debug("  {}:{}".format(self.source, row))
                 for error in errors:
-                    self.logger.debug("    {}".format(error))
+                    self.debug("    {}".format(error))
 
     def _log_validator_failures(self):
         for field_name, validator in self.validators.items():
             if validator.bad:
-                self.logger.error(
+                self.error(
                     "  {} failed {} time(s) ({:.1%}) on field: '{}'".format(
                         validator.__class__.__name__,
                         validator.fail_count,
@@ -64,22 +64,22 @@ def _log_validator_failures(self):
                     data = validator.bad
                     wrong_terms = ", ".join(["'{}'".format(val) for val in data["invalid_set"]])
                     wrong_rows = ", ".join([str(val) for val in data["invalid_rows"]])
-                    self.logger.error(
+                    self.error(
                         "    Invalid fields: [{}] in rows: [{}]".format(wrong_terms, wrong_rows)
                     )
                 except TypeError as e:
                     raise e
 
     def _log_missing_validators(self):
-        self.logger.error("  Missing validators for:")
+        self.error("  Missing validators for:")
         self._log_missing(self.missing_validators)
 
     def _log_missing_fields(self):
-        self.logger.error("  Missing expected fields:")
+        self.error("  Missing expected fields:")
         self._log_missing(self.missing_fields)
 
     def _log_missing(self, missing_items):
-        self.logger.error(
+        self.error(
             "{}".format(
                 "\n".join(
                     ["    '{}': [],".format(field) for field in sorted(missing_items)]
@@ -88,7 +88,7 @@ def _log_missing(self, missing_items):
         )
 
     def validate(self):
-        self.logger.info(
+        self.info(
             "\nValidating {}{}".format(self.__class__.__name__, "(source={})".format(self.source) if self.source else "")
         )
 
@@ -101,7 +101,7 @@ def validate(self):
                 df = pandas.read_csv(self.source, sep=self.delimiter, skiprows=self.row)
 
             if len(df) == 0:
-                self.logger.info(
+                self.info(
                     "\033[1;33m" + "Source file has no data" + "\033[0m"
                 )
                 return False
@@ -115,28 +115,33 @@ def validate(self):
         validator_set = set(self.validators)
         self.missing_validators = self.column_set - validator_set
         if self.missing_validators:
-            self.logger.info("\033[1;33m" + "Missing..." + "\033[0m")
+            self.info("\033[1;33m" + "Missing..." + "\033[0m")
             self._log_missing_validators()
 
             if not self.ignore_missing_validators:
                 return False
 
         self.missing_fields = validator_set - self.column_set
         if self.missing_fields:
-            self.logger.info("\033[1;33m" + "Missing..." + "\033[0m")
+            self.info("\033[1;33m" + "Missing..." + "\033[0m")
             self._log_missing_fields()
             return False
 
+        if self.expected_rows:
+            if not self.expected_rows == len(df.index):
+                self.error("Length issue: Expecting {} row(s), found {}".format(self.expected_rows, len(df.index)))
+                return False
+
         # Might be a way to do it more efficiently..
         df.apply(lambda row: self._validate(row), axis=1)
 
         if self.failures:
-            self.logger.info("\033[0;31m" + "Failed" + "\033[0m")
+            self.info("\033[0;31m" + "Failed" + "\033[0m")
             self._log_debug_failures()
             self._log_validator_failures()
             return False
         else:
-            self.logger.info("\033[0;32m" + "Passed" + "\033[0m")
+            self.info("\033[0;32m" + "Passed" + "\033[0m")
             return True
 
     def _validate(self, row):

diff --git a/checkcel/checkplate.py b/checkcel/checkplate.py
@@ -15,18 +15,36 @@
 
 class Checkplate(object):
     """ Base class for templates """
-    def __init__(self, validators={}, empty_ok=False, ignore_case=False, ignore_space=False, metadata=[]):
+    def __init__(self, validators={}, empty_ok=False, ignore_case=False, ignore_space=False, metadata=[], expected_rows=None):
         self.metadata = metadata
         self.logger = logs.logger
         self.validators = validators or getattr(self, "validators", {})
+        self.logs = []
         # Will be overriden by validators config
         self.empty_ok = empty_ok
         self.ignore_case = ignore_case
         self.ignore_space = ignore_space
+        self.expected_rows = expected_rows
         # self.trim_values = False
         for validator in self.validators.values():
             validator._set_attributes(self.empty_ok, self.ignore_case, self.ignore_space)
 
+    def debug(self, message):
+        self.logger.debug(message)
+        self.logs.append("Debug: {}".format(message))
+
+    def info(self, message):
+        self.logger.info(message)
+        self.logs.append("Info: {}".format(message))
+
+    def warn(self, message):
+        self.logger.warn(message)
+        self.logs.append("Warning: {}".format(message))
+
+    def error(self, message):
+        self.logger.error(message)
+        self.logs.append("Error: {}".format(message))
+
     def load_from_python_file(self, file_path):
         # Limit conflicts in file name
         with tempfile.TemporaryDirectory() as dirpath:
@@ -44,7 +62,7 @@ def load_from_python_file(self, file_path):
                 custom_class = list(filtered_classes.values())[0]
 
         if not custom_class:
-            self.logger.error(
+            self.error(
                 "Could not find a subclass of Checkplate in the provided file."
             )
             return exits.UNAVAILABLE
@@ -53,13 +71,21 @@ def load_from_python_file(self, file_path):
         self.empty_ok = getattr(custom_class, 'empty_ok', False)
         self.ignore_case = getattr(custom_class, 'ignore_case', False)
         self.ignore_space = getattr(custom_class, 'ignore_space', False)
+        self.expected_rows = getattr(custom_class, 'expected_rows', 0)
+        try:
+            self.expected_rows = int(self.expected_rows)
+        except ValueError:
+            self.error(
+                "Malformed Checkcel template: expected_rows is not an integer"
+            )
+
         for key, validator in self.validators.items():
             validator._set_attributes(self.empty_ok, self.ignore_case, self.ignore_space)
         return self
 
     def load_from_json_file(self, file_path):
         if not os.path.isfile(file_path):
-            self.logger.error(
+            self.error(
                 "Could not find a file at path {}".format(file_path)
             )
             return exits.NOINPUT
@@ -71,7 +97,7 @@ def load_from_json_file(self, file_path):
 
     def load_from_yaml_file(self, file_path):
         if not os.path.isfile(file_path):
-            self.logger.error(
+            self.error(
                 "Could not find a file at path {}".format(file_path)
             )
             return exits.NOINPUT
@@ -80,7 +106,7 @@ def load_from_yaml_file(self, file_path):
             try:
                 data = yaml.safe_load(f)
             except yaml.YAMLError:
-                self.logger.error(
+                self.error(
                     "File {} is not a valid yaml file".format(file_path)
                 )
                 return exits.UNAVAILABLE
@@ -104,21 +130,29 @@ def _is_valid_template(self, tup):
 
     def _load_from_dict(self, data):
         if 'validators' not in data or not isinstance(data['validators'], list):
-            self.logger.error(
+            self.error(
                 "Could not find a list of validators in data"
             )
             return exits.UNAVAILABLE
 
         self.empty_ok = data.get("empty_ok", False)
         self.ignore_case = data.get('ignore_case', False)
         self.ignore_space = data.get('ignore_space', False)
+        self.expected_rows = data.get('expected_rows', 0)
+        try:
+            self.expected_rows = int(self.expected_rows)
+        except ValueError:
+            self.error(
+                "Malformed Checkcel template: expected_rows is not an integer"
+            )
+
         validators_list = []
         self.validators = {}
         self.metadata = data.get('metadata', [])
 
         for validator in data['validators']:
             if 'type' not in validator or 'name' not in validator:
-                self.logger.error(
+                self.error(
                     "Malformed Checkcel Validator. Require both 'type' and 'name' key"
                 )
                 return exits.UNAVAILABLE
@@ -129,7 +163,7 @@ def _load_from_dict(self, data):
                 val = validator_class(**options)
                 val._set_attributes(self.empty_ok, self.ignore_case, self.ignore_space)
             except AttributeError:
-                self.logger.error(
+                self.error(
                     "{} is not a valid Checkcel Validator".format(validator['type'])
                 )
                 return exits.UNAVAILABLE

diff --git a/checkcel/checkxtractor.py b/checkcel/checkxtractor.py
@@ -1,4 +1,3 @@
-from checkcel import logs
 from openpyxl import load_workbook
 from openpyxl.worksheet.cell_range import CellRange
 from openpyxl.utils import get_column_letter
@@ -10,7 +9,6 @@
 class Checkxtractor(object):
     """ Extract validation value from xlsx file (only) """
     def __init__(self, source, output, sheet=0, row=0, template_type="python"):
-        self.logger = logs.logger
         self.source = source
         self.output = output
         self.sheet = int(sheet)