From 6f199101aa58e7b263c7b75f1ce75eb8261e455a Mon Sep 17 00:00:00 2001 From: yulric Date: Fri, 19 May 2023 08:51:35 -0400 Subject: [PATCH 1/3] Fixed broken links in the CONTRIBUTING.md file --- CONTRIBUTING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index e188667e..49bac9e4 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -4,7 +4,7 @@ The PHES-ODM validation tool kit is an open source and community-driven. You can ## Adding a new rule -New validation rules can be requested by anyone ODM user. Instructions on how to add a new rule is found in [/docs/validation-rules/README.md](/docs/validation-rules/README.md). The validation rules README.md is a good source of additional information about how rules work. +New validation rules can be requested by anyone ODM user. Instructions on how to add a new rule can be found in the [documentation](/rules.html#adding-a-new-rule). The validation rules [README.md](/rules.html) is a good source of additional information about how rules work. ## Code style From 7cf5c8a6af178fe29e045152d07216f6c6dd4816 Mon Sep 17 00:00:00 2001 From: yulric Date: Fri, 19 May 2023 08:53:26 -0400 Subject: [PATCH 2/3] Formatting --- CONTRIBUTING.md | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 49bac9e4..a629aa87 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,10 +1,17 @@ # Contributing -The PHES-ODM validation tool kit is an open source and community-driven. You can make suggestions for new validation rules or comment on existing rules on the PHES-ODM [discussion board](https://odm.discourse.group) or GitHub [Issues](https://github.com/Big-Life-Lab/PHES-ODM-Validation/issues). +The PHES-ODM validation tool kit is an open source and community-driven. 
You can +make suggestions for new validation rules or comment on existing rules on the +PHES-ODM [discussion board](https://odm.discourse.group) or GitHub +[Issues](https://github.com/Big-Life-Lab/PHES-ODM-Validation/issues). ## Adding a new rule -New validation rules can be requested by anyone ODM user. Instructions on how to add a new rule can be found in the [documentation](/rules.html#adding-a-new-rule). The validation rules [README.md](/rules.html) is a good source of additional information about how rules work. +New validation rules can be requested by any ODM user. Instructions on how to +add a new rule can be found in the +[documentation](/rules.html#adding-a-new-rule). The validation rules +[README.md](/rules.html) is a good source of additional information about +how rules work. ## Code style From b3098bced065c0bfaf1562d78b41b40e62a90d1b Mon Sep 17 00:00:00 2001 From: Yulric Sequeira Date: Thu, 8 Dec 2022 10:36:35 -0500 Subject: [PATCH 3/3] Added new invalid_email rule spec * docs/validation-rules/invalid_email.qmd: Documentation for the new rule. * docs/validation-rules/invalid_type.qmd: Removed all references to validating email as a type since the new rule handles that. * assets/validation-rules/validation-rules-list.csv: Added the metadata for this rule. * assets/validation-rules/invalid-email/version-1-email-columns.yaml: Hardcoded list of email columns for version 1 of the ODM. The implementation should use this. * assets/validation-rules/invalid-email/version-2-email-columns.yaml: Hardcoded list of email columns for version 2 of the ODM. The implementation should use this. * assets/**/*: Testing files used by the documentation. 
--- .../invalid-email/error-report-1.json | 18 ++ .../invalid-email/invalid-dataset-1.csv | 2 + .../validation-rules/invalid-email/parts.csv | 5 + .../invalid-email/schema-v1.yaml | 22 +++ .../invalid-email/schema-v2.yaml | 17 ++ .../invalid-email/valid-dataset-1.csv | 2 + .../version-1-email-columns.yaml | 2 + .../version-2-email-columns.yaml | 2 + .../validation-rules-list.csv | 1 + docs/validation-rules/invalid_email.qmd | 91 +++++++++ docs/validation-rules/invalid_type.qmd | 182 ------------------ 11 files changed, 162 insertions(+), 182 deletions(-) create mode 100644 assets/validation-rules/invalid-email/error-report-1.json create mode 100644 assets/validation-rules/invalid-email/invalid-dataset-1.csv create mode 100644 assets/validation-rules/invalid-email/parts.csv create mode 100644 assets/validation-rules/invalid-email/schema-v1.yaml create mode 100644 assets/validation-rules/invalid-email/schema-v2.yaml create mode 100644 assets/validation-rules/invalid-email/valid-dataset-1.csv create mode 100644 assets/validation-rules/invalid-email/version-1-email-columns.yaml create mode 100644 assets/validation-rules/invalid-email/version-2-email-columns.yaml create mode 100644 docs/validation-rules/invalid_email.qmd diff --git a/assets/validation-rules/invalid-email/error-report-1.json b/assets/validation-rules/invalid-email/error-report-1.json new file mode 100644 index 00000000..13cace15 --- /dev/null +++ b/assets/validation-rules/invalid-email/error-report-1.json @@ -0,0 +1,18 @@ +{ + "errors": [ + { + "errorType": "invalid_email", + "tableName": "contacts", + "columnName": "email", + "rowNumber": 1, + "row": { + "contactID": "1", + "email": "john.doe" + }, + "invalidValue": "john.doe", + "validationRuleFields": [], + "message": "Invalid email john.doe found in row 1 for column email in table contacts" + } + ], + "warnings": [] +} \ No newline at end of file diff --git a/assets/validation-rules/invalid-email/invalid-dataset-1.csv 
b/assets/validation-rules/invalid-email/invalid-dataset-1.csv new file mode 100644 index 00000000..99b12dca --- /dev/null +++ b/assets/validation-rules/invalid-email/invalid-dataset-1.csv @@ -0,0 +1,2 @@ +contactID,email +1,john.doe \ No newline at end of file diff --git a/assets/validation-rules/invalid-email/parts.csv b/assets/validation-rules/invalid-email/parts.csv new file mode 100644 index 00000000..0d3fbe69 --- /dev/null +++ b/assets/validation-rules/invalid-email/parts.csv @@ -0,0 +1,5 @@ +partID,partType,sites,contacts,version1Location,version1Table,version1Variable +sites,tables,NA,NA,tables,Site,NA +geoLat,attributes,header,NA,variables,Site,Latitude +contacts,tables,NA,NA,tables,Contact,NA +email,attributes,NA,header,variables,Contact,contactEmail diff --git a/assets/validation-rules/invalid-email/schema-v1.yaml b/assets/validation-rules/invalid-email/schema-v1.yaml new file mode 100644 index 00000000..10645b67 --- /dev/null +++ b/assets/validation-rules/invalid-email/schema-v1.yaml @@ -0,0 +1,22 @@ +schemaVersion: '1.0.0' +schema: + Contact: + type: list + schema: + type: dict + contactEmail: + is_email: true + meta: + - ruleID: invalid_email + meta: + - partID: email + partType: attributes + contacts: header + version1Location: variables + version1Table: Contact + version1Variable: contactEmail + meta: + - partID: contacts + partType: tables + version1Location: tables + version1Table: Contact \ No newline at end of file diff --git a/assets/validation-rules/invalid-email/schema-v2.yaml b/assets/validation-rules/invalid-email/schema-v2.yaml new file mode 100644 index 00000000..61b85cf9 --- /dev/null +++ b/assets/validation-rules/invalid-email/schema-v2.yaml @@ -0,0 +1,17 @@ +schemaVersion: '2.0.0' +schema: + contacts: + type: list + schema: + type: dict + email: + is_email: true + meta: + - ruleID: invalid_email + meta: + - partID: email + partType: attributes + contacts: header + meta: + - partID: contacts + partType: tables \ No newline at end of file diff --git 
a/assets/validation-rules/invalid-email/valid-dataset-1.csv b/assets/validation-rules/invalid-email/valid-dataset-1.csv new file mode 100644 index 00000000..04144a35 --- /dev/null +++ b/assets/validation-rules/invalid-email/valid-dataset-1.csv @@ -0,0 +1,2 @@ +contactID,email +1,john.doe@email.com \ No newline at end of file diff --git a/assets/validation-rules/invalid-email/version-1-email-columns.yaml b/assets/validation-rules/invalid-email/version-1-email-columns.yaml new file mode 100644 index 00000000..97323357 --- /dev/null +++ b/assets/validation-rules/invalid-email/version-1-email-columns.yaml @@ -0,0 +1,2 @@ +- partID: contactEmail + table: Lab diff --git a/assets/validation-rules/invalid-email/version-2-email-columns.yaml b/assets/validation-rules/invalid-email/version-2-email-columns.yaml new file mode 100644 index 00000000..19705436 --- /dev/null +++ b/assets/validation-rules/invalid-email/version-2-email-columns.yaml @@ -0,0 +1,2 @@ +- partID: email + table: contacts diff --git a/assets/validation-rules/validation-rules-list.csv b/assets/validation-rules/validation-rules-list.csv index b0f2985a..7cedc812 100644 --- a/assets/validation-rules/validation-rules-list.csv +++ b/assets/validation-rules/validation-rules-list.csv @@ -11,3 +11,4 @@ less_than_min_length,Validates the minimum length of a string type,The minLength greater_than_max_length,Validates the maximum length of a string type,The maxLength column in the ODM dictionary documents the minimum that a part should have. 
This validation implements it.,error,Value in row in column in table has length which is greater than the max length of ,active,v1.0.0,,all,, invalid_type,Validates type of a value,Uses the dataType column to check if a value is the correct type or can be coerced into the correct type,error,Value in row in column in table has type but should be of type or coercable into a .,,,,,, invalid_type,Validates type of a value,Uses the dataType column to check if a value is the correct type or can be coerced into the correct type,error,Row in column in table is a boolean but has value . Allowed values are ,,,,,, +invalid_email,Validates an email column,The dictionary does not contain any metadata to describe if a column is an email or not. This rule hardcodes the email column within it.,error,Invalid email found in row for column in table ,active,v1.0.0,,all,, diff --git a/docs/validation-rules/invalid_email.qmd b/docs/validation-rules/invalid_email.qmd new file mode 100644 index 00000000..8954ee3d --- /dev/null +++ b/docs/validation-rules/invalid_email.qmd @@ -0,0 +1,91 @@ +# invalid_email + +{{< include _setup.qmd >}} + +```{python} +#| echo: false +ASSET_DIR = get_rule_asset_dir('invalid_email') +``` + +This rule validates that varchar columns that represent an email have a valid email address. For example, consider the `email` column in the `contacts` table. 
The following dataset snippet should fail validation, + +```{python} +pprint_csv_file(asset("invalid-dataset-1.csv"), "Invalid dataset") +``` + +whereas the following should pass, + +```{python} +pprint_csv_file(asset("valid-dataset-1.csv"), "Valid dataset") +``` + +## Error report + +The error report will have the following fields + +* **errorType**: invalid_email +* **tableName**: The name of the table whose row has the invalid email +* **columnName**: The name of the column with the invalid email +* **rowNumber**: The index of the table row with the error +* **row**: The row in the data that failed this validation rule +* **invalidValue**: The invalid email value +* **validationRuleFields**: The ODM data dictionary rule fields violated by this row +* **message**: Invalid email <invalidValue> found in row <rowNumber> for column <columnName> in table <tableName> + +An example error report for the invalid dataset above is shown below, + +```{python} +pprint_json_file(asset("error-report-1.json")) +``` + +## Rule metadata + +The dictionary currently does not have any metadata to say if a column is an email or not. Instead we will be hardcoding this rule to a set of pre-determined email columns. For version 2 the email columns are: + +```{python} +pprint_yaml_file(asset("version-2-email-columns.yaml")) +``` + +In the above file, + +* The `partID` field contains the name of the email column and +* The `table` field contains the name of the table that the part is a column in. + +If a parts sheet contains any of the above mentioned columns, then this validation rule should be added to them. For example, in the following parts sheet snippet this rule should be added to the `email` column but not to `geoLat`. + +```{python} +pprint_csv_file(asset("parts.csv"), title = "Parts v2", ignore_prefix = "version1") +``` + +## Cerberus schema + +We will be adding a custom rule called `is_email` to each email column. Alternative approaches and reasons for not using them are: + +1. 
`type` rule: We would prefer to keep the value of this rule the same as the `dataType` column in the ODM +2. `regex` rule: Better than type but is less clear to a user of the schema what the regex is actually trying to validate. + +Underneath the hood the `is_email` rule will be using a regex to validate the column value. An example of the regex can be seen in this [stack overflow thread](https://stackoverflow.com/a/201378/1950599). For the parts snippet above the following schema should be generated, + +```{python} +pprint_yaml_file(asset("schema-v2.yaml")) +``` + +## Version 1 + +For version 1 schemas, we add this rule to the version 1 equivalents of the above mentioned version 2 email columns. In addition, this rule should also be added to the following version 1 only columns: + +```{python} +pprint_yaml_file(asset("version-1-email-columns.yaml")) +``` + +For example, for the following version 1 parts snippet, + +```{python} +pprint_csv_file(asset("parts.csv"), title = "Parts v1") +``` + +the following validation schema should be generated, + +```{python} +pprint_yaml_file(asset("schema-v1.yaml")) +``` \ No newline at end of file diff --git a/docs/validation-rules/invalid_type.qmd b/docs/validation-rules/invalid_type.qmd index 02423f43..05eed6dc 100644 --- a/docs/validation-rules/invalid_type.qmd +++ b/docs/validation-rules/invalid_type.qmd @@ -79,36 +79,6 @@ following snippet below should pass validation, pprint_csv_file(asset("float-valid-dataset-2.csv"), "Valid float dataset") ``` -## email - -An email address. As an example, imagine a part with id `contactEmail` which is -a header in a `labs` table and has its defined type to be `email`. 
The -following ODM snippet would fail validation, - -```python -{ - "labs": [ - { - "labID": "1", - "email": "john.doe" - } - ] -} -``` - -whereas the following should pass validation - -```python -{ - "labs": [ - { - "labID": "1", - "email": "john.doe@email.com" - } - ] -} -``` - ## boolean A column that can have one of two values, representing Yes/No. The category @@ -232,34 +202,6 @@ float pprint_json_file(asset("float-error-report-1.json")) ``` -email - -```python -[ - { - "errorType": "invalid_type", - "tableName": "labs", - "columnName": "contactEmail", - "rowNumber": 1, - "row": { - "labID": "1", - "contactEmail": "john.doe" - }, - "invalidValue": "john.doe", - "validationRuleFields": [ - { - "partID": "contactEmail", - "dataType": "email", - "labs": "header" - } - ], - "message": "Value john.doe in row 1 in column contactEmail in table - labs has type varchar but should be of type email or - coercable into an email" - } -] -``` - boolean 1 ```{python} @@ -314,27 +256,6 @@ pprint_csv_file(asset("float-parts.csv"), "Float parts v2", ignore_prefix="version1") ``` -For the email `contactEmail` column in the labs table - -```python -{ - "parts": [ - { - "partID": "labs", - "partType": "tables", - "labs": "NA", - "dataType": "NA" - }, - { - "partID": "contactEmail", - "partType": "attribute", - "labs": "header", - "dataType": "email" - } - ] -} -``` - For the boolean `reportable` column in the `measures` table, the part sheet is below, @@ -368,9 +289,6 @@ by using the [coerce](https://docs.python-cerberus.org/en/stable/normalization-rules.html#value-coercion) field. -For an `email` type we will need to create a [custom type](https://docs.python-cerberus.org/en/stable/customize.html#new-types) -in cerberus. - For a `boolean` type we will need to use the [allowed](https://docs.python-cerberus.org/en/stable/validation-rules.html#allowed) rule in cerberus. 
@@ -393,40 +311,6 @@ float pprint_yaml_file(asset("float-schema-v2.yml")) ``` -email - -```python -{ - "labs": { - "type": "list", - "schema": { - "type": "dict", - "schema": { - "contactEmail": { - "type": "email" - "meta": [ - { - "ruleId": "invalid_type", - "meta": [ - { - "partID": "contactEmail", - "dataType": "email", - "labs": "header" - } - ] - } - ] - } - } - }, - "meta": { - "partID": "labs", - "partType": "table" - } - } -} -``` - boolean ```{python} @@ -470,33 +354,6 @@ pprint_csv_file(asset("float-parts.csv"), "Float parts v1", ignore_prefix="version1") ``` -email - -```python -{ - "parts": [ - { - "partID": "labs", - "partType": "tables", - "labs": "NA", - "dataType": "NA", - "version1Location": "tables", - "version1Table": "Lab", - "version1Variable": "NA" - }, - { - "partID": "contactEmail", - "partType": "attribute", - "labs": "header", - "dataType": "email", - "version1Location": "variables", - "version1Table": "Lab", - "version1Variable": "email" - } - ] -} -``` - boolean ```{python} @@ -528,45 +385,6 @@ float pprint_yaml_file(asset("float-schema-v1.yml")) ``` -email - -```python -{ - "Lab": { - "type": "list", - "schema": { - "type": "dict", - "schema": { - "email": { - "type": "email" - "meta": [ - { - "ruleId": "invalid_type", - "meta": [ - { - "partID": "contactEmail", - "dataType": "email", - "labs": "header", - "version1Location": "variables", - "version1Table": "Lab", - "version1Variable": "email" - } - ] - } - ] - } - } - }, - "meta": { - "partID": "labs", - "partType": "table", - "version1Location": "tables", - "version1Table": "Lab" - } - } -} -``` - boolean ```{python}