Config reader with expected schema validation #7

Closed · wants to merge 19 commits

Changes from 1 commit:
Config reader with expected schema validation
zsusswein committed Aug 8, 2024

Verified: This commit was created on GitHub.com and signed with GitHub's verified signature.
commit a7602bb23250462300e3c7969458661aaccfcc67
1 change: 0 additions & 1 deletion .pre-commit-config.yaml
@@ -7,7 +7,6 @@ repos:
hooks:
- id: style-files
args: [--style_pkg=styler, --style_fun=tidyverse_style]
- id: roxygenize
- id: use-tidy-description
- id: lintr
- id: readme-rmd-rendered
7 changes: 6 additions & 1 deletion DESCRIPTION
@@ -14,8 +14,13 @@ Description: Add logging, metadata handling, and data handling
License: Apache License (>= 2)
Encoding: UTF-8
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.3.1
RoxygenNote: 7.3.2
Suggests:
testthat (>= 3.0.0)
Config/testthat/edition: 3
URL: https://cdcgov.github.io/cfa-epinow2-pipeline/
Imports:
cli,
jsonlite,
jsonvalidate,
rlang
2 changes: 2 additions & 0 deletions NAMESPACE
@@ -1,3 +1,5 @@
# Generated by roxygen2: do not edit by hand

export(add_two_numbers)
export(fetch_config)
export(validate_config)
1 change: 1 addition & 0 deletions NEWS.md
@@ -1,4 +1,5 @@
# CFAEpiNow2Pipeline (development version)

* Config reader with schema validation and enforcement
* CI running on Ubuntu only & working pkgdown deploy to Github Pages
* Initial R package with checks running in CI
100 changes: 100 additions & 0 deletions R/config.R
@@ -0,0 +1,100 @@
#' Fetch the config from an external resource
#'
#' This step is the first part of the modeling pipeline. It looks to Azure Blob
#' and downloads the Rt model run's config to the local config (if
#' `blob_storage_container` is specified), reads the config in from the
#' filesystem, and validates that it matches expectations. If any of these steps
#' fails, the pipeline fails with an informative error message. Note, however,
#' that a failure in this initial step suggests that something fundamental is
[Review comment, Collaborator] "fundamental" is quite vague

#' misspecified and the logs will likely not be preserved in a Blob Container if
#' running in Azure.
#'
#' The validation relies on `inst/data/config_schema.json` for validation. This
#' file is in `json-schema` notation and generated programmatically via
#' https://www.jsonschema.net/.
#'

[Review comment] General comment: I think these are all character strings, but I find it helpful when reading documentation for the type to be explicitly specified.

#' @param config_path The path to the config file, either in the local
#' filesystem or with an Azure Blob Storage container. If
#' `blob_storage_container` is specified, the path is assumed to be within
#' the specified container; otherwise it is assumed to be in the local
#' filesystem.
#' @param local_dest The local directory to write the config to when downloading

[Review comment] What happens if local_dest doesn't exist?

#' it from `blob_storage_container`. This argument is ignored unless
#' `blob_storage_container` is specified.
#' @param blob_storage_container The storage container holding the config at
#' `config_path`
#' @param config_schema_path The path to the file holding the schema for the
#' config json for the validator to use.
#'
#' @return A list of lists, the config for the run.
#' @export
fetch_config <- function(
config_path,
local_dest,
blob_storage_container,
config_schema_path = system.file("extdata/config_schema.json",
package = "CFAEpiNow2Pipeline"
)) {
if (!rlang::is_null(blob_storage_container)) {
download_from_azure_blob(
config_path,
local_dest,
container_name = blob_storage_container
)
} else {
cli::cli_alert(
"No blob storage container provided. Reading from local path."
)
}

cli::cli_alert_info("Loading config from {.path {config_path}}")
validate_config(config_path, config_schema_path)

config <- rlang::try_fetch(
jsonlite::read_json(config_path),
error = function(con) {
cli::cli_abort(
"Error loading config from {.path {config_path}}",
parent = con,
class = "CFA_Rt"
)
}
)

return(config)
}

#' Compare loaded json against expectation in `inst/data/config-schema.json`
#'
#' @inheritParams fetch_config
#' @return NULL, invisibly
#' @export
validate_config <- function(
config_path,
config_schema_path = system.file("extdata/config_schema.json",
package = "CFAEpiNow2Pipeline"
)) {
is_config_valid <- rlang::try_fetch(
jsonvalidate::json_validate(
json = config_path,
schema = config_schema_path,
engine = "ajv",
verbose = TRUE,
greedy = TRUE,
error = TRUE
),
error = function(con) {
cli::cli_abort(
c(
"Error while validating config",
"!" = "Config path: {.path {config_path}}",
"!" = "Schema path: {.path {config_schema_path}}"
),
parent = con,
class = "CFA_Rt"
)
}
)

invisible(is_config_valid)
}
191 changes: 191 additions & 0 deletions inst/extdata/config_schema.json
@@ -0,0 +1,191 @@
{
"$schema": "http://json-schema.org/draft-06/schema#",
"$ref": "#/definitions/Epinow2",
"definitions": {
"Epinow2": {
"type": "object",
"additionalProperties": false,
"properties": {
"job_id": {
[Review comment, natemcintosh (Collaborator), Aug 8, 2024] My only slight hesitation with UUIDs for the job IDs is that if we run multiple jobs, it would be a bit harder to know which job is which. That said, it would mean we never run into the annoying error "This job already exists" because we forgot to delete it. What about, e.g., Rt-estimation-2024-08-08T10:08:34 as the job name?
[Review comment] This assumes that you will pass in a UUID that is generated somewhere else, right? Probably not for this PR, but I would add more metadata. For example, if this job ID is under the "EpiNow2" umbrella, I would want something that names the job based on the name of the package being used.
[Review comment, Collaborator] I like @natemcintosh's suggestion as long as we (1) store the date timestamp inside the metadata, not just in the path name, and (2) there are no special-character concerns using this as a path name 😬
[Review comment, Collaborator] I was testing out this naming scheme idea on something else, and discovered that Azure was not happy with ":", so I replaced it with "-". So this might be something more like Rt-estimation-2024-08-08T10-08-34
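The naming scheme this thread converges on is straightforward to generate. A sketch (the function name is hypothetical; the `Rt-estimation` prefix and the `-` in place of `:` for Azure compatibility come from the thread):

```python
from datetime import datetime, timezone


def job_name(prefix="Rt-estimation", now=None):
    """Build a readable job name like 'Rt-estimation-2024-08-08T10-08-34'.

    Azure rejects ':' in these names (per the thread), so the time
    portion uses '-' separators instead of the ISO-8601 default.
    """
    now = now or datetime.now(timezone.utc)
    return f"{prefix}-{now.strftime('%Y-%m-%dT%H-%M-%S')}"
```

Per the reviewers' point (1), the exact timestamp (and a UUID, if kept) should still live in the run metadata, not only in the name.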

"type": "string",
"format": "uuid"
},
"task_id": {
"type": "string",
"format": "uuid"
},
"as_of_date": {
[Review comment, Collaborator] See comment below

"type": "string",
"format": "date"
},
"disease": {
"type": "string"
},
"geo_value": {
"type": "array",
"items": {
"type": "string"
}
},
"geo_type": {
"type": "string"
},
"parameters": {
"$ref": "#/definitions/Parameters"
},
"data": {
"$ref": "#/definitions/Data"
},
"seed": {
"type": "integer"
},
"horizon": {
"type": "integer"
},
"priors": {
"$ref": "#/definitions/Priors"
},

[Review comment] Where do passed-in pmfs go in this workflow? Are they part of parameters?

"sampler_opts": {
"$ref": "#/definitions/SamplerOpts"
}
},
"required": [
"as_of_date",
[Review comment, Collaborator] Suggested change: in addition to "as_of_date", also require "timeseries_end_date" and "timeseries_length_weeks". These are needed so that we can do things like:
  • Change the number of weeks in the sliding window
  • Kick off retrospective runs (e.g. where the as_of_date is much later than the timeseries_end_date)
[Review comment, Collaborator] What is the logical relationship between as_of_date here and report_date below inside the data block? Do we need to enforce / validate this relationship somewhere?

"data",
[Review comment, Collaborator] I'm sure this is explained later, but what does "data" mean?

"disease",
"geo_type",
"geo_value",
"horizon",
"job_id",
"parameters",
"priors",
"sampler_opts",
"seed",
"task_id"
],
"title": "Epinow2"

[Review comment] Just want to make sure I understand: currently this example is for "Epinow2", but you want to be able to swap this for another package name, right?

},
"Data": {
"type": "object",
"additionalProperties": false,
"properties": {
"path": {
"type": "string"
},
"blob_storage_container": {
"type": ["null", "string"]
},
"report_date": {
"type": "array",
"items": {
"type": "string",
"format": "date"
}
},
"reference_date": {
[Review comment, natemcintosh (Collaborator), Aug 8, 2024] Each report date runs on all the reference dates?
[Review comment, Collaborator (Author)] Hmmmmm -- yeah this is a good flag. That's a bad assumption. Let me revisit.
[Review comment] Is this intended to be, at least for the EpiNow2 example, the vector of dates corresponding to the time series data passed in? E.g. the date of admissions.
[Review comment] And for EpiNow2, you would just have a single report date? So for example, if I ran EpiNow2 today (assuming old NHSN data reporting), I'd have:
  as_of_date = "2024-08-08",
  report_date = "2024-08-07",
  reference_date = a vector of dates going back some specified calibration period up until "2024-08-02" (last Friday)

"type": "array",
"items": {
"type": "string",
"format": "date"
}
}
},
"required": [
"blob_storage_container",
"path",
"reference_date",
"report_date"
],
"title": "Data"
},
"Parameters": {
[Review comment, Collaborator] What if we want to point to more than one container to pull in parameters?

"type": "object",
"additionalProperties": false,
"properties": {
"path": {
"type": "string"
},
"blob_storage_container": {
"type": ["string", "null"]
}
},
"required": [
"blob_storage_container",
"path"
],
"title": "Parameters"
},
"Priors": {

[Review comment] Would you always need to pass in these arguments, or is this specific to Epinow2? I think there might be other packages where you would handle specifying priors differently (e.g. in the ww package we're developing, priors are lumped in with parameters...)
[Review comment, Collaborator] My assumption is that you'd have to edit this in another version of the repo to enforce that schema, but I think the framework is adaptable!
[Review comment, Collaborator] Given that something like fetch_config is not EpiNow2-specific, there is some argument it shouldn't be in this package. It seems like if we are to have another package, cfa-newpackage-pipeline, then it'll also need this function. So the schema is EpiNow2-specific, but the surrounding functions are not.

"type": "object",
"additionalProperties": false,
"properties": {
"rt": {
"$ref": "#/definitions/Rt"
},
"gp": {
"$ref": "#/definitions/Gp"
}
},
"required": [
"gp",
"rt"
],
"title": "Priors"
},
"Gp": {
"type": "object",
"additionalProperties": false,
"properties": {
"alpha_sd": {
"type": "number"
}
},
"required": [
"alpha_sd"
],
"title": "Gp"
},
[Review comment, Collaborator] I think we want to add specification of the ls mean via gp_opts(): https://epiforecasts.io/EpiNow2/reference/gp_opts.html

"Rt": {
"type": "object",
"additionalProperties": false,
"properties": {
"mean": {
"type": "integer"
[Review comment, Collaborator] Why integer instead of number?

},
"sd": {
"type": "number"
}
},
"required": [
"mean",
"sd"
],
"title": "Rt"
},
"SamplerOpts": {
"type": "object",
"additionalProperties": false,
"properties": {
[Review comment, Collaborator] Can we add the number of samples run here as a parameter? Seems like something we could want to change at runtime, and I think it's helpful to encode it explicitly instead of using the defaults.

"cores": {
"type": "integer"
},
"chains": {
"type": "integer"
},
"adapt_delta": {
"type": "number"
},
"max_treedepth": {
"type": "integer"
}
},
"required": [
"adapt_delta",
"chains",
"cores",
"max_treedepth"
],
"title": "SamplerOpts"
}
}
}
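A quick way to sanity-check a candidate config against the shape this schema expects is to compare its keys to the top-level `required` list (copied from the `Epinow2` definition above); the helper below is a hypothetical illustration, not part of the PR:

```python
# Top-level keys the Epinow2 definition marks as required.
REQUIRED_TOP_LEVEL = [
    "as_of_date", "data", "disease", "geo_type", "geo_value", "horizon",
    "job_id", "parameters", "priors", "sampler_opts", "seed", "task_id",
]


def missing_keys(config):
    """Return the required top-level keys absent from a parsed config dict."""
    return [k for k in REQUIRED_TOP_LEVEL if k not in config]
```

Note this flags structural omissions only; the full validator also enforces types, formats (uuid, date), and `additionalProperties: false`.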
49 changes: 49 additions & 0 deletions man/fetch_config.Rd


28 changes: 28 additions & 0 deletions man/validate_config.Rd
32 changes: 32 additions & 0 deletions tests/testthat/data/bad_sample_config.json
@@ -0,0 +1,32 @@

{
"as_of_date": "2023-01-01",
"geo_value": "test",
"geo_type": "test",
"report_date": [
"01-01"
],
"reference_date": [
"2023-01-01",
"2022-12-30",
"2022-12-29"
],
"seed": "abc",
"horizon": 14,
"priors": {
"rt": {
"mean": 1.0,
"sd": 0.2
},
"gp": {
"alpha_sd": 0.01
}
},
"sampler_opts": {
"cores": 4,
"chains": 4,
"adapt_delta": 0.99,
"max_treedepth": 12,
"not_a_parameter": -12
}
}
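This fixture is invalid in several independent ways: `report_date` contains `"01-01"` (not a full `YYYY-MM-DD` date), `seed` is a string where the schema requires an integer, `sampler_opts` carries `not_a_parameter` even though `additionalProperties` is `false`, and required keys such as `job_id`, `task_id`, `disease`, `parameters`, and `data` are missing entirely. A few of these checks, hand-rolled with the standard library (illustrative only; the PR relies on the ajv engine via `jsonvalidate`):

```python
import re

DATE = re.compile(r"^\d{4}-\d{2}-\d{2}$")
ALLOWED_SAMPLER_KEYS = {"cores", "chains", "adapt_delta", "max_treedepth"}


def spot_violations(config):
    """Collect a few of the schema violations in a parsed config dict."""
    problems = []
    if not all(DATE.match(d) for d in config.get("report_date", [])):
        problems.append("report_date entries must be YYYY-MM-DD")
    if not isinstance(config.get("seed"), int):
        problems.append("seed must be an integer")
    extra = set(config.get("sampler_opts", {})) - ALLOWED_SAMPLER_KEYS
    if extra:  # additionalProperties: false forbids unknown keys
        problems.append(f"unexpected sampler_opts keys: {sorted(extra)}")
    return problems
```

Running this against the fixture above would report all three problems, which is why the test suite expects validation of this file to error.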
41 changes: 41 additions & 0 deletions tests/testthat/data/sample_config.json
@@ -0,0 +1,41 @@
{
"job_id": "6183da58-89bc-455f-8562-4f607257a876",
"task_id": "bc0c3eb3-7158-4631-a2a9-86b97357f97e",
"as_of_date": "2023-01-01",
"disease": "test",
"geo_value": ["test"],
"geo_type": "test",
"parameters": {
"path": "data/parameters.parquet",
"blob_storage_container": null
},
"data": {
"path": "gold/",

[Review comment] As written, how would you point to multiple data sources, when it seems that data just has one path option?

"blob_storage_container": null,
"report_date": [
"2023-01-01"
],
"reference_date": [
"2023-01-01",
"2022-12-30",
"2022-12-29"
]
},
"seed": 42,
"horizon": 14,
"priors": {
"rt": {
"mean": 1.0,
"sd": 0.2
},
"gp": {
"alpha_sd": 0.01
}
},
"sampler_opts": {
"cores": 4,
"chains": 4,
"adapt_delta": 0.99,
"max_treedepth": 12
}
}
44 changes: 44 additions & 0 deletions tests/testthat/test-fetch_config.R
@@ -0,0 +1,44 @@
test_that("Test config loads", {
config_path <- test_path("data/sample_config.json")

expected <- jsonlite::read_json(config_path)
actual <- fetch_config(
config_path = config_path,
local_dest = NULL,
blob_storage_container = NULL
)

expect_equal(actual, expected)
})

test_that("Bad config errors", {
[Review comment, Collaborator] Use a more descriptive test name?

config_path <- test_path("data/bad_sample_config.json")

expect_error(
{
fetch_config(
config_path = config_path,
local_dest = NULL,
blob_storage_container = NULL
)
},
class = "CFA_Rt"
)
})

test_that("Test config validates", {
[Review comment, Collaborator] Same: use a more descriptive test name?

config_path <- test_path("data/sample_config.json")

expect_true(
validate_config(
config_path
)
)
})


test_that("Bad config errors", {
config_path <- test_path("data/bad_sample_config.json")

expect_error(validate_config(config_path))
})