diff --git a/R/calculateNotApplicableStatus.R b/R/calculateNotApplicableStatus.R new file mode 100644 index 00000000..eeafb203 --- /dev/null +++ b/R/calculateNotApplicableStatus.R @@ -0,0 +1,191 @@ +# Copyright 2023 Observational Health Data Sciences and Informatics +# +# This file is part of DataQualityDashboard +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#' Determines if all checks are present expected to calculate the 'Not Applicable' status +#' +#' @param checkResults A dataframe containing the results of the data quality checks +#' +#' @keywords internal +.hasNAchecks <- function(checkResults) { + checkNames <- unique(checkResults$checkName) + return(.containsNAchecks(checkNames)) +} + +#' Determines if all checks required for 'Not Applicable' status are in the checkNames +#' +#' @param checkNames A character vector of check names +#' +#' @keywords internal +.containsNAchecks <- function(checkNames) { + naCheckNames <- c("cdmTable", "cdmField", "measureValueCompleteness") + missingNAChecks <- !(naCheckNames %in% checkNames) + if (any(missingNAChecks)) { + return(FALSE) + } + return(TRUE) +} + +#' Applies the 'Not Applicable' status to a single check +#' +#' @param x Results from a single check +#' +#' @keywords internal +.applyNotApplicable <- function(x) { + # Errors precede all other statuses + if (x$isError == 1) { + return(0) + } + + # No NA status for cdmTable and cdmField if missing + if (x$checkName == "cdmTable" || x$checkName == "cdmField") { + return(0) + } + + if (any(x$tableIsMissing, x$fieldIsMissing, x$tableIsEmpty, na.rm = TRUE)) { + return(1) + } + + # No NA status for measureValueCompleteness if empty + if (x$checkName == "measureValueCompleteness") { + return(0) + } + + if (any(x$fieldIsEmpty, x$conceptIsMissing, x$conceptAndUnitAreMissing, na.rm = TRUE)) { + return(1) + } + + return(0) +} + +#' Determines if check should be notApplicable and the notApplicableReason +#' +#' @param checkResults A dataframe containing the results of the data quality checks +#' +#' @keywords internal +.calculateNotApplicableStatus <- function(checkResults) { + # Look up missing tables and add variable tableIsMissing to checkResults + missingTables <- checkResults %>% + dplyr::filter( + .data$checkName == "cdmTable" + ) %>% + dplyr::mutate( + .data$cdmTableName, + tableIsMissing = .data$failed == 1, + .keep = "none" + ) + + # Look up missing fields and add variable fieldIsMissing to checkResults + missingFields <- checkResults %>% + dplyr::filter( + .data$checkName == "cdmField" + ) %>% + dplyr::mutate( + .data$cdmTableName, + .data$cdmFieldName, + fieldIsMissing = .data$failed == 1, + .keep = "none" + ) + + # Look up empty tables and add variable tableIsEmpty to checkResults + emptyTables <- checkResults %>% + dplyr::filter( + .data$checkName == "measureValueCompleteness" + ) %>% + dplyr::mutate( + .data$cdmTableName, + tableIsEmpty = .data$numDenominatorRows == 0, + .keep = "none" + ) %>% + dplyr::distinct() + + # Look up empty fields and add variable fieldIsEmpty to checkResults + emptyFields <- checkResults %>% + dplyr::filter( + .data$checkName == "measureValueCompleteness" + ) %>% + dplyr::mutate( + .data$cdmTableName, + .data$cdmFieldName, + fieldIsEmpty = .data$numDenominatorRows == .data$numViolatedRows, + .keep = "none" + ) + + # Assign notApplicable status + checkResults <- checkResults %>% + dplyr::left_join( + missingTables, + by = "cdmTableName" + ) %>% + dplyr::left_join( + missingFields, + by = c("cdmTableName", "cdmFieldName") + ) %>% + dplyr::left_join( + emptyTables, + by = "cdmTableName" + ) %>% + dplyr::left_join( + emptyFields, + by = c("cdmTableName", "cdmFieldName") + ) %>% + dplyr::mutate( + conceptIsMissing = .data$checkLevel == "CONCEPT" & is.na(.data$unitConceptId) & .data$numDenominatorRows == 0, + conceptAndUnitAreMissing = .data$checkLevel == "CONCEPT" & !is.na(.data$unitConceptId) & .data$numDenominatorRows == 0, + fieldIsMissing = dplyr::coalesce(.data$fieldIsMissing, !is.na(.data$cdmFieldName)), + fieldIsEmpty = dplyr::coalesce(.data$fieldIsEmpty, !is.na(.data$cdmFieldName)), + ) + + checkResults$notApplicable <- NA + checkResults$notApplicableReason <- NA + + conditionOccurrenceIsMissing <- missingTables %>% dplyr::filter(.data$cdmTableName == "CONDITION_OCCURRENCE") %>% dplyr::pull(tableIsMissing) + conditionOccurrenceIsEmpty <- emptyTables %>% dplyr::filter(.data$cdmTableName == "CONDITION_OCCURRENCE") %>% dplyr::pull(tableIsEmpty) + for (i in seq_len(nrow(checkResults))) { + # Special rule for measureConditionEraCompleteness, which should be notApplicable if CONDITION_OCCURRENCE is empty + if (checkResults[i, "checkName"] == "measureConditionEraCompleteness") { + if (conditionOccurrenceIsMissing || conditionOccurrenceIsEmpty) { + checkResults$notApplicable[i] <- 1 + checkResults$notApplicableReason[i] <- "Table CONDITION_OCCURRENCE is empty." + } else { + checkResults$notApplicable[i] <- 0 + } + } else { + checkResults$notApplicable[i] <- .applyNotApplicable(checkResults[i, ]) + } + } + + checkResults <- checkResults %>% + dplyr::mutate( + notApplicableReason = ifelse( + .data$notApplicable == 1, + dplyr::case_when( + !is.na(.data$notApplicableReason) ~ .data$notApplicableReason, + .data$tableIsMissing ~ sprintf("Table %s does not exist.", .data$cdmTableName), + .data$fieldIsMissing ~ sprintf("Field %s.%s does not exist.", .data$cdmTableName, .data$cdmFieldName), + .data$tableIsEmpty ~ sprintf("Table %s is empty.", .data$cdmTableName), + .data$fieldIsEmpty ~ sprintf("Field %s.%s is not populated.", .data$cdmTableName, .data$cdmFieldName), + .data$conceptIsMissing ~ sprintf("%s=%s is missing from the %s table.", .data$cdmFieldName, .data$conceptId, .data$cdmTableName), + .data$conceptAndUnitAreMissing ~ sprintf("Combination of %s=%s, unitConceptId=%s and VALUE_AS_NUMBER IS NOT NULL is missing from the %s table.", .data$cdmFieldName, .data$conceptId, .data$unitConceptId, .data$cdmTableName) #nolint + ), + NA + ), + failed = ifelse(.data$notApplicable == 1, 0, .data$failed), + passed = ifelse(.data$failed == 0 & .data$isError == 0 & .data$notApplicable == 0, 1, 0) + ) %>% + dplyr::select(-c("tableIsMissing", "fieldIsMissing", "tableIsEmpty", "fieldIsEmpty", "conceptIsMissing", "conceptAndUnitAreMissing")) + + return(checkResults) +} diff --git a/R/evaluateThresholds.R b/R/evaluateThresholds.R index 21280ba0..904d0af4 100644 --- a/R/evaluateThresholds.R +++ b/R/evaluateThresholds.R @@ -163,119 +163,9 @@ } } - missingTables <- dplyr::select( - dplyr::filter(checkResults, .data$checkName == "cdmTable" & .data$failed == 1), - "cdmTableName" - ) - if (nrow(missingTables) > 0) { - missingTables$tableIsMissing <- 1 - checkResults <- dplyr::mutate( - dplyr::left_join(checkResults, missingTables, by = "cdmTableName"), - tableIsMissing = ifelse(.data$checkName != "cdmTable" & .data$isError == 0, .data$tableIsMissing, NA) - ) - } else { - checkResults$tableIsMissing <- NA + if (.hasNAchecks(checkResults)) { + checkResults <- .calculateNotApplicableStatus(checkResults) } - missingFields <- dplyr::select( - dplyr::filter(checkResults, .data$checkName == "cdmField" & .data$failed == 1 & is.na(.data$tableIsMissing)), - "cdmTableName", "cdmFieldName" - ) - if (nrow(missingFields) > 0) { - missingFields$fieldIsMissing <- 1 - checkResults <- dplyr::mutate( - dplyr::left_join(checkResults, missingFields, by = c("cdmTableName", "cdmFieldName")), - fieldIsMissing = ifelse(.data$checkName != "cdmField" & .data$isError == 0, .data$fieldIsMissing, NA) - ) - } else { - checkResults$fieldIsMissing <- NA - } - - emptyTables <- dplyr::distinct( - dplyr::select( - dplyr::filter(checkResults, .data$checkName == "measureValueCompleteness" & - .data$numDenominatorRows == 0 & - .data$isError == 0 & - is.na(.data$tableIsMissing) & - is.na(.data$fieldIsMissing)), - "cdmTableName" - ) - ) - if (nrow(emptyTables) > 0) { - emptyTables$tableIsEmpty <- 1 - checkResults <- dplyr::mutate( - dplyr::left_join(checkResults, emptyTables, by = c("cdmTableName")), - tableIsEmpty = ifelse(.data$checkName != "cdmField" & .data$checkName != "cdmTable" & .data$isError == 0, .data$tableIsEmpty, NA) - ) - } else { - checkResults$tableIsEmpty <- NA - } - - emptyFields <- - dplyr::select( - dplyr::filter(checkResults, .data$checkName == "measureValueCompleteness" & - .data$numDenominatorRows == .data$numViolatedRows & - is.na(.data$tableIsMissing) & is.na(.data$fieldIsMissing) & is.na(.data$tableIsEmpty)), - "cdmTableName", "cdmFieldName" - ) - if (nrow(emptyFields) > 0) { - emptyFields$fieldIsEmpty <- 1 - checkResults <- dplyr::mutate( - dplyr::left_join(checkResults, emptyFields, by = c("cdmTableName", "cdmFieldName")), - fieldIsEmpty = ifelse(.data$checkName != "measureValueCompleteness" & .data$checkName != "cdmField" & .data$checkName != "isRequired" & .data$isError == 0, .data$fieldIsEmpty, NA) - ) - } else { - checkResults$fieldIsEmpty <- NA - } - - checkResults <- dplyr::mutate( - checkResults, - conceptIsMissing = ifelse( - .data$isError == 0 & - is.na(.data$tableIsMissing) & - is.na(.data$fieldIsMissing) & - is.na(.data$tableIsEmpty) & - is.na(.data$fieldIsEmpty) & - .data$checkLevel == "CONCEPT" & - is.na(.data$unitConceptId) & - .data$numDenominatorRows == 0, - 1, - NA - ) - ) - - checkResults <- dplyr::mutate( - checkResults, - conceptAndUnitAreMissing = ifelse( - .data$isError == 0 & - is.na(.data$tableIsMissing) & - is.na(.data$fieldIsMissing) & - is.na(.data$tableIsEmpty) & - is.na(.data$fieldIsEmpty) & - .data$checkLevel == "CONCEPT" & - !is.na(.data$unitConceptId) & - .data$numDenominatorRows == 0, - 1, - NA - ) - ) - - checkResults <- dplyr::mutate( - checkResults, - notApplicable = dplyr::coalesce(.data$tableIsMissing, .data$fieldIsMissing, .data$tableIsEmpty, .data$fieldIsEmpty, .data$conceptIsMissing, .data$conceptAndUnitAreMissing, 0), - notApplicableReason = dplyr::case_when( - !is.na(.data$tableIsMissing) ~ sprintf("Table %s does not exist.", .data$cdmTableName), - !is.na(.data$fieldIsMissing) ~ sprintf("Field %s.%s does not exist.", .data$cdmTableName, .data$cdmFieldName), - !is.na(.data$tableIsEmpty) ~ sprintf("Table %s is empty.", .data$cdmTableName), - !is.na(.data$fieldIsEmpty) ~ sprintf("Field %s.%s is not populated.", .data$cdmTableName, .data$cdmFieldName), - !is.na(.data$conceptIsMissing) ~ sprintf("%s=%s is missing from the %s table.", .data$cdmFieldName, .data$conceptId, .data$cdmTableName), - !is.na(.data$conceptAndUnitAreMissing) ~ sprintf("Combination of %s=%s, unitConceptId=%s and VALUE_AS_NUMBER IS NOT NULL is missing from the %s table.", .data$cdmFieldName, .data$conceptId, .data$unitConceptId, .data$cdmTableName) - ) - ) - - checkResults <- dplyr::select(checkResults, -c("tableIsMissing", "fieldIsMissing", "tableIsEmpty", "fieldIsEmpty", "conceptIsMissing", "conceptAndUnitAreMissing")) - checkResults <- dplyr::mutate(checkResults, failed = ifelse(.data$notApplicable == 1, 0, .data$failed)) - checkResults <- dplyr::mutate(checkResults, passed = ifelse(.data$failed == 0 & .data$isError == 0 & .data$notApplicable == 0, 1, 0)) - checkResults } diff --git a/R/executeDqChecks.R b/R/executeDqChecks.R index 7bb40e75..e153b3c9 100644 --- a/R/executeDqChecks.R +++ b/R/executeDqChecks.R @@ -114,16 +114,10 @@ executeDqChecks <- function(connectionDetails, stopifnot(is.character(cdmVersion)) # Warning if check names for determining NA is missing - if (!length(checkNames) == 0) { - naCheckNames <- c("cdmTable", "cdmField", "measureValueCompleteness") - missingNAChecks <- !(naCheckNames %in% checkNames) - if (any(missingNAChecks)) { - missingNACheckNames <- paste(naCheckNames[missingNAChecks], collapse = ", ") - warning(sprintf("Missing check names to calculate the 'Not Applicable' status: %s", missingNACheckNames)) - } + if (length(checkNames) > 0 && !.containsNAchecks(checkNames)) { + warning("Missing check names to calculate the 'Not Applicable' status.") } - # temporary patch to work around vroom 1.6.4 bug readr::local_edition(1) diff --git a/tests/testthat/setup.R b/tests/testthat/setup.R index f1aa8636..761fdeab 100644 --- a/tests/testthat/setup.R +++ b/tests/testthat/setup.R @@ -12,3 +12,6 @@ if (Sys.getenv("DONT_DOWNLOAD_JDBC_DRIVERS", "") == "TRUE") { connectionDetailsEunomia <- Eunomia::getEunomiaConnectionDetails() cdmDatabaseSchemaEunomia <- "main" resultsDatabaseSchemaEunomia <- "main" + +# Separate connection details for NA tests, as this requires removing records +connectionDetailsEunomiaNaChecks <- Eunomia::getEunomiaConnectionDetails() \ No newline at end of file diff --git a/tests/testthat/test-calculateNotApplicableStatus.R b/tests/testthat/test-calculateNotApplicableStatus.R new file mode 100644 index 00000000..0f3bc64c --- /dev/null +++ b/tests/testthat/test-calculateNotApplicableStatus.R @@ -0,0 +1,91 @@ +library(testthat) + +test_that("Not Applicable status Table Empty", { + outputFolder <- tempfile("dqd_") + on.exit(unlink(outputFolder, recursive = TRUE)) + + # Make sure the device exposure table is empty + connection <- DatabaseConnector::connect(connectionDetailsEunomiaNaChecks) + DatabaseConnector::executeSql(connection, "DELETE FROM DEVICE_EXPOSURE;") + DatabaseConnector::disconnect(connection) + + results <- executeDqChecks( + connectionDetails = connectionDetailsEunomiaNaChecks, + cdmDatabaseSchema = cdmDatabaseSchemaEunomia, + resultsDatabaseSchema = resultsDatabaseSchemaEunomia, + cdmSourceName = "Eunomia", + checkNames = c("cdmTable", "cdmField", "measureValueCompleteness"), + # Eunomia COST table has misspelled 'REVEUE_CODE_SOURCE_VALUE' + tablesToExclude = c("COST", "CONCEPT", "VOCABULARY", "CONCEPT_ANCESTOR", "CONCEPT_RELATIONSHIP", "CONCEPT_CLASS", "CONCEPT_SYNONYM", "RELATIONSHIP", "DOMAIN"), + outputFolder = outputFolder, + writeToTable = FALSE + ) + + r <- results$CheckResults[results$CheckResults$checkName == "measureValueCompleteness" & + results$CheckResults$tableName == "device_exposure", ] + expect_true(all(r$notApplicable == 1)) +}) + +test_that("measureConditionEraCompleteness Not Applicable if condition_occurrence empty", { + outputFolder <- tempfile("dqd_") + on.exit(unlink(outputFolder, recursive = TRUE)) + + # Remove records from Condition Occurrence + connection <- DatabaseConnector::connect(connectionDetailsEunomiaNaChecks) + DatabaseConnector::executeSql(connection, "CREATE TABLE CONDITION_OCCURRENCE_BACK AS SELECT * FROM CONDITION_OCCURRENCE;") + DatabaseConnector::executeSql(connection, "DELETE FROM CONDITION_OCCURRENCE;") + DatabaseConnector::disconnect(connection) + + results <- executeDqChecks( + connectionDetails = connectionDetailsEunomiaNaChecks, + cdmDatabaseSchema = cdmDatabaseSchemaEunomia, + resultsDatabaseSchema = resultsDatabaseSchemaEunomia, + cdmSourceName = "Eunomia", + checkNames = c("cdmTable", "cdmField", "measureValueCompleteness", "measureConditionEraCompleteness"), + # Eunomia COST table has misspelled 'REVEUE_CODE_SOURCE_VALUE' + tablesToExclude = c("COST", "CONCEPT", "VOCABULARY", "CONCEPT_ANCESTOR", "CONCEPT_RELATIONSHIP", "CONCEPT_CLASS", "CONCEPT_SYNONYM", "RELATIONSHIP", "DOMAIN"), + outputFolder = outputFolder, + writeToTable = FALSE + ) + + # Reinstate Condition Occurrence + connection <- DatabaseConnector::connect(connectionDetailsEunomiaNaChecks) + DatabaseConnector::executeSql(connection, "INSERT INTO CONDITION_OCCURRENCE SELECT * FROM CONDITION_OCCURRENCE_BACK;") + DatabaseConnector::executeSql(connection, "DROP TABLE CONDITION_OCCURRENCE_BACK;") + disconnect(connection) + + r <- results$CheckResults[results$CheckResults$checkName == "measureConditionEraCompleteness", ] + expect_true(r$notApplicable == 1) +}) + +test_that("measureConditionEraCompleteness Fails if condition_era empty", { + outputFolder <- tempfile("dqd_") + on.exit(unlink(outputFolder, recursive = TRUE)) + + # Remove records from Condition Era + connection <- DatabaseConnector::connect(connectionDetailsEunomiaNaChecks) + DatabaseConnector::executeSql(connection, "CREATE TABLE CONDITION_ERA_BACK AS SELECT * FROM CONDITION_ERA;") + DatabaseConnector::executeSql(connection, "DELETE FROM CONDITION_ERA;") + DatabaseConnector::disconnect(connection) + + results <- executeDqChecks( + connectionDetails = connectionDetailsEunomiaNaChecks, + cdmDatabaseSchema = cdmDatabaseSchemaEunomia, + resultsDatabaseSchema = resultsDatabaseSchemaEunomia, + cdmSourceName = "Eunomia", + checkNames = c("cdmTable", "cdmField", "measureValueCompleteness", "measureConditionEraCompleteness"), + # Eunomia COST table has misspelled 'REVEUE_CODE_SOURCE_VALUE' + tablesToExclude = c("COST", "CONCEPT", "VOCABULARY", "CONCEPT_ANCESTOR", "CONCEPT_RELATIONSHIP", "CONCEPT_CLASS", "CONCEPT_SYNONYM", "RELATIONSHIP", "DOMAIN"), + outputFolder = outputFolder, + writeToTable = FALSE + ) + + # Reinstate the Condition Era + connection <- DatabaseConnector::connect(connectionDetailsEunomiaNaChecks) + DatabaseConnector::executeSql(connection, "INSERT INTO CONDITION_ERA SELECT * FROM CONDITION_ERA_BACK;") + DatabaseConnector::executeSql(connection, "DROP TABLE CONDITION_ERA_BACK;") + DatabaseConnector::disconnect(connection) + + r <- results$CheckResults[results$CheckResults$checkName == "measureConditionEraCompleteness", ] + expect_true(r$failed == 1) +}) \ No newline at end of file