Executing Data Quality Checks

# which DQ checks to run? ------------------------------------
checkNames <- c() # Names can be found in inst/csv/OMOP_CDM_v5.3_Check_Descriptions.csv
+# want to EXCLUDE a pre-specified list of checks? run the following code:
+#
+# checksToExclude <- c() # Names of check types to exclude from your DQD run
+# allChecks <- DataQualityDashboard::listDqChecks()
+# checkNames <- allChecks$checkDescriptions %>%
+# subset(!(checkName %in% checksToExclude)) %>%
+# select(checkName)
+
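+# note: the snippet above relies on dplyr (for %>% and select()), and select()
+# returns a one-column data frame; if a plain character vector is preferred,
+# dplyr::pull() can be swapped in (a sketch, assuming dplyr is attached):
+#
+# checkNames <- allChecks$checkDescriptions %>%
+#   subset(!(checkName %in% checksToExclude)) %>%
+#   pull(checkName)
+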
# which CDM tables to exclude? ------------------------------------
tablesToExclude <- c("CONCEPT", "VOCABULARY", "CONCEPT_ANCESTOR", "CONCEPT_RELATIONSHIP", "CONCEPT_CLASS", "CONCEPT_SYNONYM", "RELATIONSHIP", "DOMAIN") # list of CDM table names to skip evaluating checks against; by default DQD excludes the vocab tables
@@ -226,11 +234,13 @@
Requires R (version 3.2.2 or higher). Requires DatabaseConnector (version 2.0.2 or higher).
+
A variety of database platforms are supported, as documented here.
+
Note that while data quality check threshold files are provided for OMOP CDM versions 5.2, 5.3, and 5.4, the package is currently only tested against versions 5.3 and 5.4.
-This function will connect to the database, generate the sql scripts, and run the data quality checks against the database.
+
+This function will connect to the database, generate the sql scripts, and run the data quality checks against the database. By default, results will be written to a json file as well as a database table.
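
A minimal invocation, sketched from the parameters documented in extras/codeToRun.R below (schema and source names are placeholders):

results <- DataQualityDashboard::executeDqChecks(connectionDetails = connectionDetails,
                                                 cdmDatabaseSchema = "yourCdmSchema",
                                                 resultsDatabaseSchema = "yourResultsSchema",
                                                 cdmSourceName = "Your CDM Source",
                                                 outputFolder = "output")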
diff --git a/docs/reference/listDqChecks.html b/docs/reference/listDqChecks.html
index fa949f69..ef17b90d 100644
--- a/docs/reference/listDqChecks.html
+++ b/docs/reference/listDqChecks.html
@@ -17,7 +17,7 @@
DataQualityDashboard
- 2.4.0
+ 2.4.1
diff --git a/docs/reference/reEvaluateThresholds.html b/docs/reference/reEvaluateThresholds.html
index c23b381b..0a7df4a7 100644
--- a/docs/reference/reEvaluateThresholds.html
+++ b/docs/reference/reEvaluateThresholds.html
@@ -17,7 +17,7 @@
DataQualityDashboard
- 2.4.0
+ 2.4.1
diff --git a/docs/reference/viewDqDashboard.html b/docs/reference/viewDqDashboard.html
index b8e38e32..41557122 100644
--- a/docs/reference/viewDqDashboard.html
+++ b/docs/reference/viewDqDashboard.html
@@ -17,7 +17,7 @@
DataQualityDashboard
- 2.4.0
+ 2.4.1
diff --git a/docs/reference/writeJsonResultsToCsv.html b/docs/reference/writeJsonResultsToCsv.html
index 904f171f..3083ad75 100644
--- a/docs/reference/writeJsonResultsToCsv.html
+++ b/docs/reference/writeJsonResultsToCsv.html
@@ -17,7 +17,7 @@
DataQualityDashboard
- 2.4.0
+ 2.4.1
diff --git a/docs/reference/writeJsonResultsToTable.html b/docs/reference/writeJsonResultsToTable.html
index 8f1cff77..43b428f9 100644
--- a/docs/reference/writeJsonResultsToTable.html
+++ b/docs/reference/writeJsonResultsToTable.html
@@ -17,7 +17,7 @@
DataQualityDashboard
- 2.4.0
+ 2.4.1
diff --git a/extras/DataQualityDashboard.pdf b/extras/DataQualityDashboard.pdf
index 71a5842d..24a693fa 100644
Binary files a/extras/DataQualityDashboard.pdf and b/extras/DataQualityDashboard.pdf differ
diff --git a/extras/DevelopersREADME.md b/extras/DevelopersREADME.md
new file mode 100644
index 00000000..629d8d08
--- /dev/null
+++ b/extras/DevelopersREADME.md
@@ -0,0 +1,69 @@
+DQD Developers README
+====================
+
+Dev Setup
+====================
+1. R setup: https://ohdsi.github.io/Hades/rSetup.html
+
+2. Local OMOP CDM setup
+
+ If you already have a CDM available for development work/testing, you may skip this step
+
+ a. Install Postgres and create a localhost server
+
+ b. Create a new database in localhost for your test CDM, and create a schema in that database for the CDM tables
+
+ c. Using the [CDMConnector](https://odyosg.github.io/CDMConnector/index.html) package (a rough sketch follows this list):
+
+ - i. Download a sample OMOP CDM into a DuckDB database, as documented [here](https://odyosg.github.io/CDMConnector/reference/eunomiaDir.html)
+
+ - ii. Copy the CDM into your local Postgres database, as documented [here](https://odyosg.github.io/CDMConnector/reference/copy_cdm_to.html)
+
+
+3. Fork the DataQualityDashboard repo
+
+4. Clone your fork to your computer
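+
+ A rough sketch of steps c.i and c.ii above (function names per the CDMConnector reference pages linked there; connection arguments are placeholders and may differ across versions):
+
+    library(CDMConnector)
+    library(DBI)
+    duck <- dbConnect(duckdb::duckdb(), dbdir = eunomiaDir()) # downloads the sample CDM into a DuckDB database
+    cdm <- cdm_from_con(duck, cdm_schema = "main", write_schema = "main")
+    pg <- dbConnect(RPostgres::Postgres(), dbname = "cdm_dev", host = "localhost") # your local test database
+    copy_cdm_to(pg, cdm, schema = "cdm") # copy the sample CDM into your Postgres schema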
+
+PR Process
+====================
+
+Be sure you're aware of our Pull Request guidelines before diving into development work: https://github.com/OHDSI/DataQualityDashboard/blob/main/.github/pull_request_template.md
+
+1. Sync your fork's develop branch with the upstream repo
+2. Check out and pull the develop branch of your fork
+3. Create a new branch named (briefly) according to the issue being fixed / feature being added
+
+ a. If possible, limit the changes made on each branch to those needed for a single GitHub issue
+
+ b. If an issue or new feature requires extensive changes, split your work across multiple sub-branches off of your feature branch, or across multiple feature branches
+4. Make your changes
+
+ a. If you are adding new functionality, you must add unit tests to cover the new function(s)/code
+
+ b. If you are fixing a bug, you must add a unit test for the regression
+5. Run R CMD Check and resolve all errors, warnings, and notes
+
+ a. At the time of writing, the NOTE regarding the size of the package is expected and does not need to be resolved
+6. Run `test_file(path = "tests/testthat/test-executeDqChecks.R")` and resolve all test failures
+
+ a. This file contains tests using testthat's snapshot feature, which do not work when tests are run via R CMD Check
+
+ b. See the testthat docs to learn more about snapshots and how to resolve snapshot test failures: https://testthat.r-lib.org/articles/snapshotting.html (a sketch follows this list)
+7. Build & install the package locally, then run DQD against your local Postgres database and view the results. Resolve any errors that arise
+8. Commit your changes and push them to GitHub
+9. Back on GitHub, open up a PR for your changes, making sure to set the target branch to the `develop` branch of the parent OHDSI/DataQualityDashboard repo
+10. Wait for the automated checks to complete
+
+ a. If they all succeed, your PR is ready for review!
+
+ b. If any checks fail, check the logs and address errors in your code by repeating steps 4-7 above
+11. Once your PR is approved by a maintainer, you may merge it into the `develop` branch
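+
+A sketch of the snapshot workflow from step 6 (these are testthat's own helpers; snapshot_review() additionally needs the diffviewer package):
+
+    library(testthat)
+    test_file(path = "tests/testthat/test-executeDqChecks.R") # run the snapshot tests
+    snapshot_review() # interactively inspect snapshot diffs
+    snapshot_accept() # accept new snapshots once the changes are confirmed intentional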
+
+General Guidance
+====================
+- HADES Developer Guidelines: https://ohdsi.github.io/Hades/developerGuidelines.html
+- HADES Code Style Requirements: https://ohdsi.github.io/Hades/codeStyle.html
+- HADES Release Process: https://ohdsi.github.io/Hades/releaseProcess.html
+
\ No newline at end of file
diff --git a/extras/codeToRun.R b/extras/codeToRun.R
index 4807d3c2..81cc0e45 100644
--- a/extras/codeToRun.R
+++ b/extras/codeToRun.R
@@ -19,11 +19,11 @@ library(DatabaseConnector)
# fill out the connection details -----------------------------------------------------------------------
connectionDetails <- DatabaseConnector::createConnectionDetails(
- dbms = "",
- user = "",
- password = "",
- server = "",
- port = "",
+ dbms = "",
+ user = "",
+ password = "",
+ server = "",
+ port = "",
extraSettings = "",
pathToDriver = ""
)
@@ -31,30 +31,47 @@ connectionDetails <- DatabaseConnector::createConnectionDetails(
cdmDatabaseSchema <- "yourCdmSchema" # the fully qualified database schema name of the CDM
resultsDatabaseSchema <- "yourResultsSchema" # the fully qualified database schema name of the results schema (that you can write to)
cdmSourceName <- "Your CDM Source" # a human readable name for your CDM source
-cdmVersion <- "5.4" # the CDM version you are targetting. Currently supporst 5.2.2, 5.3.1, and 5.4
+cdmVersion <- "5.4" # the CDM version you are targeting. Currently supports 5.2, 5.3, and 5.4
# determine how many threads (concurrent SQL sessions) to use ----------------------------------------
numThreads <- 1 # on Redshift, 3 seems to work well
# specify if you want to execute the queries or inspect them ------------------------------------------
-sqlOnly <- FALSE # set to TRUE if you just want to get the SQL scripts and not actually run the queries. See codeToRun_sqlOnly.R for other sqlOnly parameters
+sqlOnly <- FALSE # set to TRUE if you just want to get the SQL scripts and not actually run the queries
+sqlOnlyIncrementalInsert <- FALSE # set to TRUE if you want the generated SQL queries to calculate DQD results and insert them into a database table (@resultsDatabaseSchema.@writeTableName)
+sqlOnlyUnionCount <- 1 # in sqlOnlyIncrementalInsert mode, the number of check SQL queries to union in a single query; higher numbers can improve performance in some DBMSs (e.g. a value of 25 may be 25x faster)
+
+# NOTES specific to sqlOnly <- TRUE option ------------------------------------------------------------
+# 1. You do not need a live database connection. Instead, connectionDetails only needs these parameters:
+# connectionDetails <- DatabaseConnector::createConnectionDetails(
+# dbms = "", # specify your dbms
+# pathToDriver = "/"
+# )
+# 2. Since these are fully functional queries, this can help with debugging.
+# 3. In the results output by the sqlOnlyIncrementalInsert queries, placeholders are populated for execution_time, query_text, and warnings/errors; and the NOT_APPLICABLE rules are not applied.
+# 4. In order to use the generated SQL to insert metadata and check results into the output table, you must set sqlOnlyIncrementalInsert = TRUE. Otherwise sqlOnly is backwards compatible with <= v2.2.0, generating queries which run the checks but don't store the results.
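+#
+# For example, a sketch of the settings for generating incremental-insert SQL without executing anything (values illustrative; see the notes above):
+#
+# sqlOnly <- TRUE
+# sqlOnlyIncrementalInsert <- TRUE
+# sqlOnlyUnionCount <- 25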
# where should the results and logs go? ----------------------------------------------------------------
outputFolder <- "output"
outputFile <- "results.json"
+
# logging type -------------------------------------------------------------------------------------
verboseMode <- TRUE # set to FALSE if you don't want the logs to be printed to the console
-# write results to table? -----------------------------------------------------------------------
-writeToTable <- FALSE # set to TRUE if you want to write to a SQL table in the results schema
+# write results to table? ------------------------------------------------------------------------------
+writeToTable <- TRUE # set to FALSE if you want to skip writing to a SQL table in the results schema
+
+# specify the name of the results table (used when writeToTable = TRUE and when sqlOnlyIncrementalInsert = TRUE)
+writeTableName <- "dqdashboard_results"
# write results to a csv file? -----------------------------------------------------------------------
writeToCsv <- FALSE # set to FALSE if you want to skip writing to csv file
csvFile <- "" # only needed if writeToCsv is set to TRUE
# if writing to table and using Redshift, bulk loading can be initialized -------------------------------
+
# Sys.setenv("AWS_ACCESS_KEY_ID" = "",
# "AWS_SECRET_ACCESS_KEY" = "",
# "AWS_DEFAULT_REGION" = "",
@@ -67,46 +84,46 @@ csvFile <- "" # only needed if writeToCsv is set to TRUE
checkLevels <- c("TABLE", "FIELD", "CONCEPT")
# which DQ checks to run? ------------------------------------
-checkNames <- c() # Names can be found in inst/csv/OMOP_CDM_v5.3.1_Check_Desciptions.csv
+checkNames <- c() # Names can be found in inst/csv/OMOP_CDM_v5.3_Check_Descriptions.csv
+
+# want to EXCLUDE a pre-specified list of checks? run the following code:
+#
+# checksToExclude <- c() # Names of check types to exclude from your DQD run
+# allChecks <- DataQualityDashboard::listDqChecks()
+# checkNames <- allChecks$checkDescriptions %>%
+# subset(!(checkName %in% checksToExclude)) %>%
+# select(checkName)
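+# note: the snippet above assumes dplyr is attached (for %>% and select());
+# alternatively, to run only a specific set of checks, list them directly, e.g.:
+# checkNames <- c("measurePersonCompleteness") # an illustrative check name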
# which CDM tables to exclude? ------------------------------------
-tablesToExclude <- c()
+tablesToExclude <- c("CONCEPT", "VOCABULARY", "CONCEPT_ANCESTOR", "CONCEPT_RELATIONSHIP", "CONCEPT_CLASS", "CONCEPT_SYNONYM", "RELATIONSHIP", "DOMAIN") # list of CDM table names to skip evaluating checks against; by default DQD excludes the vocab tables
# run the job --------------------------------------------------------------------------------------
-DataQualityDashboard::executeDqChecks(
- connectionDetails = connectionDetails,
- cdmDatabaseSchema = cdmDatabaseSchema,
- resultsDatabaseSchema = resultsDatabaseSchema,
- cdmSourceName = cdmSourceName,
- cdmVersion = cdmVersion
- numThreads = numThreads,
- sqlOnly = sqlOnly,
- outputFolder = outputFolder,
- outputFile = outputFile,
- verboseMode = verboseMode,
- writeToTable = writeToTable,
- writeToCsv = writeToCsv,
- csvFile = csvFile,
- checkLevels = checkLevels,
- tablesToExclude = tablesToExclude,
- checkNames = checkNames
-)
+DataQualityDashboard::executeDqChecks(connectionDetails = connectionDetails,
+ cdmDatabaseSchema = cdmDatabaseSchema,
+ resultsDatabaseSchema = resultsDatabaseSchema,
+ cdmSourceName = cdmSourceName,
+ cdmVersion = cdmVersion,
+ numThreads = numThreads,
+ sqlOnly = sqlOnly,
+ sqlOnlyUnionCount = sqlOnlyUnionCount,
+ sqlOnlyIncrementalInsert = sqlOnlyIncrementalInsert,
+ outputFolder = outputFolder,
+ outputFile = outputFile,
+ verboseMode = verboseMode,
+ writeToTable = writeToTable,
+ writeToCsv = writeToCsv,
+ csvFile = csvFile,
+ checkLevels = checkLevels,
+ tablesToExclude = tablesToExclude,
+ checkNames = checkNames)
# inspect logs ----------------------------------------------------------------------------
-ParallelLogger::launchLogViewer(
- logFileName = file.path(outputFolder,
- sprintf("log_DqDashboard_%s.txt", cdmSourceName))
-)
-
-# View the Data Quality Dashboard using the integrated shiny application ------------------------------------
-DataQualityDashboard::viewDqDashboard(
- jsonPath = file.path(getwd(), outputFolder, outputFile)
-)
+ParallelLogger::launchLogViewer(logFileName = file.path(outputFolder, cdmSourceName,
+ sprintf("log_DqDashboard_%s.txt", cdmSourceName)))
# (OPTIONAL) if you want to write the JSON file to the results table separately -----------------------------
-jsonFilePath <- "" # put the path to the outputted JSON file
-DataQualityDashboard::writeJsonResultsToTable(
- connectionDetails = connectionDetails,
- resultsDatabaseSchema = resultsDatabaseSchema,
- jsonFilePath = jsonFilePath
-)
+jsonFilePath <- ""
+DataQualityDashboard::writeJsonResultsToTable(connectionDetails = connectionDetails,
+ resultsDatabaseSchema = resultsDatabaseSchema,
+ jsonFilePath = jsonFilePath)
+
diff --git a/inst/doc/AddNewCheck.pdf b/inst/doc/AddNewCheck.pdf
index 50f6d97c..ff24ed57 100644
Binary files a/inst/doc/AddNewCheck.pdf and b/inst/doc/AddNewCheck.pdf differ
diff --git a/inst/doc/CheckStatusDefinitions.pdf b/inst/doc/CheckStatusDefinitions.pdf
index 0b2209f7..b158d7cb 100644
Binary files a/inst/doc/CheckStatusDefinitions.pdf and b/inst/doc/CheckStatusDefinitions.pdf differ
diff --git a/inst/doc/CheckTypeDescriptions.pdf b/inst/doc/CheckTypeDescriptions.pdf
index 6fddddff..4fc209fa 100644
Binary files a/inst/doc/CheckTypeDescriptions.pdf and b/inst/doc/CheckTypeDescriptions.pdf differ
diff --git a/inst/doc/DataQualityDashboard.pdf b/inst/doc/DataQualityDashboard.pdf
index 89042448..9d1faceb 100644
Binary files a/inst/doc/DataQualityDashboard.pdf and b/inst/doc/DataQualityDashboard.pdf differ
diff --git a/inst/doc/DqdForCohorts.pdf b/inst/doc/DqdForCohorts.pdf
index d692f689..dc381e31 100644
Binary files a/inst/doc/DqdForCohorts.pdf and b/inst/doc/DqdForCohorts.pdf differ
diff --git a/inst/doc/SqlOnly.pdf b/inst/doc/SqlOnly.pdf
index 4f9f5637..f9107177 100644
Binary files a/inst/doc/SqlOnly.pdf and b/inst/doc/SqlOnly.pdf differ
diff --git a/inst/doc/Thresholds.pdf b/inst/doc/Thresholds.pdf
index 6775e554..bb044544 100644
Binary files a/inst/doc/Thresholds.pdf and b/inst/doc/Thresholds.pdf differ
diff --git a/man/executeDqChecks.Rd b/man/executeDqChecks.Rd
index 1ae7d2e2..0b7b5f7f 100644
--- a/man/executeDqChecks.Rd
+++ b/man/executeDqChecks.Rd
@@ -92,5 +92,5 @@ with the fields cohort_definition_id and subject_id.}
If sqlOnly = FALSE, a list object of results
}
\description{
-This function will connect to the database, generate the sql scripts, and run the data quality checks against the database.
+This function will connect to the database, generate the sql scripts, and run the data quality checks against the database. By default, results will be written to a json file as well as a database table.
}
diff --git a/tests/testthat/setup.R b/tests/testthat/setup.R
index 16ae8c13..f1aa8636 100644
--- a/tests/testthat/setup.R
+++ b/tests/testthat/setup.R
@@ -6,17 +6,9 @@ if (Sys.getenv("DONT_DOWNLOAD_JDBC_DRIVERS", "") == "TRUE") {
downloadJdbcDrivers("postgresql", jdbcDriverFolder)
downloadJdbcDrivers("sql server", jdbcDriverFolder)
downloadJdbcDrivers("oracle", jdbcDriverFolder)
+ downloadJdbcDrivers("redshift", jdbcDriverFolder)
}
connectionDetailsEunomia <- Eunomia::getEunomiaConnectionDetails()
cdmDatabaseSchemaEunomia <- "main"
resultsDatabaseSchemaEunomia <- "main"
-
-remove_sql_comments <- function(sql) {
- sql0 <- gsub("--.*?\\n|--.*?\\r", " ", sql) # remove single-line SQL comments
- sql1 <- gsub("\\r|\\n|\\t", " ", sql0) # convert tabs and newlines to spaces
- sql2 <- gsub("/*", "@@@@ ", sql1, fixed = TRUE) # must add spaces between multi-line comments for quote removal to work
- sql3 <- gsub("*/", " @@@@", sql2, fixed = TRUE) # must add spaces between multi-line comments for quote removal to work
- sql4 <- gsub("@@@@ .+? @@@@", " ", sql3, ) # remove multi-line comments
- sql5 <- gsub("\\s+", " ", sql4) # remove multiple spaces
-}
diff --git a/tests/testthat/test-executeDqChecks.R b/tests/testthat/test-executeDqChecks.R
index e4183cbd..01cb11b3 100644
--- a/tests/testthat/test-executeDqChecks.R
+++ b/tests/testthat/test-executeDqChecks.R
@@ -1,5 +1,5 @@
library(testthat)
-local_edition(3)
+testthat::local_edition(3)
test_that("Execute a single DQ check on Synthea/Eunomia", {
outputFolder <- tempfile("dqd_")
@@ -119,7 +119,8 @@ test_that("Execute a single DQ check on remote databases", {
dbTypes <- c(
"oracle",
"postgresql",
- "sql server"
+ "sql server",
+ "redshift"
)
for (dbType in dbTypes) {
@@ -129,7 +130,7 @@ test_that("Execute a single DQ check on remote databases", {
if (sysUser != "" &
sysPassword != "" &
sysServer != "") {
- cdmDatabaseSchema <- Sys.getenv(sprintf("CDM5_%s_CDM_SCHEMA", toupper(gsub(" ", "_", dbType))))
+ cdmDatabaseSchema <- Sys.getenv(sprintf("CDM5_%s_CDM54_SCHEMA", toupper(gsub(" ", "_", dbType))))
    resultsDatabaseSchema <- Sys.getenv(sprintf("CDM5_%s_OHDSI_SCHEMA", toupper(gsub(" ", "_", dbType))))
connectionDetails <- createConnectionDetails(
@@ -151,7 +152,8 @@ test_that("Execute a single DQ check on remote databases", {
outputFolder = outputFolder,
verboseMode = FALSE,
writeToTable = FALSE,
- checkNames = "measurePersonCompleteness"
+ checkNames = "measurePersonCompleteness",
+ cdmVersion = "5.4"
),
regexp = "^Missing check names.*"
)
diff --git a/vignettes/DataQualityDashboard.rmd b/vignettes/DataQualityDashboard.rmd
index f9c4ada0..1a60e78a 100644
--- a/vignettes/DataQualityDashboard.rmd
+++ b/vignettes/DataQualityDashboard.rmd
@@ -111,6 +111,14 @@ checkLevels <- c("TABLE", "FIELD", "CONCEPT")
# which DQ checks to run? ------------------------------------
checkNames <- c() # Names can be found in inst/csv/OMOP_CDM_v5.3_Check_Descriptions.csv
+# want to EXCLUDE a pre-specified list of checks? run the following code:
+#
+# checksToExclude <- c() # Names of check types to exclude from your DQD run
+# allChecks <- DataQualityDashboard::listDqChecks()
+# checkNames <- allChecks$checkDescriptions %>%
+# subset(!(checkName %in% checksToExclude)) %>%
+# select(checkName)
+
# which CDM tables to exclude? ------------------------------------
tablesToExclude <- c("CONCEPT", "VOCABULARY", "CONCEPT_ANCESTOR", "CONCEPT_RELATIONSHIP", "CONCEPT_CLASS", "CONCEPT_SYNONYM", "RELATIONSHIP", "DOMAIN") # list of CDM table names to skip evaluating checks against; by default DQD excludes the vocab tables
@@ -119,11 +127,13 @@ DataQualityDashboard::executeDqChecks(connectionDetails = connectionDetails,
cdmDatabaseSchema = cdmDatabaseSchema,
resultsDatabaseSchema = resultsDatabaseSchema,
cdmSourceName = cdmSourceName,
+ cdmVersion = cdmVersion,
numThreads = numThreads,
sqlOnly = sqlOnly,
sqlOnlyUnionCount = sqlOnlyUnionCount,
sqlOnlyIncrementalInsert = sqlOnlyIncrementalInsert,
outputFolder = outputFolder,
+ outputFile = outputFile,
verboseMode = verboseMode,
writeToTable = writeToTable,
writeToCsv = writeToCsv,