Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Convert DQD camelCase to upper-snake to match AresIndexer #34

Closed
wants to merge 11 commits into from
Closed
4 changes: 3 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ VignetteBuilder: knitr
URL: https://github.com/OHDSI/AresIndexer
Encoding: UTF-8
LazyData: true
RoxygenNote: 7.2.1
RoxygenNote: 7.2.3
Depends:
R (>= 3.5.0)
Imports:
Expand All @@ -30,3 +30,5 @@ Imports:
Suggests:
knitr,
rmarkdown
Remotes:
ohdsi/Achilles
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@ export(buildExportQueryIndex)
export(buildNetworkIndex)
export(buildNetworkPerformanceIndex)
export(buildNetworkUnmappedSourceCodeIndex)
export(computeCharacterizationDifference)
export(getSourceReleaseKey)
import(data.table)
import(dplyr)
import(jsonlite)
import(lubridate)
Expand Down
3 changes: 2 additions & 1 deletion R/AugmentConceptFiles.R
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,8 @@ augmentConceptFiles <- function(releaseFolder) {
if (file.exists(dataQualityResultsFile)) {
writeLines("updating concept files with data quality results")
dataQualityResults <- jsonlite::fromJSON(dataQualityResultsFile)
results <- dataQualityResults$CheckResults
results <- dataQualityResults$CheckResults %>%
dplyr::rename_with(SqlRender::camelCaseToSnakeCase) %>% dplyr::rename_with(toupper)

# augment achilles concept files with data quality failure count for relevant concept checks
conceptAggregates <- results %>% filter(!is.na(results$CONCEPT_ID) && results$FAILED==1) %>% count(CONCEPT_ID,tolower(CDM_TABLE_NAME))
Expand Down
11 changes: 7 additions & 4 deletions R/BuildDataQualityHistoryIndex.R
Original file line number Diff line number Diff line change
Expand Up @@ -32,16 +32,19 @@ buildDataQualityHistoryIndex <-
stratified_index <- data.table::data.table()

addResultsToIndex <- function(json) {
cdm_source_name <- json$Metadata[1,"CDM_SOURCE_NAME"]
cdm_source_abbreviation <- json$Metadata[1,"CDM_SOURCE_ABBREVIATION"]
vocabulary_version <- json$Metadata[1,"VOCABULARY_VERSION"]
cdm_release_date <- format(lubridate::ymd(json$Metadata[1,"CDM_RELEASE_DATE"]),"%Y-%m-%d")
thisMetadata <- json$Metadata %>%
dplyr::rename_with(SqlRender::camelCaseToSnakeCase) %>% dplyr::rename_with(toupper)
cdm_source_name <- thisMetadata[1,"CDM_SOURCE_NAME"]
cdm_source_abbreviation <- thisMetadata[1,"CDM_SOURCE_ABBREVIATION"]
vocabulary_version <- thisMetadata[1,"VOCABULARY_VERSION"]
cdm_release_date <- format(lubridate::ymd(thisMetadata[1,"CDM_RELEASE_DATE"]),"%Y-%m-%d")
count_passed <- as.numeric(json$Overview$countPassed)
count_failed <- as.numeric(json$Overview$countOverallFailed)
count_total <- count_passed + count_failed
dqd_execution_date <- format(lubridate::ymd_hms(json$endTimestamp),"%Y-%m-%d")

stratifiedAggregates <- json$CheckResults %>%
dplyr::rename_with(SqlRender::camelCaseToSnakeCase) %>% dplyr::rename_with(toupper) %>%
filter(FAILED==1) %>%
group_by(CATEGORY, toupper(CDM_TABLE_NAME)) %>%
summarise(count_value=n())
Expand Down
26 changes: 18 additions & 8 deletions R/BuildDataQualityIndex.R
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ buildDataQualityIndex <- function(sourceFolders, outputFolder) {

# iterate on sources
for (sourceFolder in sourceFolders) {
historicalIndex <- AresIndexer::buildDataQualityHistoryIndex(sourceFolder)
historicalIndex <- buildDataQualityHistoryIndex(sourceFolder)
historicalFile <- file.path(sourceFolder, "data-quality-index.json")
write(jsonlite::toJSON(historicalIndex),historicalFile)

Expand All @@ -51,20 +51,30 @@ buildDataQualityIndex <- function(sourceFolders, outputFolder) {
# process each data quality result file
if (file.exists(dataQualityResultsFile)) {
dataQualityResults <- jsonlite::fromJSON(dataQualityResultsFile)
results <- dataQualityResults$CheckResults
results <- dataQualityResults$CheckResults %>%
dplyr::rename_with(SqlRender::camelCaseToSnakeCase) %>% dplyr::rename_with(toupper)

metadata <- dataQualityResults$Metadata %>%
dplyr::rename_with(SqlRender::camelCaseToSnakeCase) %>% dplyr::rename_with(toupper)

# for each release, generate a summary of failures by cdm_table_name
domainAggregates <- results %>% filter(FAILED==1) %>% count(tolower(CDM_TABLE_NAME))
names(domainAggregates) <- c("cdm_table_name", "count_failed")
data.table::fwrite(domainAggregates, file.path(releaseFolder,"domain-issues.csv"))

# collect all failures from this result file for network analysis
sourceFailures <- results[results[,"FAILED"]==1,c("CHECK_NAME", "CHECK_LEVEL", "CDM_TABLE_NAME", "CATEGORY", "SUBCATEGORY", "CONTEXT", "CDM_FIELD_NAME", "CONCEPT_ID", "UNIT_CONCEPT_ID")]
sourceFailures$CDM_SOURCE_NAME <- dataQualityResults$Metadata$CDM_SOURCE_NAME
sourceFailures$CDM_SOURCE_ABBREVIATION <- dataQualityResults$Metadata$CDM_SOURCE_ABBREVIATION
sourceFailures$CDM_SOURCE_KEY <- gsub(" ","_",dataQualityResults$Metadata$CDM_SOURCE_ABBREVIATION)
sourceFailures$RELEASE_NAME <- format(lubridate::ymd(dataQualityResults$Metadata$CDM_RELEASE_DATE),"%Y-%m-%d")
sourceFailures$RELEASE_ID <- format(lubridate::ymd(dataQualityResults$Metadata$CDM_RELEASE_DATE),"%Y%m%d")
outColNames <- c("CHECK_NAME", "CHECK_LEVEL", "CDM_TABLE_NAME", "CATEGORY", "SUBCATEGORY", "CONTEXT", "CDM_FIELD_NAME", "CONCEPT_ID", "UNIT_CONCEPT_ID")
missingColNames <- setdiff(outColNames, names(results))
for (colName in missingColNames) {
writeLines(paste0("Expected column is missing in DQD results. Adding column with NA values: ", colName))
results[,colName] <- NA
}
sourceFailures <- results[results[,"FAILED"]==1,outColNames]
sourceFailures$CDM_SOURCE_NAME <- metadata$CDM_SOURCE_NAME
sourceFailures$CDM_SOURCE_ABBREVIATION <- metadata$CDM_SOURCE_ABBREVIATION
sourceFailures$CDM_SOURCE_KEY <- gsub(" ", "_", metadata$CDM_SOURCE_ABBREVIATION)
sourceFailures$RELEASE_NAME <- format(lubridate::ymd(metadata$CDM_RELEASE_DATE),"%Y-%m-%d")
sourceFailures$RELEASE_ID <- format(lubridate::ymd(metadata$CDM_RELEASE_DATE),"%Y%m%d")
networkIndex <- rbind(networkIndex, sourceFailures)
} else {
writeLines(paste("missing data quality result file ",dataQualityResultsFile))
Expand Down
5 changes: 4 additions & 1 deletion R/BuildExportQueryIndex.R
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,8 @@ buildExportQueryIndex <-

quality_completeness <- "export/quality/sqlCompletenessTable.sql"

temporal_characterization <- "temporal/achilles_temporal_data.sql"

sqlFilesIndex <- list(
"PROCEDURE_OCCURRENCE" = procedure_occurrence,
"PERSON" = person,
Expand All @@ -203,7 +205,8 @@ buildExportQueryIndex <-
"MEASUREMENT" = measurement,
"DOMAIN_SUMMARY" = domain_summary,
"DATA_DENSITY" = data_density,
"QUALITY_COMPLETENESS" = quality_completeness
"QUALITY_COMPLETENESS" = quality_completeness,
"TEMPORAL_CHARACTERIZATION" = temporal_characterization

)
jsonOutput = jsonlite::toJSON(sqlFilesIndex)
Expand Down
21 changes: 12 additions & 9 deletions R/BuildNetworkIndex.R
Original file line number Diff line number Diff line change
Expand Up @@ -94,20 +94,23 @@ buildNetworkIndex <- function(sourceFolders, outputFolder) {
writeLines(paste("missing observation period results file ", observationPeriodResultsFile))
}

source$cdm_source_name <- dataQualityResults$Metadata$CDM_SOURCE_NAME
source$cdm_source_abbreviation <- dataQualityResults$Metadata$CDM_SOURCE_ABBREVIATION
thisMetadata <- dataQualityResults$Metadata %>%
dplyr::rename_with(SqlRender::camelCaseToSnakeCase) %>% dplyr::rename_with(toupper)

source$cdm_source_name <- thisMetadata$CDM_SOURCE_NAME
source$cdm_source_abbreviation <- thisMetadata$CDM_SOURCE_ABBREVIATION
source$cdm_source_key <- gsub(" ", "_", source$cdm_source_abbreviation)
source$cdm_holder <- dataQualityResults$Metadata$CDM_HOLDER
source$source_description <- dataQualityResults$Metadata$SOURCE_DESCRIPTION
source$cdm_holder <- thisMetadata$CDM_HOLDER
source$source_description <- thisMetadata$SOURCE_DESCRIPTION

source$releases <- rbind(
source$releases,
list(
release_name = format(lubridate::ymd(dataQualityResults$Metadata$CDM_RELEASE_DATE),"%Y-%m-%d"),
release_id = format(lubridate::ymd(dataQualityResults$Metadata$CDM_RELEASE_DATE),"%Y%m%d"),
cdm_version = dataQualityResults$Metadata$CDM_VERSION,
vocabulary_version = dataQualityResults$Metadata$VOCABULARY_VERSION,
dqd_version = dataQualityResults$Metadata$DQD_VERSION,
release_name = format(lubridate::ymd(thisMetadata$CDM_RELEASE_DATE),"%Y-%m-%d"),
release_id = format(lubridate::ymd(thisMetadata$CDM_RELEASE_DATE),"%Y%m%d"),
cdm_version = thisMetadata$CDM_VERSION,
vocabulary_version = thisMetadata$VOCABULARY_VERSION,
dqd_version = thisMetadata$DQD_VERSION,
count_data_quality_issues = dataQualityResults$Overview$countOverallFailed,
count_data_quality_checks = dataQualityResults$Overview$countTotal,
dqd_execution_date = format(lubridate::ymd_hms(dataQualityResults$endTimestamp),"%Y-%m-%d"),
Expand Down
24 changes: 12 additions & 12 deletions R/BuildNetworkPerformanceIndex.R
Original file line number Diff line number Diff line change
Expand Up @@ -40,17 +40,15 @@ buildNetworkPerformanceIndex <-
analysisDetails <- dplyr::select(Achilles::getAnalysisDetails(), c("ANALYSIS_ID", "CATEGORY")) %>%
rename(TASK = ANALYSIS_ID)
releaseFolders <- list.dirs(sourceFolder, recursive = F)
if (length(releaseFolders) > 0) {
# iterate through release folders
for(releaseFolder in releaseFolders) {
latestRelease <- max(releaseFolders)

dataQualityResultsFile <- file.path(releaseFolder, "dq-result.json")
dataQualityResultsFile <- file.path(latestRelease, "dq-result.json")
dataQualityResultsFileExists <- file.exists(dataQualityResultsFile)
if (FALSE == dataQualityResultsFileExists) {
writeLines(paste("missing data quality result file: ",dataQualityResultsFile))
}

achillesPerformanceFile <- file.path(releaseFolder, "achilles-performance.csv")
achillesPerformanceFile <- file.path(latestRelease, "achilles-performance.csv")
achillesPerformanceFileExists <- file.exists(achillesPerformanceFile)
if (FALSE == achillesPerformanceFileExists) {
writeLines(paste("missing achilles performance file: ",achillesPerformanceFile))
Expand All @@ -63,23 +61,25 @@ buildNetworkPerformanceIndex <-
performanceData <- read.csv(achillesPerformanceFile)

performanceTable <- dplyr::select(performanceData, c("analysis_id", "elapsed_seconds")) %>%
rename(TASK = analysis_id, TIMING = elapsed_seconds) %>% mutate(PACKAGE = "achilles")
rename(TASK = analysis_id, TIMING = elapsed_seconds) %>% mutate(PACKAGE = "ACHILLES")

performanceTable <- merge(x=performanceTable,y=analysisDetails,by="TASK",all.x=TRUE)

dqdTable <- dplyr::select(dqdData, c("CheckResults.checkId", "CheckResults.EXECUTION_TIME", "CheckResults.CATEGORY")) %>%
rename(TASK = CheckResults.checkId, TIMING = CheckResults.EXECUTION_TIME, CATEGORY = CheckResults.CATEGORY) %>% mutate(PACKAGE = "dqd") %>%
mutate_at("TIMING", str_replace, " secs", "")
dqdTable <- dqdData %>%
dplyr::select(TASK = CheckResults.checkId,
TIMING = CheckResults.executionTime,
CATEGORY = CheckResults.category) %>%
dplyr::mutate(PACKAGE = "DQD") %>%
dplyr::mutate_at("TIMING", str_replace, " secs", "")

mergedTable <- rbind(performanceTable, dqdTable)

mergedTable <- mergedTable %>%
mutate(SOURCE = basename(sourceFolder), RELEASE = basename(releaseFolder))
mutate(SOURCE = basename(sourceFolder))

networkIndex <- rbind(networkIndex, mergedTable)
}
}
}


return(networkIndex)
}
163 changes: 163 additions & 0 deletions R/ComputeCharacterizationDifference.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
# @file ComputeCharacterizationDifference
#
#
# Copyright 2022 Observational Health Data Sciences and Informatics
#
# This file is part of AresIndexer
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#' Generate characterization difference results.
#'
#' @name ComputeCharacterizationDifference
#'
#' @details Computes characterization difference reports for successive releases
#' of each data source. For every pair of consecutive release folders, the
#' domain summary files are compared and the newer release's files are
#' rewritten with difference columns appended.
#'
#' @param sourceFolders Vector of paths to source folders; each source folder
#' is expected to contain one sub-folder per release.
#'
#' @return No return value (invisibly \code{NULL}); called for its side effect
#' of writing augmented domain-summary CSV files into each newer release folder.
#'
#' @import jsonlite
#' @import dplyr
#' @import stringr
#' @import data.table
#'
#' @export
computeCharacterizationDifference <-
  function(sourceFolders) {

    # Domains whose summary files are compared between consecutive releases.
    # Hoisted out of the release loop: the set never changes per iteration.
    domains <-
      c(
        "drug_exposure",
        "drug_era",
        "condition_occurrence",
        "condition_era",
        "observation",
        "measurement",
        "device_exposure",
        "procedure_occurrence",
        "visit_occurrence",
        "visit_detail"
      )

    # Limit the columns we read to ensure we're performing clean calculations
    # for diff values. Shared by both releases of every comparison.
    summaryColumns <- c(
      "CONCEPT_ID",
      "CONCEPT_NAME",
      "NUM_PERSONS",
      "PERCENT_PERSONS",
      "RECORDS_PER_PERSON",
      "PERCENT_PERSONS_NTILE",
      "RECORDS_PER_PERSON_NTILE"
    )

    # Names of the difference columns added by this function.
    diffColumns <- c(
      "DIFF_NUM_PERSONS",
      "DIFF_PERCENT_PERSONS",
      "DIFF_RECORDS_PER_PERSON"
    )

    for (sourceFolder in sourceFolders) {
      releaseFolders <- list.dirs(sourceFolder, recursive = F)
      releaseCount <- length(releaseFolders)
      # Sort ascending so consecutive pairs are (older, newer).
      releaseFolders <- sort(releaseFolders, decreasing = F)

      # Iterate through consecutive release pairs (base, next).
      baseReleaseIndex <- 1
      while (baseReleaseIndex + 1 <= releaseCount) {
        baseReleaseFolder <- releaseFolders[baseReleaseIndex]
        nextReleaseFolder <- releaseFolders[baseReleaseIndex + 1]

        # iterate through domains
        for (domain in domains) {
          writeLines(paste(
            "Comparing",
            baseReleaseFolder,
            nextReleaseFolder,
            domain
          ))
          domainFilename <-
            paste0("domain-summary-", domain, ".csv")
          baseDomainFile <-
            file.path(baseReleaseFolder, domainFilename)
          nextDomainFile <-
            file.path(nextReleaseFolder, domainFilename)

          if (file.exists(baseDomainFile) &&
              file.exists(nextDomainFile)) {

            baseData <- read.csv(baseDomainFile)[, summaryColumns]
            nextData <- read.csv(nextDomainFile)[, summaryColumns]

            # Keep every concept present in the newer release; numeric values
            # missing on the older side are treated as 0 for differencing.
            fullData <-
              dplyr::right_join(baseData,
                                nextData,
                                by = "CONCEPT_ID",
                                na_matches = "never") %>%
              mutate_if(is.numeric, coalesce, 0)

            # Join suffixes: .x = base (older) release, .y = next (newer).
            mutatedFullData <- fullData %>% mutate(
              DIFF_NUM_PERSONS = NUM_PERSONS.y - NUM_PERSONS.x,
              DIFF_PERCENT_PERSONS = format(
                round(PERCENT_PERSONS.y - PERCENT_PERSONS.x, 4),
                nsmall = 4
              ),
              DIFF_RECORDS_PER_PERSON = format(
                round(RECORDS_PER_PERSON.y - RECORDS_PER_PERSON.x,
                      1),
                nsmall = 1
              )
            )

            # Keep the newer release's values (".y" columns; the join key
            # CONCEPT_ID carries no suffix) plus the new diff columns, then
            # restore the plain column names.
            mutatedLimitedColumns <- mutatedFullData[, c(
              "CONCEPT_ID",
              paste0(summaryColumns[-1], ".y"),
              diffColumns
            )]
            colnames(mutatedLimitedColumns) <-
              c(summaryColumns, diffColumns)

            mutatedLimitedColumns$PERCENT_PERSONS <-
              format(round(mutatedLimitedColumns$PERCENT_PERSONS, 4), nsmall = 4)

            # write out augmented summary file
            # NOTE(review): the output name equals the input name, so this
            # overwrites the newer release's domain-summary file in place with
            # the augmented version — confirm in-place augmentation is intended.
            differenceFilename <-
              paste0("domain-summary-", domain, ".csv")
            data.table::fwrite(mutatedLimitedColumns,
                               file.path(nextReleaseFolder, differenceFilename))

          } # end if check for domain file existence
        } # end domain loop

        baseReleaseIndex <- baseReleaseIndex + 1
      } # end while loop
    } # end source loop

    invisible(NULL)
  }
21 changes: 21 additions & 0 deletions man/ComputeCharacterizationDifference.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.