Skip to content

Commit

Permalink
Workaround for changed column names in DQD 2.1
Browse files: browse the repository at this point in the history
See issue OHDSI#30.  The fix is borrowed from
OHDSI#35.
  • Loading branch information
xitology committed Apr 26, 2023
1 parent 8882e02 commit 45dd051
Show file tree
Hide file tree
Showing 4 changed files with 25 additions and 16 deletions.
2 changes: 1 addition & 1 deletion R/AugmentConceptFiles.R
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ augmentConceptFiles <- function(releaseFolder) {
results <- dataQualityResults$CheckResults

# augment achilles concept files with data quality failure count for relevant concept checks
conceptAggregates <- results %>% filter(!is.na(results$CONCEPT_ID) && results$FAILED==1) %>% count(CONCEPT_ID,tolower(CDM_TABLE_NAME))
# Count failed concept-level checks per concept and CDM table.
# filter() needs an element-wise row mask, so the two conditions are combined
# with `&`. The scalar `&&` only looked at the first row on older R versions
# and is an error for operands longer than one since R 4.3.
conceptAggregates <- results %>%
  filter(!is.na(conceptId) & failed == 1) %>%
  count(conceptId, tolower(cdmTableName))
names(conceptAggregates) <- c("concept_id","cdm_table_name", "count_failed")
writeLines(paste0(nrow(conceptAggregates), " concept level data quality issues found."))
if (nrow(conceptAggregates) > 0) {
Expand Down
12 changes: 6 additions & 6 deletions R/BuildDataQualityHistoryIndex.R
Original file line number Diff line number Diff line change
Expand Up @@ -32,18 +32,18 @@ buildDataQualityHistoryIndex <-
stratified_index <- data.table::data.table()

addResultsToIndex <- function(json) {
cdm_source_name <- json$Metadata[1,"CDM_SOURCE_NAME"]
cdm_source_abbreviation <- json$Metadata[1,"CDM_SOURCE_ABBREVIATION"]
vocabulary_version <- json$Metadata[1,"VOCABULARY_VERSION"]
cdm_release_date <- format(lubridate::ymd(json$Metadata[1,"CDM_RELEASE_DATE"]),"%Y-%m-%d")
cdm_source_name <- json$Metadata[1,"cdmSourceName"]
cdm_source_abbreviation <- json$Metadata[1,"cdmSourceAbbreviation"]
vocabulary_version <- json$Metadata[1,"vocabularyVersion"]
cdm_release_date <- format(lubridate::ymd(json$Metadata[1,"cdmReleaseDate"]),"%Y-%m-%d")
count_passed <- as.numeric(json$Overview$countPassed)
count_failed <- as.numeric(json$Overview$countOverallFailed)
count_total <- count_passed + count_failed
dqd_execution_date <- format(lubridate::ymd_hms(json$endTimestamp),"%Y-%m-%d")

stratifiedAggregates <- json$CheckResults %>%
filter(FAILED==1) %>%
group_by(CATEGORY, toupper(CDM_TABLE_NAME)) %>%
filter(failed==1) %>%
group_by(category, toupper(cdmTableName)) %>%
summarise(count_value=n())
names(stratifiedAggregates) <- c("category", "cdm_table_name", "count_value")
stratifiedAggregates$dqd_execution_date <- dqd_execution_date
Expand Down
23 changes: 16 additions & 7 deletions R/BuildDataQualityIndex.R
Original file line number Diff line number Diff line change
Expand Up @@ -54,23 +54,32 @@ buildDataQualityIndex <- function(sourceFolders, outputFolder) {
results <- dataQualityResults$CheckResults

# for each release, generate a summary of failures by cdm_table_name
domainAggregates <- results %>% filter(FAILED==1) %>% count(tolower(CDM_TABLE_NAME))
domainAggregates <- results %>% filter(failed==1) %>% count(tolower(cdmTableName))
names(domainAggregates) <- c("cdm_table_name", "count_failed")
data.table::fwrite(domainAggregates, file.path(releaseFolder,"domain-issues.csv"))

# Collect all failures from this result file for network analysis.
# The DQD 2.1 results use camelCase column names, while the network index is
# still built from the legacy UPPER_SNAKE_CASE names, so copy each camelCase
# column onto its legacy name.
# Fix: the table-name source column is `cdmTableName`. The former
# `cmdTableName` typo evaluated to NULL, so CDM_TABLE_NAME was never copied
# and ended up filled with NA by the fallback loop below.
results$CHECK_NAME <- results$checkName
results$CHECK_LEVEL <- results$checkLevel
results$CDM_TABLE_NAME <- results$cdmTableName
results$CATEGORY <- results$category
results$SUBCATEGORY <- results$subcategory
results$CONTEXT <- results$context
results$CDM_FIELD_NAME <- results$cdmFieldName
results$CONCEPT_ID <- results$conceptId
results$UNIT_CONCEPT_ID <- results$unitConceptId
outColNames <- c("CHECK_NAME", "CHECK_LEVEL", "CDM_TABLE_NAME", "CATEGORY", "SUBCATEGORY", "CONTEXT", "CDM_FIELD_NAME", "CONCEPT_ID", "UNIT_CONCEPT_ID")
# Assigning NULL from an absent source column creates nothing, so any legacy
# column that is still missing is reported and added as NA to keep the
# output schema fixed.
missingColNames <- setdiff(outColNames, names(results))
for (colName in missingColNames) {
  writeLines(paste0("Expected column is missing in DQD results. Adding column with NA values: ", colName))
  results[,colName] <- NA
}
sourceFailures <- results[results[,"FAILED"]==1,outColNames]
sourceFailures$CDM_SOURCE_NAME <- dataQualityResults$Metadata$CDM_SOURCE_NAME
sourceFailures$CDM_SOURCE_ABBREVIATION <- dataQualityResults$Metadata$CDM_SOURCE_ABBREVIATION
sourceFailures$CDM_SOURCE_KEY <- gsub(" ","_",dataQualityResults$Metadata$CDM_SOURCE_ABBREVIATION)
sourceFailures$RELEASE_NAME <- format(lubridate::ymd(dataQualityResults$Metadata$CDM_RELEASE_DATE),"%Y-%m-%d")
sourceFailures$RELEASE_ID <- format(lubridate::ymd(dataQualityResults$Metadata$CDM_RELEASE_DATE),"%Y%m%d")
sourceFailures <- results[results[,"failed"]==1,outColNames]
sourceFailures$CDM_SOURCE_NAME <- dataQualityResults$Metadata$cdmSourceName
sourceFailures$CDM_SOURCE_ABBREVIATION <- dataQualityResults$Metadata$cdmSourceAbbreviation
sourceFailures$CDM_SOURCE_KEY <- gsub(" ","_",dataQualityResults$Metadata$cdmSourceAbbreviation)
sourceFailures$RELEASE_NAME <- format(lubridate::ymd(dataQualityResults$Metadata$cdmReleaseDate),"%Y-%m-%d")
sourceFailures$RELEASE_ID <- format(lubridate::ymd(dataQualityResults$Metadata$cdmReleaseDate),"%Y%m%d")
networkIndex <- rbind(networkIndex, sourceFailures)
} else {
writeLines(paste("missing data quality result file ",dataQualityResultsFile))
Expand Down
4 changes: 2 additions & 2 deletions R/BuildNetworkPerformanceIndex.R
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,8 @@ buildNetworkPerformanceIndex <-

performanceTable <- merge(x=performanceTable,y=analysisDetails,by="TASK",all.x=TRUE)

dqdTable <- dplyr::select(dqdData, c("CheckResults.checkId", "CheckResults.EXECUTION_TIME", "CheckResults.CATEGORY")) %>%
rename(TASK = CheckResults.checkId, TIMING = CheckResults.EXECUTION_TIME, CATEGORY = CheckResults.CATEGORY) %>% mutate(PACKAGE = "DQD") %>%
dqdTable <- dplyr::select(dqdData, c("CheckResults.checkId", "CheckResults.executionTime", "CheckResults.category")) %>%
rename(TASK = CheckResults.checkId, TIMING = CheckResults.executionTime, CATEGORY = CheckResults.category) %>% mutate(PACKAGE = "DQD") %>%
mutate_at("TIMING", str_replace, " secs", "")

mergedTable <- rbind(performanceTable, dqdTable)
Expand Down

0 comments on commit 45dd051

Please sign in to comment.