From be002d92eab4609e70bf2d76567dfa7007eec793 Mon Sep 17 00:00:00 2001 From: Adam Black Date: Wed, 18 Sep 2024 10:07:04 +0200 Subject: [PATCH 1/3] When reading in csv files explicitly specify the datatypes of each column. --- R/EunomiaData.R | 38 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/R/EunomiaData.R b/R/EunomiaData.R index 528b0e5..d6d5fb6 100644 --- a/R/EunomiaData.R +++ b/R/EunomiaData.R @@ -126,6 +126,9 @@ loadDataFiles <- function(dataPath, stopifnot(dbms == "sqlite" || dbms == "duckdb") stopifnot(is.character(dataPath), length(dataPath) == 1, nchar(dataPath) > 0) stopifnot(is.character(dbPath), length(dbPath) == 1, nchar(dbPath) > 0) + stopifnot(cdmVersion == "5.3" || cdmVersion == "5.4") + stopifnot(isTRUE(verbose) || isFALSE(verbose)) + stopifnot(isTRUE(overwrite) || isFALSE(overwrite)) dataFiles <- sort(list.files(path = dataPath, pattern = paste("*",inputFormat,sep="."))) if (length(dataFiles) <= 0) { @@ -191,18 +194,50 @@ loadDataFiles <- function(dataPath, } } + # Get the readr short notation for each column type. Use these when we read in csv data. 
+ spec <- readr::read_csv( + system.file("csv", paste0("OMOP_CDMv", cdmVersion, "_Field_Level.csv"), package = "CommonDataModel"), + col_types = "ccc", + col_select = c("cdmTableName", "cdmFieldName", "cdmDatatype") + ) + + spec$readrTypes <- vapply(tolower(spec$cdmDatatype), switch, FUN.VALUE = character(1L), + "integer" = "i", + "date" = "D", + "datetime" = "T", + "float" = "d", + "c" # otherwise use character + ) + for (i in 1:length(dataFiles)) { dataFile <- dataFiles[i] + tableName <- tools::file_path_sans_ext(tolower(dataFile)) + if (verbose) { dataFileMessage <- paste("loading file: ", dataFile) message(dataFileMessage, appendLF = TRUE) } if (inputFormat == "csv") { + if (tableName %in% unique(spec$cdmTableName)) { + # in the GiBleed dataset there is a cohort_attribute table which is not in the cdm spec csv file + # In the cases of tables not in the cdm spec we will use readr's guess for the R datatype + colTypes <- paste(spec[spec$cdmTableName == tableName,]$readrTypes, collapse = "") + } else { + colTypes <- NULL + } + tableData <- readr::read_csv( file = file.path(dataPath, dataFiles[i]), - show_col_types = FALSE + show_col_types = FALSE, + col_types = colTypes ) + + if (nrow(readr::problems(tableData)) > 0) { + message(paste("Problems with reading correct data types in csv file", dataFile)) + print(readr::problems(tableData), n = 1e6) + } + } else if (inputFormat == "parquet") { tableData <- arrow::read_parquet( file = file.path(dataPath, dataFiles[i]) @@ -210,7 +245,6 @@ loadDataFiles <- function(dataPath, } names(tableData) <- tolower(names(tableData)) - tableName <- tools::file_path_sans_ext(tolower(dataFiles[i])) if (dbms == "sqlite") { for (j in seq_len(ncol(tableData))) { From 29dc6dd608f4af15d8298fb3aac1bac81275cfba Mon Sep 17 00:00:00 2001 From: Adam Black Date: Wed, 18 Sep 2024 10:20:19 +0200 Subject: [PATCH 2/3] add workaround for gibleed condition_occurrence csv column ordering --- R/EunomiaData.R | 8 +++++--- 1 file changed, 5 insertions(+), 3 
deletions(-) diff --git a/R/EunomiaData.R b/R/EunomiaData.R index d6d5fb6..cb8c1eb 100644 --- a/R/EunomiaData.R +++ b/R/EunomiaData.R @@ -218,8 +218,10 @@ loadDataFiles <- function(dataPath, message(dataFileMessage, appendLF = TRUE) } + # The GiBleed condition occurrence csv file has a column ordering that does + # not match the spec so for now we ignore types in this file if (inputFormat == "csv") { - if (tableName %in% unique(spec$cdmTableName)) { + if (tableName %in% unique(spec$cdmTableName) && tableName != "condition_occurrence") { # in the GiBleed dataset there is a cohort_attribute table which is not in the cdm spec csv file # In the cases of tables not in the cdm spec we will use readr's guess for the R datatype colTypes <- paste(spec[spec$cdmTableName == tableName,]$readrTypes, collapse = "") @@ -259,10 +261,10 @@ loadDataFiles <- function(dataPath, } if (verbose) { - message("saving table: ",tableName," (rows: ", nrow(tableData), ")",appendLF = TRUE) + message("saving table: ", tableName," (rows: ", nrow(tableData), ")", appendLF = TRUE) } - DBI::dbWriteTable(conn = connection, name = tableName, value = tableData, append=TRUE) + DBI::dbWriteTable(conn = connection, name = tableName, value = tableData, append = TRUE) } } From 8a7b6aedea93c43ed08516195665021907c37a00 Mon Sep 17 00:00:00 2001 From: Adam Black Date: Wed, 18 Sep 2024 10:29:03 +0200 Subject: [PATCH 3/3] update version of upload-artifact in github actions workflow --- .github/workflows/R_CMD_check_Hades.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/R_CMD_check_Hades.yaml b/.github/workflows/R_CMD_check_Hades.yaml index e3ee30e..ca3184f 100644 --- a/.github/workflows/R_CMD_check_Hades.yaml +++ b/.github/workflows/R_CMD_check_Hades.yaml @@ -102,7 +102,7 @@ jobs: - name: Upload source package if: success() && runner.os == 'macOS' && github.event_name != 'pull_request' && github.ref == 'refs/heads/main' - uses: actions/upload-artifact@v2 + uses: 
actions/upload-artifact@v4 with: name: package_tarball path: check/*.tar.gz