From be002d92eab4609e70bf2d76567dfa7007eec793 Mon Sep 17 00:00:00 2001 From: Adam Black Date: Wed, 18 Sep 2024 10:07:04 +0200 Subject: [PATCH 1/3] When reading in csv files explicitly specify the datatypes of each column. --- R/EunomiaData.R | 38 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/R/EunomiaData.R b/R/EunomiaData.R index 528b0e5..d6d5fb6 100644 --- a/R/EunomiaData.R +++ b/R/EunomiaData.R @@ -126,6 +126,9 @@ loadDataFiles <- function(dataPath, stopifnot(dbms == "sqlite" || dbms == "duckdb") stopifnot(is.character(dataPath), length(dataPath) == 1, nchar(dataPath) > 0) stopifnot(is.character(dbPath), length(dbPath) == 1, nchar(dbPath) > 0) + stopifnot(cdmVersion == "5.3" || cdmVersion == "5.4") + stopifnot(isTRUE(verbose) || isFALSE(verbose)) + stopifnot(isTRUE(overwrite) || isFALSE(overwrite)) dataFiles <- sort(list.files(path = dataPath, pattern = paste("*",inputFormat,sep="."))) if (length(dataFiles) <= 0) { @@ -191,18 +194,50 @@ loadDataFiles <- function(dataPath, } } + # Get the readr short notation for each column type. Use these when we read in csv data. 
+ spec <- readr::read_csv( + system.file("csv", paste0("OMOP_CDMv", cdmVersion, "_Field_Level.csv"), package = "CommonDataModel"), + col_types = "ccc", + col_select = c("cdmTableName", "cdmFieldName", "cdmDatatype") + ) + + spec$readrTypes <- vapply(tolower(spec$cdmDatatype), switch, FUN.VALUE = character(1L), + "integer" = "i", + "date" = "D", + "datetime" = "T", + "float" = "d", + "c" # otherwise use character + ) + for (i in 1:length(dataFiles)) { dataFile <- dataFiles[i] + tableName <- tools::file_path_sans_ext(tolower(dataFile)) + if (verbose) { dataFileMessage <- paste("loading file: ", dataFile) message(dataFileMessage, appendLF = TRUE) } if (inputFormat == "csv") { + if (tableName %in% unique(spec$cdmTableName)) { + # in the GiBleed dataset there is a cohort_attribute table which is not in the cdm spec csv file + # In the cases of tables not in the cdm spec we will use readr's guess for the R datatype + colTypes <- paste(spec[spec$cdmTableName == tableName,]$readrTypes, collapse = "") + } else { + colTypes <- NULL + } + tableData <- readr::read_csv( file = file.path(dataPath, dataFiles[i]), - show_col_types = FALSE + show_col_types = FALSE, + col_types = colTypes ) + + if (nrow(readr::problems(tableData)) > 0) { + message(paste("Problems with reading correct data types in csv file", dataFile)) + print(readr::problems(tableData), n = 1e6) + } + } else if (inputFormat == "parquet") { tableData <- arrow::read_parquet( file = file.path(dataPath, dataFiles[i]) @@ -210,7 +245,6 @@ loadDataFiles <- function(dataPath, } names(tableData) <- tolower(names(tableData)) - tableName <- tools::file_path_sans_ext(tolower(dataFiles[i])) if (dbms == "sqlite") { for (j in seq_len(ncol(tableData))) { From 29dc6dd608f4af15d8298fb3aac1bac81275cfba Mon Sep 17 00:00:00 2001 From: Adam Black Date: Wed, 18 Sep 2024 10:20:19 +0200 Subject: [PATCH 2/3] add workaround for gibleed condition_occurrence csv column ordering --- R/EunomiaData.R | 8 +++++--- 1 file changed, 5 insertions(+), 3 
deletions(-) diff --git a/R/EunomiaData.R b/R/EunomiaData.R index d6d5fb6..cb8c1eb 100644 --- a/R/EunomiaData.R +++ b/R/EunomiaData.R @@ -218,8 +218,10 @@ loadDataFiles <- function(dataPath, message(dataFileMessage, appendLF = TRUE) } + # The GiBleed condition occurrence csv file has a column ordering that does + # not match the spec so for now we ignore types in this file if (inputFormat == "csv") { - if (tableName %in% unique(spec$cdmTableName)) { + if (tableName %in% unique(spec$cdmTableName) && tableName != "condition_occurrence") { # in the GiBleed dataset there is a cohort_attribute table which is not in the cdm spec csv file # In the cases of tables not in the cdm spec we will use readr's guess for the R datatype colTypes <- paste(spec[spec$cdmTableName == tableName,]$readrTypes, collapse = "") @@ -259,10 +261,10 @@ loadDataFiles <- function(dataPath, } if (verbose) { - message("saving table: ",tableName," (rows: ", nrow(tableData), ")",appendLF = TRUE) + message("saving table: ", tableName," (rows: ", nrow(tableData), ")", appendLF = TRUE) } - DBI::dbWriteTable(conn = connection, name = tableName, value = tableData, append=TRUE) + DBI::dbWriteTable(conn = connection, name = tableName, value = tableData, append = TRUE) } } From 8a7b6aedea93c43ed08516195665021907c37a00 Mon Sep 17 00:00:00 2001 From: Adam Black Date: Wed, 18 Sep 2024 10:29:03 +0200 Subject: [PATCH 3/3] update version of upload-artifact in github actions workflow --- .github/workflows/R_CMD_check_Hades.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/R_CMD_check_Hades.yaml b/.github/workflows/R_CMD_check_Hades.yaml index e3ee30e..ca3184f 100644 --- a/.github/workflows/R_CMD_check_Hades.yaml +++ b/.github/workflows/R_CMD_check_Hades.yaml @@ -102,7 +102,7 @@ jobs: - name: Upload source package if: success() && runner.os == 'macOS' && github.event_name != 'pull_request' && github.ref == 'refs/heads/main' - uses: actions/upload-artifact@v2 + uses: 
actions/upload-artifact@v4 with: name: package_tarball path: check/*.tar.gz