Be explicit about the datatypes of each column in csv files #68

Open · wants to merge 3 commits into develop
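For context, the diff below builds readr's compact column-type strings from the CDM field-level specification: one letter per column, in column order (i = integer, D = date, T = datetime, d = double, c = character). A minimal sketch of the notation, with a hypothetical file name:

library(readr)

# Read a five-column table as integer, integer, date, date, integer.
# "iiDDi" fixes every column's type up front instead of letting readr guess.
tbl <- read_csv("observation_period.csv", col_types = "iiDDi")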
2 changes: 1 addition & 1 deletion .github/workflows/R_CMD_check_Hades.yaml
@@ -102,7 +102,7 @@ jobs:

      - name: Upload source package
        if: success() && runner.os == 'macOS' && github.event_name != 'pull_request' && github.ref == 'refs/heads/main'
-       uses: actions/upload-artifact@v2
+       uses: actions/upload-artifact@v4
        with:
          name: package_tarball
          path: check/*.tar.gz
44 changes: 40 additions & 4 deletions R/EunomiaData.R
@@ -126,6 +126,9 @@ loadDataFiles <- function(dataPath,
  stopifnot(dbms == "sqlite" || dbms == "duckdb")
  stopifnot(is.character(dataPath), length(dataPath) == 1, nchar(dataPath) > 0)
  stopifnot(is.character(dbPath), length(dbPath) == 1, nchar(dbPath) > 0)
+ stopifnot(cdmVersion == "5.3" || cdmVersion == "5.4")
+ stopifnot(isTRUE(verbose) || isFALSE(verbose))
+ stopifnot(isTRUE(overwrite) || isFALSE(overwrite))

  dataFiles <- sort(list.files(path = dataPath, pattern = paste("*",inputFormat,sep=".")))
  if (length(dataFiles) <= 0) {
@@ -191,26 +194,59 @@ loadDataFiles <- function(dataPath,
    }
  }

+ # Get the readr short notation for each column type. Use these when we read in csv data.
+ spec <- readr::read_csv(
+   system.file("csv", paste0("OMOP_CDMv", cdmVersion, "_Field_Level.csv"), package = "CommonDataModel"),
+   col_types = "ccc",
+   col_select = c("cdmTableName", "cdmFieldName", "cdmDatatype")
+ )
+
+ spec$readrTypes <- vapply(tolower(spec$cdmDatatype), switch, FUN.VALUE = character(1L),
+   "integer" = "i",
+   "date" = "D",
+   "datetime" = "T",
+   "float" = "d",
+   "c" # otherwise use character
+ )

  for (i in 1:length(dataFiles)) {
    dataFile <- dataFiles[i]
+   tableName <- tools::file_path_sans_ext(tolower(dataFile))

    if (verbose) {
      dataFileMessage <- paste("loading file: ", dataFile)
      message(dataFileMessage, appendLF = TRUE)
    }

+   # The GiBleed condition_occurrence csv file has a column ordering that does
+   # not match the spec, so for now we ignore column types for that file.
    if (inputFormat == "csv") {
+     if (tableName %in% unique(spec$cdmTableName) && tableName != "condition_occurrence") {
+       # The GiBleed dataset also has a cohort_attribute table that is not in the cdm spec csv file.
+       # For tables not in the cdm spec we fall back to readr's type guessing.
+       colTypes <- paste(spec[spec$cdmTableName == tableName,]$readrTypes, collapse = "")
+     } else {
+       colTypes <- NULL
+     }

      tableData <- readr::read_csv(
        file = file.path(dataPath, dataFiles[i]),
-       show_col_types = FALSE
+       show_col_types = FALSE,
+       col_types = colTypes
      )

+     if (nrow(readr::problems(tableData)) > 0) {
+       message(paste("Problems reading data types in csv file", dataFile))
+       print(readr::problems(tableData), n = 1e6)
+     }

    } else if (inputFormat == "parquet") {
      tableData <- arrow::read_parquet(
        file = file.path(dataPath, dataFiles[i])
      )
    }

    names(tableData) <- tolower(names(tableData))
-   tableName <- tools::file_path_sans_ext(tolower(dataFiles[i]))

    if (dbms == "sqlite") {
      for (j in seq_len(ncol(tableData))) {

@@ -225,10 +261,10 @@
    }

    if (verbose) {
-     message("saving table: ",tableName," (rows: ", nrow(tableData), ")",appendLF = TRUE)
+     message("saving table: ", tableName, " (rows: ", nrow(tableData), ")", appendLF = TRUE)
    }

-   DBI::dbWriteTable(conn = connection, name = tableName, value = tableData, append=TRUE)
+   DBI::dbWriteTable(conn = connection, name = tableName, value = tableData, append = TRUE)
  }
}

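For illustration (not part of the PR): with an explicit col_types string, values that do not parse under the declared type become NA and are recorded by readr, which the new readr::problems() check then reports. A minimal sketch using inline data and made-up column names:

library(readr)

# Declare integer, datetime, integer ("iTi"); the second birth_datetime
# value cannot be parsed as a datetime and will be flagged.
csv <- I("person_id,birth_datetime,year_of_birth\n1,2000-01-01 00:00:00,2000\n2,not-a-date,2001\n")
tbl <- read_csv(csv, col_types = "iTi")

# Mirrors the check added in loadDataFiles(): a non-empty problems()
# tibble means the file did not match the declared column types.
if (nrow(problems(tbl)) > 0) {
  print(problems(tbl))
}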