0.7.0
fabkury committed Jan 14, 2023
1 parent b3ddcf5 commit 71b7a7e
Showing 12 changed files with 861 additions and 698 deletions.
10 changes: 5 additions & 5 deletions DESCRIPTION
@@ -1,10 +1,10 @@
Package: phea
Title: Phenotyping Algebra
Version: 0.6.4.0000
Version: 0.7.0.0000
Authors@R:
person("Fabrício", "Kury", , "[email protected]", role = c("aut", "cre"),
comment = c(ORCID = "YOUR-ORCID-ID"))
Description: Provides a formula-based framework for identifying patients in time-stamped data in SQL databases.
Description: SQL query builder (based on dbplyr) that creates queries that calculate formulas using patient data as variables.
License: MIT + file LICENSE
Encoding: UTF-8
Roxygen: list(markdown = TRUE)
@@ -16,9 +16,9 @@ Imports:
stringr,
purrr,
rlang,
tidyr,
plotly
tidyr
Suggests:
knitr,
rmarkdown
rmarkdown,
plotly
VignetteBuilder: knitr
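
The updated Description positions phea as a dbplyr-based SQL query builder. Below is a minimal sketch of the mechanism that description refers to, using plain dplyr/dbplyr rather than phea's own API; the table, column names, and values are invented, and RSQLite is assumed for the in-memory example.

library(dplyr)
library(dbplyr)

# Toy lazy table standing in for time-stamped patient data.
patients <- memdb_frame(
  pid       = c(1, 1, 2),
  ts        = c("2020-01-01", "2020-06-01", "2020-03-15"),
  weight_kg = c(80, 78, 95),
  height_m  = c(1.75, 1.75, 1.80)
)

# A formula supplied as SQL text via dplyr::sql() is evaluated by the database,
# not in R: the pipeline stays a single query until collect() is called.
bmi_query <- patients |>
  mutate(bmi = sql("weight_kg / (height_m * height_m)"))

show_query(bmi_query)   # one SELECT layer that computes the formula
collect(bmi_query)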
193 changes: 69 additions & 124 deletions R/calculate_formula.R
@@ -123,9 +123,6 @@ calculate_formula <- function(components, fml = NULL, window = NULL, export = NU
# , pick = !(is.null(component$pick_by) || is.na(component$pick_by) || component$pick_by == '')
# , pick_by = component$pick_by)

# if(filtering_dates)
# res <- mutate(res, date_out = comp_name %in% dates_from)

return(res)
}) |>
dplyr::bind_rows()
@@ -234,7 +231,7 @@ calculate_formula <- function(components, fml = NULL, window = NULL, export = NU

# Add extra dates -------------------------------------------------------------------------------------------------
if(!is.null(dates)) {
message('Warning: dates is yet to be properly tested.')
message('Warning: `dates` is yet to be properly tested.')
dates_table <- dbplyr::copy_inline(.pheaglobalenv$con, dates)
board <- dplyr::union_all(board, dates_table)
}
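
# Standalone sketch of the dbplyr::copy_inline() pattern used just above: a small
# data frame is embedded directly in the generated SQL (typically as a VALUES
# subquery) instead of being written to a temporary table, so it can be
# union_all()'d onto the board like any other lazy table. The values and the
# connection below are invented; RSQLite is assumed.
library(DBI)
library(dplyr)
library(dbplyr)

con_sketch <- dbConnect(RSQLite::SQLite(), ":memory:")
extra_dates <- data.frame(pid = c(1, 2), ts = c(20210101, 20210201))

copy_inline(con_sketch, extra_dates) |> show_query()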
@@ -245,115 +242,61 @@ calculate_formula <- function(components, fml = NULL, window = NULL, export = NU
~rlang::exprs(!!..1 := !!dplyr::sql(..2))) |>
unique() |>
unlist(recursive = FALSE)
# The unique() above is just in case, but is it needed? Seems like the only way there could be duplicates is if the
# same component gets added twice to the call to calculate_formula().

## Second, apply commands.
phea_row_id_sql_txt <- paste0('row_number() over (order by ', dbQuoteId('pid'), ', ', dbQuoteId('ts'), ')')

if(F) {
# if(any(var_map$pick)) {
# var_map_picks <- var_map[var_map$pick,]
#
# picks <- purrr::map(seq(nrow(var_map_picks)), \(i) {
# phea_col_name <- var_map_picks$composed_name[i]
# orig_col_name <- var_map_picks$column[i]
# component_name <- var_map_picks$component_name[i]
# rec_name <- var_map_picks$rec_name[i]
# pick_by <- var_map_picks$pick_by[i]
# sql_txt <- paste0('case when ',
# dbQuoteId('name'), ' = ', dbQuoteStr(rec_name), ' and ',
# dbQuoteId(pick_by), ' = ', dbQuoteId(paste0(component_name, '_', pick_by)),
# ' then ', dbQuoteId(orig_col_name), ' else NULL end')
# rlang::exprs(!!phea_col_name := !!dplyr::sql(sql_txt))
# }) |>
# unique() |>
# unlist(recursive = FALSE)
# # }
#
# # pick_sql <- list(
# # x = components,
# # y = names(components)) |>
# # purrr::pmap(\(x, y) {
# # if(x$pick) {
# # browser()
# # variables_to_hide <-
# # return(paste0('case when name != \'', x$rec_source$rec_name,
# # '\' then true else ', y, '_', x$pick_by, ' = ', x$pick_by, ' end'))
# # } else
# # return(NULL)
# # }) |>
# # purrr::discard(is.null) |>
# # paste0(collapse = ' and ')
#
# # if(pick_sql != '') {
# # Apply commands, filter, then drop unneeded columns.
# board <- dplyr::mutate(board,
# phea_row_id = dplyr::sql(phea_row_id_sql_txt),
# !!!commands)
#
# browser()
# board <- filter(board, sql(pick_sql))
#
# commands_names <- names(commands)
# if(filtering_dates) {
# board <- dplyr::select(board,
# phea_row_id, pid, ts, phea_date_out, all_of(commands_names))
# } else {
# board <- dplyr::transmute(board,
# phea_row_id, pid, ts, all_of(commands_names))
# }
# } else {
}
# Make phea_row_id
prid <- dbplyr::win_over(con = .pheaglobalenv$con,
expr = dplyr::sql('row_number()'), order = c('pid', 'ts'))

# Apply commands to the board all at once, so we only generate a single layer of "SELECT ... FROM (SELECT ...)".
if(filtering_dates) {
board <- dplyr::transmute(board,
phea_row_id = dplyr::sql(phea_row_id_sql_txt),
phea_row_id = prid,
pid, ts, name,
!!!commands)
} else {
board <- dplyr::transmute(board,
phea_row_id = dplyr::sql(phea_row_id_sql_txt),
phea_row_id = prid,
pid, ts,
!!!commands)
}
# }
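
# Standalone sketch of what the `prid` expression above renders to: a
# row_number() window ordered by patient id and timestamp. simulate_postgres()
# is a translation-only connection, used here only to show the generated SQL.
library(dplyr)
library(dbplyr)

win_over(
  expr  = sql("row_number()"),
  order = c("pid", "ts"),
  con   = simulate_postgres()
)
# roughly: row_number() OVER (ORDER BY "pid", "ts")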

# browser()
## Third and final, fill the blanks downward with the last non-blank value, within the patient.
# board <- board |>
# dbplyr::window_order(pid, ts) |>
# dplyr::group_by(pid) |>
# tidyr::fill(!any_of(c('phea_row_id', 'pid', 'ts', 'name'))) |>
# ungroup()

# For some reason, apparently a bug in dbplyr's SQL translation, we need to "erase" an ORDER BY "pid", "ts" that is
# left over in the translated query. That ORDER BY persists even if you posteriorly do a dplyr::group_by() on the
# result of the phenotype (i.e. the board at this point). This causes the SQL server's query engine to raise an error,
# saying that "ts" must also be part of the GROUP BY. This left over ORDER BY "pid", "ts" apparently comes from the
# dbplyr::window_order() call that was necessary to guarantee the intended behavior of the call to
# tidyr::fill.lazy_tbl() above.
# board <- board |>
# arrange()

if(.pheaglobalenv$compatibility_mode) {
## Fill the blanks downward with the last non-blank value, within the patient.
board <- board |>
dbplyr::window_order(pid, ts) |>
dplyr::group_by(pid) |>
tidyr::fill(!any_of(c('phea_row_id', 'pid', 'ts', 'name'))) |>
ungroup()

# For some reason, apparently a bug in dbplyr's SQL translation, we need to "erase" an ORDER BY "pid", "ts" that is
# left over in the translated query. That ORDER BY persists even if you later call dplyr::group_by() on the
# result of the phenotype (i.e. the board at this point). This causes the SQL server's query engine to raise an
# error, saying that "ts" must also be part of the GROUP BY. This leftover ORDER BY "pid", "ts" apparently comes
# from the dbplyr::window_order() call that was necessary to guarantee the intended behavior of the call to
# tidyr::fill.lazy_tbl() above.
board <- board |>
arrange()
}
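
# Standalone sketch of the fill-downward step above: within each patient,
# missing component values are carried forward from the last non-missing row.
# Table, columns, and values are invented; RSQLite is assumed, and the exact
# SQL translation of tidyr::fill() varies by backend.
library(dplyr)
library(dbplyr)
library(tidyr)

board_sketch <- memdb_frame(
  pid       = c(1, 1, 1),
  ts        = c(1, 2, 3),
  weight_kg = c(80, NA, 78),
  height_m  = c(NA, 1.75, NA)
)

board_sketch |>
  window_order(pid, ts) |>
  group_by(pid) |>
  fill(weight_kg, height_m) |>   # last non-missing value carried forward, per patient
  ungroup() |>
  show_query()
# conceptually: the ts = 2 row gets weight_kg = 80, and the ts = 3 row keeps height_m = 1.75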

# dates_from ------------------------------------------------------------------------------------------------------
if(filtering_dates) {
# Obtain `rec_name`s from the record sources of the target components.
rec_names <- unique(var_map[var_map$component_name %in% dates_from,]$rec_name)

sql_txt <- paste0(dbQuoteId('name'), ' in (', paste0(dbQuoteStr(rec_names), collapse = ', '), ')')

sql_txt <- paste0(dbQuoteId('name'), ' in (', paste0(rec_names, collapse = ', '), ')')

# Keep only the rows coming from those `rec_name`s.
board <- board |>
filter(sql(sql_txt))
dplyr::filter(dplyr::sql(sql_txt))
}

# Compute window --------------------------------------------------------------------------------------------------
window_components <- setdiff(var_map$component_name, out_window)
if(!input_is_phenotype && length(window_components) > 1) { # Window only makes sense if there is > 1 component.
window_components_sql <- window_components |>
unique() |>
paste0('_ts') |>
window_components_sql <- paste0(unique(window_components), '_ts') |>
DBI::dbQuoteIdentifier(conn = .pheaglobalenv$con) |>
paste0(collapse = ', ')

@@ -367,18 +310,27 @@ calculate_formula <- function(components, fml = NULL, window = NULL, export = NU
sql_ts_greatest <- sql_ts_least
}

# phea_ts_row is used to pick the best computation within each date. This is for the case when multiple data points
# exist on the same date. The best computation for each date is the last row within that date.
# The most complete computation is the last one in each timestamp. 'max(phea_row_id) over (partition by "pid", "ts")'
# finds the row with the largest (most complete) phea_row_id in each timestamp. last_value() could give the same
# result, and could be potentially faster (wild assumption) due to optimizations, but that's just an idea.
sql_txt <- paste0('MAX(', dbQuoteId('phea_row_id'), ') OVER (PARTITION BY ',
dbQuoteId('pid'), ', ', dbQuoteId('ts'), ')')

board <- board |>
dplyr::mutate(
window = dplyr::sql(sql_ts_greatest) - dplyr::sql(sql_ts_least),
phea_ts_row = dplyr::sql(sql_txt))
if(.pheaglobalenv$compatibility_mode) {
# phea_ts_row is used to pick the best computation within each date. This is for the case when multiple data points
# exist on the same date. The best computation for each date is the last row within that date.
# The most complete computation is the last one in each timestamp. 'max(phea_row_id) over (partition by "pid",
# "ts")' finds the row with the largest (most complete) phea_row_id in each timestamp. last_value() could give the
# same result, and could be potentially faster (wild assumption) due to optimizations, but that's just an idea.

# Make phea_ts_row
ptsr_txt <- paste0('max(', dbQuoteId('phea_row_id'), ')')

board <- board |>
dplyr::mutate(
window = dplyr::sql(sql_ts_greatest) - dplyr::sql(sql_ts_least),
phea_ts_row = dbplyr::win_over(con = .pheaglobalenv$con,
expr = dplyr::sql(ptsr_txt),
partition = c('pid', 'ts')))
} else {
board <- board |>
dplyr::mutate(
window = dplyr::sql(sql_ts_greatest) - dplyr::sql(sql_ts_least))
}
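
# Standalone sketch of the phea_ts_row expression above: within each (pid, ts)
# partition, the largest phea_row_id marks the most complete computation for
# that timestamp. simulate_postgres() is translation-only.
library(dplyr)
library(dbplyr)

win_over(
  expr      = sql('max("phea_row_id")'),
  partition = c("pid", "ts"),
  con       = simulate_postgres()
)
# roughly: max("phea_row_id") OVER (PARTITION BY "pid", "ts")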

# Filter rows -----------------------------------------------------------------------------------------------------
# We also need to:
@@ -392,51 +344,45 @@ calculate_formula <- function(components, fml = NULL, window = NULL, export = NU
# is FALSE.
required_components <- setdiff(names(components), dont_require)
if(length(required_components) > 0) {
sql_txt <- required_components |>
paste0('_ts') |>
sql_txt <- paste0(required_components, '_ts') |>
DBI::dbQuoteIdentifier(conn = .pheaglobalenv$con) |>
paste0(' is not null') |>
paste(collapse = ' and ')
paste0(' is not null', collapse = ' and ')

if(has_content(window)) {
board <- dplyr::filter(board,
phea_row_id == phea_ts_row &&
dplyr::sql(sql_txt) &&
window < local(window))
window < local(window) &&
dplyr::sql(sql_txt))
} else {
board <- dplyr::filter(board,
phea_row_id == phea_ts_row &&
dplyr::sql(sql_txt))
dplyr::sql(sql_txt))
}
} else {
# No required components after all, because all were excluded by dont_require. Let's just filter by the most
# complete computation.
if(has_content(window)) {
board <- dplyr::filter(board,
phea_row_id == phea_ts_row &&
window < local(window))
} else {
board <- dplyr::filter(board,
phea_row_id == phea_ts_row)
window < local(window))
}
}
} else {
# No need to require all components. Let's just filter by the most complete computation.
if(has_content(window)) { # This covers the case where `window` is NULL
board <- board |>
dplyr::filter(phea_row_id == phea_ts_row &&
window < local(window))
} else {
board <- board |>
dplyr::filter(phea_row_id == phea_ts_row)
dplyr::filter(window < local(window))
}
}
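
# Standalone sketch of the required-components predicate built above: every
# required component must have a non-null <component>_ts column. The component
# names are invented; DBI::ANSI() stands in for the real connection.
required_sketch <- c("diagnosis", "weight")

paste0(required_sketch, "_ts") |>
  DBI::dbQuoteIdentifier(conn = DBI::ANSI()) |>
  paste0(" is not null", collapse = " and ")
# yields: "diagnosis_ts" is not null and "weight_ts" is not null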

if(.pheaglobalenv$compatibility_mode) {
# Keep only most complete computation
board <- dplyr::filter(board,
phea_row_id == phea_ts_row)
}

# Apply filters, if provided.
if(!is.null(filters) && any(!is.na(filters))) {
sql_txt <- paste0('(', paste0(filters[!is.na(filters)], collapse = ') AND ('), ')')
board <- board |>
filter(sql(sql_txt))
dplyr::filter(dplyr::sql(sql_txt))
}

# Limit number of output rows, if requested.
@@ -452,6 +398,7 @@ calculate_formula <- function(components, fml = NULL, window = NULL, export = NU
# Calculate the formulas, if any.
res_vars <- NULL
if(!is.null(fml)) {
# is cascaded on?
if(cascaded) {
# Compute one at a time, so that the prior result can be used in the next formula.
for(i in seq(fml)) {
@@ -482,16 +429,14 @@ calculate_formula <- function(components, fml = NULL, window = NULL, export = NU
# Get the name from the parent object, fml.
res_vars <- c(res_vars, names(fml)[i])

# The formula is the SQL.
# sql_txt <- cur_fml

# Apply to the board, producing a layer of SELECT ... FROM (SELECT ...).
# Apply to the board, producing a layer of SELECT ... FROM (SELECT ...). The formula is the SQL.
board <- dplyr::mutate(board,
!!rlang::sym(names(fml)[i]) := dplyr::sql(cur_fml))
}
}
} else {
# cascaded is turned off.

# Let's check if any of the formulas is itself a list, which means cascaded was supposed to be on.
if(any(lapply(fml, class) == 'list'))
stop('Nested formulas require cascaded = TRUE.')
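
# Hypothetical sketch of what cascaded = TRUE enables (the column names and
# formulas below are invented, and the exact accepted shapes of `fml` are
# defined by phea's documentation, not here): formulas are computed one layer
# at a time, so a later formula can refer to an earlier result by name.
fml_sketch <- list(
  bmi      = "weight_kg / (height_m * height_m)",
  bmi_high = "case when bmi >= 30 then 1 else 0 end"  # uses `bmi` from the previous layer
)
# board <- calculate_formula(components, fml = fml_sketch, cascaded = TRUE)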
