0.7.0
fabkury committed Jan 14, 2023
1 parent b3ddcf5 commit 71b7a7e
Showing 12 changed files with 861 additions and 698 deletions.
10 changes: 5 additions & 5 deletions DESCRIPTION
@@ -1,10 +1,10 @@
Package: phea
Title: Phenotyping Algebra
Version: 0.6.4.0000
Version: 0.7.0.0000
Authors@R:
person("Fabrício", "Kury", , "[email protected]", role = c("aut", "cre"),
comment = c(ORCID = "YOUR-ORCID-ID"))
Description: Provides a formula-based framework for identifying patients in time-stamped data in SQL databases.
Description: SQL query builder (based on dbplyr) that creates queries that calculate formulas using patient data as variables.
License: MIT + file LICENSE
Encoding: UTF-8
Roxygen: list(markdown = TRUE)
@@ -16,9 +16,9 @@ Imports:
stringr,
purrr,
rlang,
tidyr,
plotly
tidyr
Suggests:
knitr,
rmarkdown
rmarkdown,
plotly
VignetteBuilder: knitr
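
The updated Description positions phea as a dbplyr-based SQL query builder. Below is a minimal sketch of the mechanism that description refers to, using plain dplyr/dbplyr rather than phea's own API; the table, column names, and values are invented, and RSQLite is assumed for the in-memory example.

library(dplyr)
library(dbplyr)

# Toy lazy table standing in for time-stamped patient data.
patients <- memdb_frame(
  pid       = c(1, 1, 2),
  ts        = c("2020-01-01", "2020-06-01", "2020-03-15"),
  weight_kg = c(80, 78, 95),
  height_m  = c(1.75, 1.75, 1.80)
)

# A formula supplied as SQL text via dplyr::sql() is evaluated by the database,
# not in R: the pipeline stays a single query until collect() is called.
bmi_query <- patients |>
  mutate(bmi = sql("weight_kg / (height_m * height_m)"))

show_query(bmi_query)   # one SELECT layer that computes the formula
collect(bmi_query)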
193 changes: 69 additions & 124 deletions R/calculate_formula.R
@@ -123,9 +123,6 @@ calculate_formula <- function(components, fml = NULL, window = NULL, export = NU
# , pick = !(is.null(component$pick_by) || is.na(component$pick_by) || component$pick_by == '')
# , pick_by = component$pick_by)

# if(filtering_dates)
# res <- mutate(res, date_out = comp_name %in% dates_from)

return(res)
}) |>
dplyr::bind_rows()
@@ -234,7 +231,7 @@ calculate_formula <- function(components, fml = NULL, window = NULL, export = NU

# Add extra dates -------------------------------------------------------------------------------------------------
if(!is.null(dates)) {
message('Warning: dates is yet to be properly tested.')
message('Warning: `dates` is yet to be properly tested.')
dates_table <- dbplyr::copy_inline(.pheaglobalenv$con, dates)
board <- dplyr::union_all(board, dates_table)
}
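
# Standalone sketch of the dbplyr::copy_inline() pattern used just above: a small
# data frame is embedded directly in the generated SQL (typically as a VALUES
# subquery) instead of being written to a temporary table, so it can be
# union_all()'d onto the board like any other lazy table. The values and the
# connection below are invented; RSQLite is assumed.
library(DBI)
library(dplyr)
library(dbplyr)

con_sketch <- dbConnect(RSQLite::SQLite(), ":memory:")
extra_dates <- data.frame(pid = c(1, 2), ts = c(20210101, 20210201))

copy_inline(con_sketch, extra_dates) |> show_query()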
@@ -245,115 +242,61 @@ calculate_formula <- function(components, fml = NULL, window = NULL, export = NU
~rlang::exprs(!!..1 := !!dplyr::sql(..2))) |>
unique() |>
unlist(recursive = FALSE)
# The unique() above is just in case, but is it needed? Seems like the only way there could be duplicates is if the
# same component gets added twice to the call to calculate_formula().

## Second, apply commands.
phea_row_id_sql_txt <- paste0('row_number() over (order by ', dbQuoteId('pid'), ', ', dbQuoteId('ts'), ')')

if(F) {
# if(any(var_map$pick)) {
# var_map_picks <- var_map[var_map$pick,]
#
# picks <- purrr::map(seq(nrow(var_map_picks)), \(i) {
# phea_col_name <- var_map_picks$composed_name[i]
# orig_col_name <- var_map_picks$column[i]
# component_name <- var_map_picks$component_name[i]
# rec_name <- var_map_picks$rec_name[i]
# pick_by <- var_map_picks$pick_by[i]
# sql_txt <- paste0('case when ',
# dbQuoteId('name'), ' = ', dbQuoteStr(rec_name), ' and ',
# dbQuoteId(pick_by), ' = ', dbQuoteId(paste0(component_name, '_', pick_by)),
# ' then ', dbQuoteId(orig_col_name), ' else NULL end')
# rlang::exprs(!!phea_col_name := !!dplyr::sql(sql_txt))
# }) |>
# unique() |>
# unlist(recursive = FALSE)
# # }
#
# # pick_sql <- list(
# # x = components,
# # y = names(components)) |>
# # purrr::pmap(\(x, y) {
# # if(x$pick) {
# # browser()
# # variables_to_hide <-
# # return(paste0('case when name != \'', x$rec_source$rec_name,
# # '\' then true else ', y, '_', x$pick_by, ' = ', x$pick_by, ' end'))
# # } else
# # return(NULL)
# # }) |>
# # purrr::discard(is.null) |>
# # paste0(collapse = ' and ')
#
# # if(pick_sql != '') {
# # Apply commands, filter, then drop unneeded columns.
# board <- dplyr::mutate(board,
# phea_row_id = dplyr::sql(phea_row_id_sql_txt),
# !!!commands)
#
# browser()
# board <- filter(board, sql(pick_sql))
#
# commands_names <- names(commands)
# if(filtering_dates) {
# board <- dplyr::select(board,
# phea_row_id, pid, ts, phea_date_out, all_of(commands_names))
# } else {
# board <- dplyr::transmute(board,
# phea_row_id, pid, ts, all_of(commands_names))
# }
# } else {
}
# Make phea_row_id
prid <- dbplyr::win_over(con = .pheaglobalenv$con,
expr = dplyr::sql('row_number()'), order = c('pid', 'ts'))

# Apply commands to the board all at once, so we only generate a single layer of "SELECT ... FROM (SELECT ...)".
if(filtering_dates) {
board <- dplyr::transmute(board,
phea_row_id = dplyr::sql(phea_row_id_sql_txt),
phea_row_id = prid,
pid, ts, name,
!!!commands)
} else {
board <- dplyr::transmute(board,
phea_row_id = dplyr::sql(phea_row_id_sql_txt),
phea_row_id = prid,
pid, ts,
!!!commands)
}
# }
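
# Standalone sketch of what the `prid` expression above renders to: a
# row_number() window ordered by patient id and timestamp. simulate_postgres()
# is a translation-only connection, used here only to show the generated SQL.
library(dplyr)
library(dbplyr)

win_over(
  expr  = sql("row_number()"),
  order = c("pid", "ts"),
  con   = simulate_postgres()
)
# roughly: row_number() OVER (ORDER BY "pid", "ts")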

# browser()
## Third and final, fill the blanks downward with the last non-blank value, within the patient.
# board <- board |>
# dbplyr::window_order(pid, ts) |>
# dplyr::group_by(pid) |>
# tidyr::fill(!any_of(c('phea_row_id', 'pid', 'ts', 'name'))) |>
# ungroup()

# For some reason, apparently a bug in dbplyr's SQL translation, we need to "erase" an ORDER BY "pid", "ts" that is
# left over in the translated query. That ORDER BY persists even if you posteriorly do a dplyr::group_by() on the
# result of the phenotype (i.e. the board at this point). This causes the SQL server's query engine to raise an error,
# saying that "ts" must also be part of the GROUP BY. This left over ORDER BY "pid", "ts" apparently comes from the
# dbplyr::window_order() call that was necessary to guarantee the intended behavior of the call to
# tidyr::fill.lazy_tbl() above.
# board <- board |>
# arrange()

if(.pheaglobalenv$compatibility_mode) {
## Fill the blanks downward with the last non-blank value, within the patient.
board <- board |>
dbplyr::window_order(pid, ts) |>
dplyr::group_by(pid) |>
tidyr::fill(!any_of(c('phea_row_id', 'pid', 'ts', 'name'))) |>
ungroup()

# For some reason, apparently a bug in dbplyr's SQL translation, we need to "erase" an ORDER BY "pid", "ts" that is
# left over in the translated query. That ORDER BY persists even if you later call dplyr::group_by() on the
# result of the phenotype (i.e. the board at this point). This causes the SQL server's query engine to raise an
# error, saying that "ts" must also be part of the GROUP BY. This leftover ORDER BY "pid", "ts" apparently comes
# from the dbplyr::window_order() call that was necessary to guarantee the intended behavior of the call to
# tidyr::fill.lazy_tbl() above.
board <- board |>
arrange()
}
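
# Standalone sketch of the fill-downward step above: within each patient,
# missing component values are carried forward from the last non-missing row.
# Table, columns, and values are invented; RSQLite is assumed, and the exact
# SQL translation of tidyr::fill() varies by backend.
library(dplyr)
library(dbplyr)
library(tidyr)

board_sketch <- memdb_frame(
  pid       = c(1, 1, 1),
  ts        = c(1, 2, 3),
  weight_kg = c(80, NA, 78),
  height_m  = c(NA, 1.75, NA)
)

board_sketch |>
  window_order(pid, ts) |>
  group_by(pid) |>
  fill(weight_kg, height_m) |>   # last non-missing value carried forward, per patient
  ungroup() |>
  show_query()
# conceptually: the ts = 2 row gets weight_kg = 80, and the ts = 3 row keeps height_m = 1.75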

# dates_from ------------------------------------------------------------------------------------------------------
if(filtering_dates) {
# Obtain `rec_name`s from the record sources of the target components.
rec_names <- unique(var_map[var_map$component_name %in% dates_from,]$rec_name)

sql_txt <- paste0(dbQuoteId('name'), ' in (', paste0(dbQuoteStr(rec_names), collapse = ', '), ')')

sql_txt <- paste0(dbQuoteId('name'), ' in (', paste0(rec_names, collapse = ', '), ')')

# Keep only the rows coming from those `rec_name`s.
board <- board |>
filter(sql(sql_txt))
dplyr::filter(dplyr::sql(sql_txt))
}

# Compute window --------------------------------------------------------------------------------------------------
window_components <- setdiff(var_map$component_name, out_window)
if(!input_is_phenotype && length(window_components) > 1) { # Window only makes sense if there is > 1 component.
window_components_sql <- window_components |>
unique() |>
paste0('_ts') |>
window_components_sql <- paste0(unique(window_components), '_ts') |>
DBI::dbQuoteIdentifier(conn = .pheaglobalenv$con) |>
paste0(collapse = ', ')

@@ -367,18 +310,27 @@ calculate_formula <- function(components, fml = NULL, window = NULL, export = NU
sql_ts_greatest <- sql_ts_least
}

# phea_ts_row is used to pick the best computation within each date. This is for the case when multiple data points
# exist on the same date. The best computation for each date is the last row within that date.
# The most complete computation is the last one in each timestamp. 'max(phea_row_id) over (partition by "pid", "ts")'
# finds the row with the largest (most complete) phea_row_id in each timestamp. last_value() could give the same
# result, and could be potentially faster (wild assumption) due to optimizations, but that's just an idea.
sql_txt <- paste0('MAX(', dbQuoteId('phea_row_id'), ') OVER (PARTITION BY ',
dbQuoteId('pid'), ', ', dbQuoteId('ts'), ')')

board <- board |>
dplyr::mutate(
window = dplyr::sql(sql_ts_greatest) - dplyr::sql(sql_ts_least),
phea_ts_row = dplyr::sql(sql_txt))
if(.pheaglobalenv$compatibility_mode) {
# phea_ts_row is used to pick the best computation within each date. This is for the case when multiple data points
# exist on the same date. The best computation for each date is the last row within that date.
# The most complete computation is the last one in each timestamp. 'max(phea_row_id) over (partition by "pid",
# "ts")' finds the row with the largest (most complete) phea_row_id in each timestamp. last_value() could give the
# same result, and could be potentially faster (wild assumption) due to optimizations, but that's just an idea.

# Make phea_ts_row
ptsr_txt <- paste0('max(', dbQuoteId('phea_row_id'), ')')

board <- board |>
dplyr::mutate(
window = dplyr::sql(sql_ts_greatest) - dplyr::sql(sql_ts_least),
phea_ts_row = dbplyr::win_over(con = .pheaglobalenv$con,
expr = dplyr::sql(ptsr_txt),
partition = c('pid', 'ts')))
} else {
board <- board |>
dplyr::mutate(
window = dplyr::sql(sql_ts_greatest) - dplyr::sql(sql_ts_least))
}
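
# Standalone sketch of the phea_ts_row expression above: within each (pid, ts)
# partition, the largest phea_row_id marks the most complete computation for
# that timestamp. simulate_postgres() is translation-only.
library(dplyr)
library(dbplyr)

win_over(
  expr      = sql('max("phea_row_id")'),
  partition = c("pid", "ts"),
  con       = simulate_postgres()
)
# roughly: max("phea_row_id") OVER (PARTITION BY "pid", "ts")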

# Filter rows -----------------------------------------------------------------------------------------------------
# We also need to:
@@ -392,51 +344,45 @@ calculate_formula <- function(components, fml = NULL, window = NULL, export = NU
# is FALSE.
required_components <- setdiff(names(components), dont_require)
if(length(required_components) > 0) {
sql_txt <- required_components |>
paste0('_ts') |>
sql_txt <- paste0(required_components, '_ts') |>
DBI::dbQuoteIdentifier(conn = .pheaglobalenv$con) |>
paste0(' is not null') |>
paste(collapse = ' and ')
paste0(' is not null', collapse = ' and ')

if(has_content(window)) {
board <- dplyr::filter(board,
phea_row_id == phea_ts_row &&
dplyr::sql(sql_txt) &&
window < local(window))
window < local(window) &&
dplyr::sql(sql_txt))
} else {
board <- dplyr::filter(board,
phea_row_id == phea_ts_row &&
dplyr::sql(sql_txt))
dplyr::sql(sql_txt))
}
} else {
# No required components after all, because all were excluded by dont_require. Let's just filter by the most
# complete computation.
if(has_content(window)) {
board <- dplyr::filter(board,
phea_row_id == phea_ts_row &&
window < local(window))
} else {
board <- dplyr::filter(board,
phea_row_id == phea_ts_row)
window < local(window))
}
}
} else {
# No need to require all components. Let's just filter by the most complete computation.
if(has_content(window)) { # This covers the case where `window` is NULL
board <- board |>
dplyr::filter(phea_row_id == phea_ts_row &&
window < local(window))
} else {
board <- board |>
dplyr::filter(phea_row_id == phea_ts_row)
dplyr::filter(window < local(window))
}
}
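
# Standalone sketch of the required-components predicate built above: every
# required component must have a non-null <component>_ts column. The component
# names are invented; DBI::ANSI() stands in for the real connection.
required_sketch <- c("diagnosis", "weight")

paste0(required_sketch, "_ts") |>
  DBI::dbQuoteIdentifier(conn = DBI::ANSI()) |>
  paste0(" is not null", collapse = " and ")
# yields: "diagnosis_ts" is not null and "weight_ts" is not null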

if(.pheaglobalenv$compatibility_mode) {
# Keep only most complete computation
board <- dplyr::filter(board,
phea_row_id == phea_ts_row)
}

# Apply filters, if provided.
if(!is.null(filters) && any(!is.na(filters))) {
sql_txt <- paste0('(', paste0(filters[!is.na(filters)], collapse = ') AND ('), ')')
board <- board |>
filter(sql(sql_txt))
dplyr::filter(dplyr::sql(sql_txt))
}

# Limit number of output rows, if requested.
@@ -452,6 +398,7 @@ calculate_formula <- function(components, fml = NULL, window = NULL, export = NU
# Calculate the formulas, if any.
res_vars <- NULL
if(!is.null(fml)) {
# is cascaded on?
if(cascaded) {
# Compute one at a time, so that the prior result can be used in the next formula.
for(i in seq(fml)) {
@@ -482,16 +429,14 @@ calculate_formula <- function(components, fml = NULL, window = NULL, export = NU
# Get the name from the parent object, fml.
res_vars <- c(res_vars, names(fml)[i])

# The formula is the SQL.
# sql_txt <- cur_fml

# Apply to the board, producing a layer of SELECT ... FROM (SELECT ...).
# Apply to the board, producing a layer of SELECT ... FROM (SELECT ...). The formula is the SQL.
board <- dplyr::mutate(board,
!!rlang::sym(names(fml)[i]) := dplyr::sql(cur_fml))
}
}
} else {
# cascaded is turned off.

# Let's check if any of the formulas is itself a list, which means cascaded was supposed to be on.
if(any(lapply(fml, class) == 'list'))
stop('Nested formulas require cascaded = TRUE.')
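
# Hypothetical sketch of what cascaded = TRUE enables (the column names and
# formulas below are invented, and the exact accepted shapes of `fml` are
# defined by phea's documentation, not here): formulas are computed one layer
# at a time, so a later formula can refer to an earlier result by name.
fml_sketch <- list(
  bmi      = "weight_kg / (height_m * height_m)",
  bmi_high = "case when bmi >= 30 then 1 else 0 end"  # uses `bmi` from the previous layer
)
# board <- calculate_formula(components, fml = fml_sketch, cascaded = TRUE)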
