Skip to content

Commit

Permalink
prepare release (#439)
Browse files Browse the repository at this point in the history
* Update DESCRIPTION

* only restrict to first if many observations per subjectId

* fix assignment operator in configurePython (#421)

* Tibble dependency removal (#422)

* remove unnecessary remotes (#423)

* Study population improvements (#424)

* assign population if existing and added a test (#428)

* 429 save cdm database name (#430)

* save dev database name and schema in trainDetails (#434)

* preserve attributes when splitting data

* Prevent plpData from being evaluated during do.call (#436)

* test improvements (#438)

* fix duplicate vignette titles

---------

Co-authored-by: jreps <[email protected]>
Co-authored-by: Henrik <[email protected]>
  • Loading branch information
3 people authored Apr 5, 2024
1 parent 7a55c31 commit 9e37b51
Show file tree
Hide file tree
Showing 53 changed files with 373 additions and 333 deletions.
3 changes: 1 addition & 2 deletions .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,4 @@ compare_versions
.github
docs/*
_pkgdown.yml


^vignettes/articles$
6 changes: 3 additions & 3 deletions .github/workflows/R_CMD_check_Hades.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -87,13 +87,13 @@ jobs:
path: check/*.tar.gz

- name: Install covr
if: runner.os == 'Windows'
if: runner.os == 'Linux'
run: |
remotes::install_cran("covr")
shell: Rscript {0}

- name: Test coverage
if: runner.os == 'Windows'
if: runner.os == 'Linux'
run: covr::codecov()
shell: Rscript {0}

Expand Down
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
*-Ex.R
# R data files from past sessions
.Rdata
# R environ
.Renviron
# RStudio files
.Rproj.user/
.Rproj.user
Expand All @@ -20,4 +22,4 @@ standalone/build/*
/plpmodels/*
/python_models/*
/mycache/*
/inst/shiny/DiagnosticsExplorer/rsconnect/*
/inst/shiny/DiagnosticsExplorer/rsconnect/*
11 changes: 4 additions & 7 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ Package: PatientLevelPrediction
Type: Package
Title: Developing patient level prediction using data in the OMOP Common Data
Model
Version: 6.3.6
Date: 2023-10-09
Version: 6.3.7
Date: 2024-04-04
Authors@R: c(
person("Jenna", "Reps", email = "[email protected]", role = c("aut", "cre")),
person("Martijn", "Schuemie", role = c("aut")),
Expand Down Expand Up @@ -43,7 +43,6 @@ Imports:
rlang,
SqlRender (>= 1.1.3),
survival,
tibble,
tidyr,
utils
Suggests:
Expand Down Expand Up @@ -74,9 +73,7 @@ Remotes:
ohdsi/BigKnn,
ohdsi/Eunomia,
ohdsi/FeatureExtraction,
ohdsi/IterativeHardThresholding,
ohdsi/ParallelLogger,
ohdsi/ShinyAppBuilder,
ohdsi/ResultModelManager
RoxygenNote: 7.2.3
ohdsi/ResultModelManager,
RoxygenNote: 7.3.1
Encoding: UTF-8
14 changes: 14 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,17 @@
PatientLevelPrediction 6.3.7
======================
- Clean up dependencies: tibble removed; IterativeHardThresholding and ParallelLogger now installed from CRAN
- Use cohortIds for cohortCovariates to comply with FeatureExtraction
- Add cdmDatabaseName from DatabaseDetails to model output
- Fix bug when attributes weren't preserved on trainData$covariateData after split
- Fix warnings in tests and speed them up
- Fix bug in assignment operator in configurePython
- Delay evaluation of plpData when using do.call like in learningCurves and
runMultiplePlp
- Speed up population generation when subjectIds are distinct
- Fix bug when population was still generated when provided to runPlp


PatientLevelPrediction 6.3.6
======================
- fix bug with ohdsi shiny modules version check (issue 415)
Expand Down
8 changes: 4 additions & 4 deletions R/AdditionalCovariates.R
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
#' @param cohortTable the table name that contains the target population cohort
#' @param rowIdField string representing the unique identifier in the target population cohort
#' @param aggregated whether the covariate should be aggregated
#' @param cohortId cohort id for the target cohort
#' @param cohortIds cohort id for the target cohort
#' @param covariateSettings settings for the covariate cohorts and time periods
#'
#' @return
Expand All @@ -45,7 +45,7 @@ getCohortCovariateData <- function(
cohortTable = "#cohort_person",
rowIdField = "row_id",
aggregated,
cohortId,
cohortIds,
covariateSettings
){

Expand All @@ -69,7 +69,7 @@ getCohortCovariateData <- function(
sql,
covariate_cohort_schema = covariateSettings$cohortDatabaseSchema,
covariate_cohort_table = covariateSettings$cohortTable,
covariate_cohort_id = covariateSettings$cohortId,
covariate_cohort_id = covariateSettings$cohortIds,
cohort_temp_table = cohortTable,
row_id_field = rowIdField,
startDay = covariateSettings$startDay,
Expand Down Expand Up @@ -191,7 +191,7 @@ createCohortCovariateSettings <- function(
covariateId = cohortId*100000+settingId*1000+analysisId,
cohortDatabaseSchema = cohortDatabaseSchema,
cohortTable = cohortTable,
cohortId = cohortId,
cohortIds = cohortId,
startDay = startDay,
endDays = endDay,
count = count,
Expand Down
2 changes: 1 addition & 1 deletion R/AndromedaHelperFunctions.R
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ calculatePrevs <- function(plpData, population){
#===========================

# add population to sqllite
population <- tibble::as_tibble(population)
population <- dplyr::as_tibble(population)
plpData$covariateData$population <- population %>%
dplyr::select("rowId", "outcomeCount")

Expand Down
3 changes: 2 additions & 1 deletion R/CyclopsModels.R
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,8 @@ fitCyclopsModel <- function(
trainDetails = list(
analysisId = analysisId,
analysisSource = '', #TODO add from model
developmentDatabase = attr(trainData, "metaData")$cdmDatabaseSchema,
developmentDatabase = attr(trainData, "metaData")$cdmDatabaseName,
developmentDatabaseSchema = attr(trainData, "metaData")$cdmDatabaseSchema,
attrition = attr(trainData, "metaData")$attrition,
trainingTime = paste(as.character(abs(comp)), attr(comp,'units')),
trainingDate = Sys.Date(),
Expand Down
7 changes: 2 additions & 5 deletions R/DataSplitting.R
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,7 @@ splitData <- function(plpData = plpData,
outcomeId = attr(population, "metaData")$outcomeId,
targetId = attr(population, "metaData")$targetId,
cdmDatabaseSchema = plpData$metaData$databaseDetails$cdmDatabaseSchema,
cdmDatabaseName = plpData$metaData$databaseDetails$cdmDatabaseName,
cdmDatabaseId = plpData$metaData$databaseDetails$cdmDatabaseId,
restrictPlpDataSettings = attr(population, "metaData")$restrictPlpDataSettings,
covariateSettings = plpData$metaData$covariateSettings,
Expand Down Expand Up @@ -186,6 +187,7 @@ splitData <- function(plpData = plpData,
outcomeId = attr(population, "metaData")$outcomeId,
targetId = attr(population, "metaData")$targetId,
cdmDatabaseSchema = plpData$metaData$databaseDetails$cdmDatabaseSchema,
cdmDatabaseName = plpData$metaData$databaseDetails$cdmDatabaseName,
cdmDatabaseId = plpData$metaData$databaseDetails$cdmDatabaseId,
restrictPlpDataSettings = attr(population, "metaData")$restrictPlpDataSettings,
covariateSettings = plpData$metaData$covariateSettings,
Expand All @@ -195,10 +197,6 @@ splitData <- function(plpData = plpData,
populationSize = nrow(trainData$labels)
)

# add pop size to covariateData as used in tidyCovariates
attr(trainData$covariateData, "metaData") <- list(populationSize = nrow(trainData$labels))
class(trainData$covariateData) <- "CovariateData"

testId <- splitId[splitId$index<0,]
testData <- list()
class(testData) <- 'plpData'
Expand All @@ -214,7 +212,6 @@ splitData <- function(plpData = plpData,
data.frame(rowId = testId$rowId),
sizeN = 10000000)
}
class(testData$covariateData) <- "CovariateData"

result <- list(
Train = trainData,
Expand Down
20 changes: 10 additions & 10 deletions R/DemographicSummary.R
Original file line number Diff line number Diff line change
Expand Up @@ -111,30 +111,30 @@ getDemographicSummary_survival <- function(prediction, evalColumn, timepoint = N
tempDemo <- demographicSum %>%
dplyr::filter( .data$genGroup == gen & .data$ageGroup == age )

if(nrow(tempDemo)>0){
t1 <- tempDemo %>% dplyr::select("t")
y1 <- tempDemo %>% dplyr::select("y")
p1 <- tempDemo %>% dplyr::select("value")
if (nrow(tempDemo) > 1 & length(unique(tempDemo$y)) > 1) {
t <- tempDemo$t
y <- tempDemo$y
value <- tempDemo$value

out <- tryCatch(
{
summary(
survival::survfit(survival::Surv(t1$t, y1$y) ~ 1),
survival::survfit(survival::Surv(t, y) ~ 1),
times = timepoint
)
},
error = function(e){ParallelLogger::logError(e); return(NULL)}
error = function(e){ParallelLogger::logError(e);return(NULL)}
)

if(!is.null(out)){
demoTemp <- c(
genGroup = gen,
ageGroup = age,
PersonCountAtRisk = length(p1$value),
PersonCountWithOutcome = round(length(p1$value)*(1-out$surv)),
PersonCountAtRisk = length(value),
PersonCountWithOutcome = round(length(value)*(1-out$surv)),
observedRisk = 1-out$surv,
averagePredictedProbability = mean(p1$value, na.rm = T),
StDevPredictedProbability = stats::sd(p1$value, na.rm = T)
averagePredictedProbability = mean(value, na.rm = T),
StDevPredictedProbability = stats::sd(value, na.rm = T)
)

demographicData <- rbind(demographicData, demoTemp)
Expand Down
2 changes: 1 addition & 1 deletion R/EvaluationSummary.R
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ getEvaluationStatistics_binary <- function(prediction, evalColumn, ...){

# auc
ParallelLogger::logInfo(paste0('Calculating Performance for ', evalType))
ParallelLogger::logInfo('=============')
ParallelLogger::logInfo('=============')

ParallelLogger::logTrace('Calculating AUC')
auc <- computeAuc(predictionOfInterest, confidenceInterval = T)
Expand Down
39 changes: 19 additions & 20 deletions R/FeatureImportance.R
Original file line number Diff line number Diff line change
Expand Up @@ -105,37 +105,36 @@ pfi <- function(plpResult, population, plpData, repeats = 1,
ParallelLogger::logInfo(paste0('Using all ', cores))
ParallelLogger::logInfo(paste0('Set cores input to use fewer...'))
}

getVpiSettings <- function(i) {
result <- list(plpModel = plpResult$model,
population = population,
plpDataLocation = plpDataLocation,
covariateId = covariates[i],
repeats = repeats)
return(result)
}
if (cores > 1) {
cluster <- ParallelLogger::makeCluster(numberOfThreads = cores)
ParallelLogger::clusterRequire(cluster, c("PatientLevelPrediction", "Andromeda"))


getVpiSettings <- function(i){
result <-list(plpModel = plpResult$model,
population = population,
plpDataLocation = plpDataLocation,
covariateId = covariates[i],
repeats = repeats)
return(result)
}
vpiSettings <- lapply(1:length(covariates), getVpiSettings)


#lapply(vpiSettings, function(x) do.call(permutePerf, x))
aucP <- ParallelLogger::clusterApply(cluster = cluster,
x = vpiSettings,
fun = permutePerf,
stopOnError = FALSE,
progressBar = TRUE)
ParallelLogger::stopCluster(cluster)

} else {
ParallelLogger::logInfo("Running in serial")
aucP <- lapply(1:length(covariates), function(i) {
permutePerf(getVpiSettings(i))
})
}
aucP <- do.call(c, aucP)

# do this in parellel

varImp <- data.frame(covariateId = covariates,
pfi = auc-aucP)

pfi = auc - aucP)
return(varImp)

}
Expand Down Expand Up @@ -200,7 +199,7 @@ permute <- function(plpDataLocation,cId,population){

# find a new random selection of people and give them the covariate and value
newPlp <- sample(population$rowId,nSamp)
newData <- tibble::as_tibble(cbind(rowId = newPlp,coi[,-1]))
newData <- dplyr::as_tibble(cbind(rowId = newPlp,coi[,-1]))

# swap old covariate data with new
plpData$covariateData$covariates <- plpData$covariateData$covariates %>% dplyr::filter(.data$covariateId != !!cId) %>% dplyr::collect()
Expand All @@ -215,7 +214,7 @@ permute <- function(plpDataLocation,cId,population){

# sample the pop to replace
swapPlp <- sample(population$rowId,nSamp)
haveCidDataSwapped <- tibble::as_tibble(cbind(rowId = swapPlp,haveCidData[,-1]))
haveCidDataSwapped <- dplyr::as_tibble(cbind(rowId = swapPlp,haveCidData[,-1]))

# find the swapped people to switch
connectedCovs <- plpData$covariateData$covariateRef %>%
Expand All @@ -228,7 +227,7 @@ permute <- function(plpDataLocation,cId,population){
dplyr::filter(.data$rowId %in% swapPlp) %>%
dplyr::collect()

swappedForCid <- tibble::as_tibble(cbind(rowId = haveCidData$rowId[1:nrow(plpToSwap)],plpToSwap[,-1]))
swappedForCid <- dplyr::as_tibble(cbind(rowId = haveCidData$rowId[1:nrow(plpToSwap)],plpToSwap[,-1]))


# swap old covariate data with new
Expand Down
6 changes: 4 additions & 2 deletions R/HelperFunctions.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# fix issue with nrow - temp fix for me locally
nrow <- function(x){UseMethod("nrow",x)}
#' @exportS3Method NULL
nrow.default <- base::nrow
#' @exportS3Method NULL
nrow.tbl <- function(x){x %>% dplyr::tally() %>% dplyr::pull()}


Expand Down Expand Up @@ -101,9 +103,9 @@ configurePython <- function(envname='PLP', envtype=NULL, condaPythonVersion="3.1

if(is.null(envtype)){
if(getOs()=='windows'){
envtype=='conda'
envtype <- "conda"
} else {
envtype=='python'
envtype <- "python"
}
}

Expand Down
3 changes: 2 additions & 1 deletion R/KNN.R
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,8 @@ fitKNN <- function(trainData, modelSettings, search = 'none', analysisId, ...){

trainDetails = list(
analysisId = analysisId,
developmentDatabase = attr(trainData, "metaData")$cdmDatabaseSchema,
developmentDatabase = attr(trainData, "metaData")$cdmDatabaseName,
developmentDatabaseSchema = attr(trainData, "metaData")$cdmDatabaseSchema,
attrition = attr(trainData, "metaData")$attrition,
trainingTime = paste(as.character(abs(comp)), attr(comp,'units')),
trainingDate = Sys.Date(),
Expand Down
8 changes: 4 additions & 4 deletions R/LearningCurve.R
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ createLearningCurve <- function(
nRuns <- length(trainFractions)

settings = list(
plpData = plpData,
plpData = quote(plpData),
outcomeId = outcomeId,
analysisId = analysisId,
populationSettings = populationSettings,
Expand Down Expand Up @@ -238,7 +238,7 @@ createLearningCurve <- function(

lcWrapper <- function(settings){
plpData <- PatientLevelPrediction::loadPlpData(settings$plpData)
settings$plpData <- plpData
settings$plpData <- quote(plpData)
result <- tryCatch({do.call(runPlp, settings)},
warning = function(war) {
ParallelLogger::logInfo(paste0('a warning: ', war))
Expand Down Expand Up @@ -470,8 +470,8 @@ plotLearningCurve <- function(learningCurve,

# create plot object
plot <- tidyLearningCurve %>%
ggplot2::ggplot(ggplot2::aes_string(x = abscissa, y= 'value',
col = "Dataset")) +
ggplot2::ggplot(ggplot2::aes(x = .data[[abscissa]], y = .data[['value']],
col = .data[["Dataset"]])) +
ggplot2::geom_line() +
ggplot2::coord_cartesian(ylim = yAxisRange, expand = FALSE) +
ggplot2::labs(title = plotTitle, subtitle = plotSubtitle,
Expand Down
3 changes: 1 addition & 2 deletions R/PatientLevelPrediction.R
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,11 @@
#'
#' @description A package for running predictions using data in the OMOP CDM
#'
#' @docType package
#' @name PatientLevelPrediction
#' @keywords internal
#' @importFrom dplyr %>%
#' @importFrom rlang .data
NULL
"_PACKAGE"

#' A simulation profile
#' @docType data
Expand Down
Loading

0 comments on commit 9e37b51

Please sign in to comment.