diff --git a/.Rbuildignore b/.Rbuildignore index 3fbf93c69..4fac5d027 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -10,5 +10,4 @@ compare_versions .github docs/* _pkgdown.yml - - +^vignettes/articles$ diff --git a/.github/workflows/R_CMD_check_Hades.yaml b/.github/workflows/R_CMD_check_Hades.yaml index f7c4ea17e..8f940f131 100644 --- a/.github/workflows/R_CMD_check_Hades.yaml +++ b/.github/workflows/R_CMD_check_Hades.yaml @@ -87,13 +87,13 @@ jobs: path: check/*.tar.gz - name: Install covr - if: runner.os == 'Windows' + if: runner.os == 'Linux' run: | remotes::install_cran("covr") shell: Rscript {0} - + - name: Test coverage - if: runner.os == 'Windows' + if: runner.os == 'Linux' run: covr::codecov() shell: Rscript {0} diff --git a/.gitignore b/.gitignore index c7ce48620..41469fa63 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,8 @@ *-Ex.R # R data files from past sessions .Rdata +# R environ +.Renviron # RStudio files .Rproj.user/ .Rproj.user @@ -20,4 +22,4 @@ standalone/build/* /plpmodels/* /python_models/* /mycache/* -/inst/shiny/DiagnosticsExplorer/rsconnect/* \ No newline at end of file +/inst/shiny/DiagnosticsExplorer/rsconnect/* diff --git a/DESCRIPTION b/DESCRIPTION index dde325a53..951f8e056 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -2,8 +2,8 @@ Package: PatientLevelPrediction Type: Package Title: Developing patient level prediction using data in the OMOP Common Data Model -Version: 6.3.6 -Date: 2023-10-09 +Version: 6.3.7 +Date: 2024-04-04 Authors@R: c( person("Jenna", "Reps", email = "jreps@its.jnj.com", role = c("aut", "cre")), person("Martijn", "Schuemie", role = c("aut")), @@ -43,7 +43,6 @@ Imports: rlang, SqlRender (>= 1.1.3), survival, - tibble, tidyr, utils Suggests: @@ -74,9 +73,7 @@ Remotes: ohdsi/BigKnn, ohdsi/Eunomia, ohdsi/FeatureExtraction, - ohdsi/IterativeHardThresholding, - ohdsi/ParallelLogger, ohdsi/ShinyAppBuilder, - ohdsi/ResultModelManager -RoxygenNote: 7.2.3 + ohdsi/ResultModelManager, +RoxygenNote: 7.3.1 Encoding: UTF-8 diff --git a/NEWS.md b/NEWS.md index ea37b5ad1..f81eb0a3d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,17 @@ +PatientLevelPrediction 6.3.7 +====================== +- Clean up dependencies: remove tibble, and install IterativeHardThresholding and ParallelLogger from CRAN +- Use cohortIds instead of cohortId for cohort covariates to comply with FeatureExtraction +- Add cdmDatabaseName from DatabaseDetails to the model output +- Fix bug where attributes were not preserved on trainData$covariateData after splitting +- Fix warnings in tests and speed the tests up +- Fix bug in the assignment operator in configurePython +- Delay evaluation of plpData when using do.call, as in createLearningCurve and +runMultiplePlp +- Speed up population generation when subjectIds are distinct +- Fix bug where the population was regenerated even when one was supplied to runPlp + + PatientLevelPrediction 6.3.6 ====================== - fix bug with ohdsi shiny modules version check (issue 415) diff --git a/R/AdditionalCovariates.R b/R/AdditionalCovariates.R index 790fe9df4..77204d1de 100644 --- a/R/AdditionalCovariates.R +++ b/R/AdditionalCovariates.R @@ -30,7 +30,7 @@ #' @param cohortTable the table name that contains the target population cohort #' @param rowIdField string representing the unique identifier in the target population cohort #' @param aggregated whether the covariate should be aggregated -#' @param cohortId cohort id for the target cohort +#' @param cohortIds cohort id for the target cohort #' @param covariateSettings settings for the covariate cohorts and time periods #' #' @return #' @@ -45,7 +45,7 @@ getCohortCovariateData
<- function( cohortTable = "#cohort_person", rowIdField = "row_id", aggregated, - cohortId, + cohortIds, covariateSettings ){ @@ -69,7 +69,7 @@ getCohortCovariateData <- function( sql, covariate_cohort_schema = covariateSettings$cohortDatabaseSchema, covariate_cohort_table = covariateSettings$cohortTable, - covariate_cohort_id = covariateSettings$cohortId, + covariate_cohort_id = covariateSettings$cohortIds, cohort_temp_table = cohortTable, row_id_field = rowIdField, startDay = covariateSettings$startDay, @@ -191,7 +191,7 @@ createCohortCovariateSettings <- function( covariateId = cohortId*100000+settingId*1000+analysisId, cohortDatabaseSchema = cohortDatabaseSchema, cohortTable = cohortTable, - cohortId = cohortId, + cohortIds = cohortId, startDay = startDay, endDays = endDay, count = count, diff --git a/R/AndromedaHelperFunctions.R b/R/AndromedaHelperFunctions.R index bf70bdb06..b6182610c 100644 --- a/R/AndromedaHelperFunctions.R +++ b/R/AndromedaHelperFunctions.R @@ -97,7 +97,7 @@ calculatePrevs <- function(plpData, population){ #=========================== # add population to sqllite - population <- tibble::as_tibble(population) + population <- dplyr::as_tibble(population) plpData$covariateData$population <- population %>% dplyr::select("rowId", "outcomeCount") diff --git a/R/CyclopsModels.R b/R/CyclopsModels.R index 83c61f098..16f035164 100644 --- a/R/CyclopsModels.R +++ b/R/CyclopsModels.R @@ -211,7 +211,8 @@ fitCyclopsModel <- function( trainDetails = list( analysisId = analysisId, analysisSource = '', #TODO add from model - developmentDatabase = attr(trainData, "metaData")$cdmDatabaseSchema, + developmentDatabase = attr(trainData, "metaData")$cdmDatabaseName, + developmentDatabaseSchema = attr(trainData, "metaData")$cdmDatabaseSchema, attrition = attr(trainData, "metaData")$attrition, trainingTime = paste(as.character(abs(comp)), attr(comp,'units')), trainingDate = Sys.Date(), diff --git a/R/DataSplitting.R b/R/DataSplitting.R index 35519801d..195e2680c 100644 --- a/R/DataSplitting.R +++ b/R/DataSplitting.R @@ -148,6 +148,7 @@ splitData <- function(plpData = plpData, outcomeId = attr(population, "metaData")$outcomeId, targetId = attr(population, "metaData")$targetId, cdmDatabaseSchema = plpData$metaData$databaseDetails$cdmDatabaseSchema, + cdmDatabaseName = plpData$metaData$databaseDetails$cdmDatabaseName, cdmDatabaseId = plpData$metaData$databaseDetails$cdmDatabaseId, restrictPlpDataSettings = attr(population, "metaData")$restrictPlpDataSettings, covariateSettings = plpData$metaData$covariateSettings, @@ -186,6 +187,7 @@ splitData <- function(plpData = plpData, outcomeId = attr(population, "metaData")$outcomeId, targetId = attr(population, "metaData")$targetId, cdmDatabaseSchema = plpData$metaData$databaseDetails$cdmDatabaseSchema, + cdmDatabaseName = plpData$metaData$databaseDetails$cdmDatabaseName, cdmDatabaseId = plpData$metaData$databaseDetails$cdmDatabaseId, restrictPlpDataSettings = attr(population, "metaData")$restrictPlpDataSettings, covariateSettings = plpData$metaData$covariateSettings, @@ -195,10 +197,6 @@ splitData <- function(plpData = plpData, populationSize = nrow(trainData$labels) ) - # add pop size to covariateData as used in tidyCovariates - attr(trainData$covariateData, "metaData") <- list(populationSize = nrow(trainData$labels)) - class(trainData$covariateData) <- "CovariateData" - testId <- splitId[splitId$index<0,] testData <- list() class(testData) <- 'plpData' @@ -214,7 +212,6 @@ splitData <- function(plpData = plpData, data.frame(rowId = testId$rowId), 
sizeN = 10000000) } - class(testData$covariateData) <- "CovariateData" result <- list( Train = trainData, diff --git a/R/DemographicSummary.R b/R/DemographicSummary.R index 7157690a7..b3df14305 100644 --- a/R/DemographicSummary.R +++ b/R/DemographicSummary.R @@ -111,30 +111,30 @@ getDemographicSummary_survival <- function(prediction, evalColumn, timepoint = N tempDemo <- demographicSum %>% dplyr::filter( .data$genGroup == gen & .data$ageGroup == age ) - if(nrow(tempDemo)>0){ - t1 <- tempDemo %>% dplyr::select("t") - y1 <- tempDemo %>% dplyr::select("y") - p1 <- tempDemo %>% dplyr::select("value") + if (nrow(tempDemo) > 1 & length(unique(tempDemo$y)) > 1) { + t <- tempDemo$t + y <- tempDemo$y + value <- tempDemo$value out <- tryCatch( { summary( - survival::survfit(survival::Surv(t1$t, y1$y) ~ 1), + survival::survfit(survival::Surv(t, y) ~ 1), times = timepoint ) }, - error = function(e){ParallelLogger::logError(e); return(NULL)} + error = function(e){ParallelLogger::logError(e);return(NULL)} ) if(!is.null(out)){ demoTemp <- c( genGroup = gen, ageGroup = age, - PersonCountAtRisk = length(p1$value), - PersonCountWithOutcome = round(length(p1$value)*(1-out$surv)), + PersonCountAtRisk = length(value), + PersonCountWithOutcome = round(length(value)*(1-out$surv)), observedRisk = 1-out$surv, - averagePredictedProbability = mean(p1$value, na.rm = T), - StDevPredictedProbability = stats::sd(p1$value, na.rm = T) + averagePredictedProbability = mean(value, na.rm = T), + StDevPredictedProbability = stats::sd(value, na.rm = T) ) demographicData <- rbind(demographicData, demoTemp) diff --git a/R/EvaluationSummary.R b/R/EvaluationSummary.R index 4e02d7cf4..a318cb923 100644 --- a/R/EvaluationSummary.R +++ b/R/EvaluationSummary.R @@ -35,7 +35,7 @@ getEvaluationStatistics_binary <- function(prediction, evalColumn, ...){ # auc ParallelLogger::logInfo(paste0('Calculating Performance for ', evalType)) - ParallelLogger::logInfo('=============') + ParallelLogger::logInfo('=============') ParallelLogger::logTrace('Calculating AUC') auc <- computeAuc(predictionOfInterest, confidenceInterval = T) diff --git a/R/FeatureImportance.R b/R/FeatureImportance.R index f47f4de25..6c73b1223 100644 --- a/R/FeatureImportance.R +++ b/R/FeatureImportance.R @@ -105,23 +105,20 @@ pfi <- function(plpResult, population, plpData, repeats = 1, ParallelLogger::logInfo(paste0('Using all ', cores)) ParallelLogger::logInfo(paste0('Set cores input to use fewer...')) } - + getVpiSettings <- function(i) { + result <- list(plpModel = plpResult$model, + population = population, + plpDataLocation = plpDataLocation, + covariateId = covariates[i], + repeats = repeats) + return(result) + } + if (cores > 1) { cluster <- ParallelLogger::makeCluster(numberOfThreads = cores) ParallelLogger::clusterRequire(cluster, c("PatientLevelPrediction", "Andromeda")) - - getVpiSettings <- function(i){ - result <-list(plpModel = plpResult$model, - population = population, - plpDataLocation = plpDataLocation, - covariateId = covariates[i], - repeats = repeats) - return(result) - } vpiSettings <- lapply(1:length(covariates), getVpiSettings) - - #lapply(vpiSettings, function(x) do.call(permutePerf, x)) aucP <- ParallelLogger::clusterApply(cluster = cluster, x = vpiSettings, fun = permutePerf, @@ -129,13 +126,15 @@ pfi <- function(plpResult, population, plpData, repeats = 1, progressBar = TRUE) ParallelLogger::stopCluster(cluster) + } else { + ParallelLogger::logInfo("Running in serial") + aucP <- lapply(1:length(covariates), function(i) { + 
permutePerf(getVpiSettings(i)) + }) + } aucP <- do.call(c, aucP) - - # do this in parellel - varImp <- data.frame(covariateId = covariates, - pfi = auc-aucP) - + pfi = auc - aucP) return(varImp) } @@ -200,7 +199,7 @@ permute <- function(plpDataLocation,cId,population){ # find a new random selection of people and give them the covariate and value newPlp <- sample(population$rowId,nSamp) - newData <- tibble::as_tibble(cbind(rowId = newPlp,coi[,-1])) + newData <- dplyr::as_tibble(cbind(rowId = newPlp,coi[,-1])) # swap old covariate data with new plpData$covariateData$covariates <- plpData$covariateData$covariates %>% dplyr::filter(.data$covariateId != !!cId) %>% dplyr::collect() @@ -215,7 +214,7 @@ permute <- function(plpDataLocation,cId,population){ # sample the pop to replace swapPlp <- sample(population$rowId,nSamp) - haveCidDataSwapped <- tibble::as_tibble(cbind(rowId = swapPlp,haveCidData[,-1])) + haveCidDataSwapped <- dplyr::as_tibble(cbind(rowId = swapPlp,haveCidData[,-1])) # find the swapped people to switch connectedCovs <- plpData$covariateData$covariateRef %>% @@ -228,7 +227,7 @@ permute <- function(plpDataLocation,cId,population){ dplyr::filter(.data$rowId %in% swapPlp) %>% dplyr::collect() - swappedForCid <- tibble::as_tibble(cbind(rowId = haveCidData$rowId[1:nrow(plpToSwap)],plpToSwap[,-1])) + swappedForCid <- dplyr::as_tibble(cbind(rowId = haveCidData$rowId[1:nrow(plpToSwap)],plpToSwap[,-1])) # swap old covariate data with new diff --git a/R/HelperFunctions.R b/R/HelperFunctions.R index bf5f71b1b..c6fd07d13 100644 --- a/R/HelperFunctions.R +++ b/R/HelperFunctions.R @@ -1,6 +1,8 @@ # fix issue with nrow - temp fix for me locally nrow <- function(x){UseMethod("nrow",x)} +#' @exportS3Method NULL nrow.default <- base::nrow +#' @exportS3Method NULL nrow.tbl <- function(x){x %>% dplyr::tally() %>% dplyr::pull()} @@ -101,9 +103,9 @@ configurePython <- function(envname='PLP', envtype=NULL, condaPythonVersion="3.1 if(is.null(envtype)){ if(getOs()=='windows'){ - envtype=='conda' + envtype <- "conda" } else { - envtype=='python' + envtype <- "python" } } diff --git a/R/KNN.R b/R/KNN.R index 54ef1528d..834b26eaa 100644 --- a/R/KNN.R +++ b/R/KNN.R @@ -145,7 +145,8 @@ fitKNN <- function(trainData, modelSettings, search = 'none', analysisId, ...){ trainDetails = list( analysisId = analysisId, - developmentDatabase = attr(trainData, "metaData")$cdmDatabaseSchema, + developmentDatabase = attr(trainData, "metaData")$cdmDatabaseName, + developmentDatabaseSchema = attr(trainData, "metaData")$cdmDatabaseSchema, attrition = attr(trainData, "metaData")$attrition, trainingTime = paste(as.character(abs(comp)), attr(comp,'units')), trainingDate = Sys.Date(), diff --git a/R/LearningCurve.R b/R/LearningCurve.R index 21c89d5d3..9b3ea9498 100644 --- a/R/LearningCurve.R +++ b/R/LearningCurve.R @@ -188,7 +188,7 @@ createLearningCurve <- function( nRuns <- length(trainFractions) settings = list( - plpData = plpData, + plpData = quote(plpData), outcomeId = outcomeId, analysisId = analysisId, populationSettings = populationSettings, @@ -238,7 +238,7 @@ createLearningCurve <- function( lcWrapper <- function(settings){ plpData <- PatientLevelPrediction::loadPlpData(settings$plpData) - settings$plpData <- plpData + settings$plpData <- quote(plpData) result <- tryCatch({do.call(runPlp, settings)}, warning = function(war) { ParallelLogger::logInfo(paste0('a warning: ', war)) @@ -470,8 +470,8 @@ plotLearningCurve <- function(learningCurve, # create plot object plot <- tidyLearningCurve %>% - 
ggplot2::ggplot(ggplot2::aes_string(x = abscissa, y= 'value', - col = "Dataset")) + + ggplot2::ggplot(ggplot2::aes(x = .data[[abscissa]], y = .data[['value']], + col = .data[["Dataset"]])) + ggplot2::geom_line() + ggplot2::coord_cartesian(ylim = yAxisRange, expand = FALSE) + ggplot2::labs(title = plotTitle, subtitle = plotSubtitle, diff --git a/R/PatientLevelPrediction.R b/R/PatientLevelPrediction.R index b4e2a82b5..848c42705 100644 --- a/R/PatientLevelPrediction.R +++ b/R/PatientLevelPrediction.R @@ -20,12 +20,11 @@ #' #' @description A package for running predictions using data in the OMOP CDM #' -#' @docType package #' @name PatientLevelPrediction #' @keywords internal #' @importFrom dplyr %>% #' @importFrom rlang .data -NULL +"_PACKAGE" #' A simulation profile #' @docType data diff --git a/R/Plotting.R b/R/Plotting.R index cfa5026d8..f2fe6bf0d 100644 --- a/R/Plotting.R +++ b/R/Plotting.R @@ -293,7 +293,7 @@ plotSparseRoc <- function( plots <- list() length(plots) <- length(evalTypes) - for(i in 1:length(evalTypes)){ + for (i in 1:length(evalTypes)){ evalType <- evalTypes[i] x <- plpResult$performanceEvaluation$thresholdSummary %>% dplyr::filter(.data[[typeColumn]] == evalType) %>% @@ -316,14 +316,14 @@ plotSparseRoc <- function( ) ) + ggplot2::geom_polygon(fill = "blue", alpha = 0.2) + - ggplot2::geom_line(size=1) + + ggplot2::geom_line(linewidth = 1) + ggplot2::geom_abline(intercept = 0, slope = 1,linetype = 2) + - ggplot2::scale_x_continuous("1 - specificity", limits=c(0,1)) + - ggplot2::scale_y_continuous("Sensitivity", limits=c(0,1)) + + ggplot2::scale_x_continuous("1 - specificity", limits = c(0,1)) + + ggplot2::scale_y_continuous("Sensitivity", limits = c(0,1)) + ggplot2::ggtitle(evalType) } - plot <- gridExtra::marrangeGrob(plots, nrow=length(plots), ncol=1) + plot <- gridExtra::marrangeGrob(plots, nrow =length(plots), ncol = 1) if (!is.null(saveLocation)){ if(!dir.exists(saveLocation)){ @@ -567,11 +567,11 @@ plotPrecisionRecall <- function( dplyr::select("positivePredictiveValue", "sensitivity") plots[[i]] <- ggplot2::ggplot(x, ggplot2::aes(.data$sensitivity, .data$positivePredictiveValue)) + - ggplot2::geom_line(size=1) + + ggplot2::geom_line(linewidth=1) + ggplot2::scale_x_continuous("Recall")+#, limits=c(0,1)) + ggplot2::scale_y_continuous("Precision") + #, limits=c(0,1)) ggplot2::geom_hline(yintercept = inc, linetype="dashed", - color = "red", size=1) + + color = "red", linewidth = 1) + ggplot2::ggtitle(evalType) } @@ -630,8 +630,8 @@ plotF1Measure <- function( } plots[[i]] <- ggplot2::ggplot(x, ggplot2::aes(.data$predictionThreshold, .data$f1Score)) + - ggplot2::geom_line(size=1) + - ggplot2::geom_point(size=1) + + ggplot2::geom_line(linewidth = 1) + + ggplot2::geom_point(size = 1) + ggplot2::scale_x_continuous("predictionThreshold")+#, limits=c(0,1)) + ggplot2::scale_y_continuous("F1Score") +#, limits=c(0,1)) ggplot2::ggtitle(evalType) @@ -844,9 +844,9 @@ plotSparseCalibration <- function( )) + ggplot2::geom_ribbon(ggplot2::aes(ymin=.data$lci,ymax=.data$uci, x=x), fill="blue", alpha=0.2) + - ggplot2::geom_point(size=1, color='darkblue') + + ggplot2::geom_point(size = 1, color='darkblue') + ggplot2::coord_cartesian(ylim = c(0, maxVal), xlim =c(0,maxVal)) + - ggplot2::geom_abline(intercept = 0, slope = 1, linetype = 2, size=1, + ggplot2::geom_abline(intercept = 0, slope = 1, linetype = 2, linewidth = 1, show.legend = TRUE) + ggplot2::geom_abline(intercept = res['Intercept'], slope = res['Gradient'], linetype = 1,show.legend = TRUE, @@ -919,7 +919,7 @@ plotSparseCalibration2 
<- function( ggplot2::geom_point(size=2, color='black') + ggplot2::geom_errorbar(limits) + ggplot2::geom_line(colour='darkgrey') + - ggplot2::geom_abline(intercept = 0, slope = 1, linetype = 5, size=0.4, + ggplot2::geom_abline(intercept = 0, slope = 1, linetype = 5, linewidth=0.4, show.legend = TRUE) + ggplot2::scale_x_continuous("Average Predicted Probability") + ggplot2::scale_y_continuous("Observed Fraction With Outcome") + @@ -1248,15 +1248,14 @@ plotSmoothCalibrationLoess <- function(data, span = 0.75) { fill = "blue", alpha = 0.2 ) + - ggplot2::geom_segment( - ggplot2::aes( - x = 0, - xend = 1, - y = 0, - yend = 1, - color = "Ideal", - linetype = "Ideal" - ) + ggplot2::annotate( + geom = "segment", + x = 0, + xend = 1, + y = 0, + yend = 1, + color = "red", + linetype = "dashed" ) + ggplot2::scale_linetype_manual( name = "Models", @@ -1442,43 +1441,44 @@ plotPredictionDistribution <- function( plots <- list() length(plots) <- length(evalTypes) - for(i in 1:length(evalTypes)){ + for (i in 1:length(evalTypes)) { evalType <- evalTypes[i] x <- plpResult$performanceEvaluation$predictionDistribution %>% dplyr::filter(.data[[typeColumn]] == evalType) - non05 <- x$P05PredictedProbability[x$class==0] - non95 <- x$P95PredictedProbability[x$class==0] - one05 <- x$P05PredictedProbability[x$class==1] - one95 <- x$P95PredictedProbability[x$class==1] - - plots[[i]] <- ggplot2::ggplot(x, ggplot2::aes(x=as.factor(.data$class), - ymin=.data$MinPredictedProbability, - lower=.data$P25PredictedProbability, - middle=.data$MedianPredictedProbability, - upper=.data$P75PredictedProbability, - ymax=.data$MaxPredictedProbability, - color=as.factor(.data$class))) + + non05 <- x$P05PredictedProbability[x$class == 0] + non95 <- x$P95PredictedProbability[x$class == 0] + one05 <- x$P05PredictedProbability[x$class == 1] + one95 <- x$P95PredictedProbability[x$class == 1] + + plots[[i]] <- ggplot2::ggplot(x, + ggplot2::aes(x = as.factor(.data$class), + ymin = .data$MinPredictedProbability, + lower = .data$P25PredictedProbability, + middle = .data$MedianPredictedProbability, + upper = .data$P75PredictedProbability, + ymax = .data$MaxPredictedProbability, + color = as.factor(.data$class))) + ggplot2::coord_flip() + - ggplot2::geom_boxplot(stat="identity") + + ggplot2::geom_boxplot(stat = "identity") + ggplot2::scale_x_discrete("Class") + ggplot2::scale_y_continuous("Predicted Probability") + - ggplot2::theme(legend.position="none") + - ggplot2::geom_segment(ggplot2::aes(x = 0.9, y = non05, - xend = 1.1, yend = non05), color='red') + - ggplot2::geom_segment(ggplot2::aes(x = 0.9, y = non95, - xend = 1.1, yend = non95), color='red') + - ggplot2::geom_segment(ggplot2::aes(x = 1.9, y = one05, - xend = 2.1, yend = one05)) + - ggplot2::geom_segment(ggplot2::aes(x = 1.9, y = one95, - xend = 2.1, yend = one95)) + - ggplot2::ggtitle(evalType) + ggplot2::theme(legend.position = "none") + + ggplot2::annotate("segment", x = 0.9, xend = 1.1, y = non05, yend = non05, + color = "red") + + ggplot2::annotate("segment", x = 0.9, xend = 1.1, y = non95, yend = non95, + color = "red") + + ggplot2::annotate("segment", x = 1.9, xend = 2.1, y = one05, yend = one05, + color = "#00BFC4") + + ggplot2::annotate("segment", x = 1.9, xend = 2.1, y = one95, yend = one95, + color = "#00BFC4") + + ggplot2::ggtitle(evalType) } - plot <- gridExtra::marrangeGrob(plots, nrow=length(plots), ncol=1) + plot <- gridExtra::marrangeGrob(plots, nrow = length(plots), ncol = 1) - if (!is.null(saveLocation)){ - if(!dir.exists(saveLocation)){ + if (!is.null(saveLocation)) {
if (!dir.exists(saveLocation)) { dir.create(saveLocation, recursive = T) } ggplot2::ggsave(file.path(saveLocation, fileName), plot, width = 5, height = 4.5, dpi = 400) diff --git a/R/PopulationSettings.R b/R/PopulationSettings.R index e35df3b67..098a784f7 100644 --- a/R/PopulationSettings.R +++ b/R/PopulationSettings.R @@ -215,6 +215,9 @@ createStudyPopulation <- function( if (is.null(population)) { population <- plpData$cohorts + } else { + population <- plpData$cohorts %>% + dplyr::filter(.data$rowId %in% (population %>% dplyr::pull(.data$rowId))) } # save the metadata (should have the ?targetId, outcomeId, plpDataSettings and population settings) @@ -259,9 +262,9 @@ createStudyPopulation <- function( # get the outcomes during TAR - outcomeTAR <- population %>% - dplyr::inner_join(plpData$outcomes, by ='rowId') %>% - dplyr::filter(.data$outcomeId == get('oId')) %>% + outcomeTAR <- plpData$outcomes %>% + dplyr::filter(.data$outcomeId == get("oId")) %>% + dplyr::inner_join(population, by = "rowId") %>% dplyr::select("rowId", "daysToEvent", "tarStart", "tarEnd") %>% dplyr::filter(.data$daysToEvent >= .data$tarStart & .data$daysToEvent <= .data$tarEnd) @@ -294,10 +297,12 @@ createStudyPopulation <- function( if (firstExposureOnly) { ParallelLogger::logTrace(paste("Restricting to first exposure")) - population <- population %>% - dplyr::arrange(.data$subjectId,.data$cohortStartDate) %>% - dplyr::group_by(.data$subjectId) %>% - dplyr::filter(dplyr::row_number(.data$subjectId)==1) + if (nrow(population) > dplyr::n_distinct(population$subjectId)) { + population <- population %>% + dplyr::arrange(.data$subjectId,.data$cohortStartDate) %>% + dplyr::group_by(.data$subjectId) %>% + dplyr::filter(dplyr::row_number(.data$subjectId)==1) + } attrRow <- population %>% dplyr::group_by() %>% dplyr::summarise(outcomeId = get('oId'), @@ -329,9 +334,9 @@ createStudyPopulation <- function( ParallelLogger::logTrace("Removing subjects with prior outcomes (if any)") # get the outcomes during TAR - outcomeBefore <- population %>% - dplyr::inner_join(plpData$outcomes, by ='rowId') %>% + outcomeBefore <- plpData$outcomes %>% dplyr::filter(outcomeId == get('oId')) %>% + dplyr::inner_join(population, by = 'rowId') %>% dplyr::select("rowId", "daysToEvent", "tarStart") %>% dplyr::filter(.data$daysToEvent < .data$tarStart & .data$daysToEvent > -get('priorOutcomeLookback') ) diff --git a/R/Predict.R b/R/Predict.R index e84d48eec..14eff8787 100644 --- a/R/Predict.R +++ b/R/Predict.R @@ -168,7 +168,7 @@ applyTidyCovariateData <- function( if(!is.null(maxs)){ if('bins'%in%colnames(maxs)){ - covariateData$maxes <- tibble::as_tibble(maxs) %>% dplyr::rename(covariateId = "bins") %>% + covariateData$maxes <- dplyr::as_tibble(maxs) %>% dplyr::rename(covariateId = "bins") %>% dplyr::rename(maxValue = "maxs") } else{ covariateData$maxes <- maxs diff --git a/R/RClassifier.R b/R/RClassifier.R index 69b575193..a74e2358b 100644 --- a/R/RClassifier.R +++ b/R/RClassifier.R @@ -83,7 +83,8 @@ fitRclassifier <- function( trainDetails = list( analysisId = analysisId, analysisSource = '', #TODO add from model - developmentDatabase = attr(trainData, "metaData")$cdmDatabaseSchema, + developmentDatabase = attr(trainData, "metaData")$cdmDatabaseName, + developmentDatabaseSchema = attr(trainData, "metaData")$cdmDatabaseSchema, attrition = attr(trainData, "metaData")$attrition, trainingTime = paste(as.character(abs(comp)), attr(comp,'units')), trainingDate = Sys.Date(), diff --git a/R/RunMultiplePlp.R b/R/RunMultiplePlp.R index eff802409..9e0e6e5aa 
100644 --- a/R/RunMultiplePlp.R +++ b/R/RunMultiplePlp.R @@ -175,7 +175,7 @@ runMultiplePlp <- function( if(!analysisExists){ plpData <- PatientLevelPrediction::loadPlpData(file.path(saveDirectory, settings$dataLocation)) runPlpSettings <- list( - plpData = plpData, + plpData = quote(plpData), outcomeId = modelDesign$outcomeId, analysisId = settings$analysisId, populationSettings = modelDesign$populationSettings, diff --git a/R/RunPlp.R b/R/RunPlp.R index a711101e6..3d94c4725 100644 --- a/R/RunPlp.R +++ b/R/RunPlp.R @@ -268,21 +268,23 @@ runPlp <- function( }) # create the population - population <- tryCatch( - { - do.call( - createStudyPopulation, - list( - plpData = plpData, - outcomeId = outcomeId, - populationSettings = populationSettings, - population = plpData$population - ) - ) - }, + if(!is.null(plpData$population)) { + ParallelLogger::logInfo('Using existing population') + population <- plpData$population + } else { + ParallelLogger::logInfo('Creating population') + population <- tryCatch({ + do.call(createStudyPopulation, + list(plpData = plpData, + outcomeId = outcomeId, + populationSettings = populationSettings, + population = plpData$population + ) + )}, error = function(e){ParallelLogger::logError(e); return(NULL)} - ) - + ) + } + if(is.null(population)){ stop('population NULL') } diff --git a/R/Simulation.R b/R/Simulation.R index ae9c8e34a..b1d115c14 100644 --- a/R/Simulation.R +++ b/R/Simulation.R @@ -162,7 +162,8 @@ simulatePlpData <- function(plpDataSimulationProfile, n = 10000) { metaData = list() metaData$databaseDetails <- list( - cdmDatabaseSchema = 'Profile', + cdmDatabaseSchema = 'CDM_SCHEMA', + cdmDatabaseName = "CDM_NAME", outcomeDatabaseSchema = NULL, cohortDatabaseSchema = NULL, connectionDetails = NULL, diff --git a/R/SklearnClassifier.R b/R/SklearnClassifier.R index 4e386b705..8d4a2603c 100644 --- a/R/SklearnClassifier.R +++ b/R/SklearnClassifier.R @@ -115,7 +115,8 @@ fitSklearn <- function( trainDetails = list( analysisId = analysisId, analysisSource = '', #TODO add from model - developmentDatabase = attr(trainData, "metaData")$cdmDatabaseSchema, + developmentDatabase = attr(trainData, "metaData")$cdmDatabaseName, + developmentDatabaseSchema = attr(trainData, "metaData")$cdmDatabaseSchema, attrition = attr(trainData, "metaData")$attrition, trainingTime = paste(as.character(abs(comp)), attr(comp,'units')), trainingDate = Sys.Date(), diff --git a/R/ThresholdSummary.R b/R/ThresholdSummary.R index b35b0e152..b36d86014 100644 --- a/R/ThresholdSummary.R +++ b/R/ThresholdSummary.R @@ -205,17 +205,19 @@ getThresholdSummary_survival <- function(prediction, evalColumn, timepoint, ...) 
) nbSummary <- tryCatch( - { + { xstart <- max(min(preddat$p),0.001); + xstop <- min(max(preddat$p),0.99); stdca( data = preddat, outcome = "y", ttoutcome = "t", timepoint = timepoint, predictors = "p", - xstart = max(min(preddat$p),0.001), #0.001, - xstop = min(max(preddat$p),0.99), - xby = 0.001, - smooth=F + xstart = xstart, + xstop = xstop, + xby = (xstop - xstart)/100, + smooth = FALSE, + graph = FALSE ) }, error = function(e){ParallelLogger::logError(e); return(NULL)} diff --git a/inst/doc/BuildingDeepLearningModels.pdf b/inst/doc/BuildingDeepLearningModels.pdf deleted file mode 100644 index a496daccb..000000000 Binary files a/inst/doc/BuildingDeepLearningModels.pdf and /dev/null differ diff --git a/inst/doc/BuildingEnsembleModels.pdf b/inst/doc/BuildingEnsembleModels.pdf deleted file mode 100644 index 37e714ebf..000000000 Binary files a/inst/doc/BuildingEnsembleModels.pdf and /dev/null differ diff --git a/man/PatientLevelPrediction.Rd b/man/PatientLevelPrediction.Rd index c3d7c5c47..8bc15fc71 100644 --- a/man/PatientLevelPrediction.Rd +++ b/man/PatientLevelPrediction.Rd @@ -2,9 +2,32 @@ % Please edit documentation in R/PatientLevelPrediction.R \docType{package} \name{PatientLevelPrediction} +\alias{PatientLevelPrediction-package} \alias{PatientLevelPrediction} \title{PatientLevelPrediction} \description{ A package for running predictions using data in the OMOP CDM +} +\seealso{ +Useful links: +\itemize{ + \item \url{https://ohdsi.github.io/PatientLevelPrediction} + \item \url{https://github.com/OHDSI/PatientLevelPrediction} + \item Report bugs at \url{https://github.com/OHDSI/PatientLevelPrediction/issues} +} + +} +\author{ +\strong{Maintainer}: Jenna Reps \email{jreps@its.jnj.com} + +Authors: +\itemize{ + \item Martijn Schuemie + \item Marc Suchard + \item Patrick Ryan + \item Peter Rijnbeek + \item Egill Fridgeirsson +} + } \keyword{internal} diff --git a/man/getCohortCovariateData.Rd b/man/getCohortCovariateData.Rd index ff152f84b..d5333d646 100644 --- a/man/getCohortCovariateData.Rd +++ b/man/getCohortCovariateData.Rd @@ -12,7 +12,7 @@ getCohortCovariateData( cohortTable = "#cohort_person", rowIdField = "row_id", aggregated, - cohortId, + cohortIds, covariateSettings ) } @@ -31,7 +31,7 @@ getCohortCovariateData( \item{aggregated}{whether the covariate should be aggregated} -\item{cohortId}{cohort id for the target cohort} +\item{cohortIds}{cohort id for the target cohort} \item{covariateSettings}{settings for the covariate cohorts and time periods} } diff --git a/tests/testthat/helper-expectations.R b/tests/testthat/helper-expectations.R index 5405c008d..25210fdf6 100644 --- a/tests/testthat/helper-expectations.R +++ b/tests/testthat/helper-expectations.R @@ -7,13 +7,14 @@ expect_correct_fitPlp <- function(plpModel, trainData) { expect_equal(NROW(trainData$labels)*multiplicativeFactor, NROW(plpModel$prediction)) # predictions are all between 0 and 1 - expect_true(all((plpModel$prediction$value>=0) & (plpModel$prediction$value<=1))) + expect_true(all((plpModel$prediction$value >= 0) & + (plpModel$prediction$value <= 1))) # model directory exists expect_true(dir.exists(plpModel$model)) - expect_equal(plpModel$modelDesign$outcomeId,2) - expect_equal(plpModel$modelDesign$targetId,1) + expect_equal(plpModel$modelDesign$outcomeId, outcomeId) + expect_equal(plpModel$modelDesign$targetId, 1) # structure of plpModel is correct expect_equal(names(plpModel), c("model", "preprocessing", "prediction", @@ -23,8 +24,6 @@ expect_correct_fitPlp <- function(plpModel, trainData) { 
expect_correct_predictions <- function(predictions, testData) { # predictions are all between 0 and 1 - expect_true(all((predictions$value>=0) & (predictions$value<=1))) + expect_true(all((predictions$value >= 0) & (predictions$value <= 1))) expect_equal(NROW(testData$labels), NROW(predictions)) - - } diff --git a/tests/testthat/helper-functions.R b/tests/testthat/helper-functions.R index edcbffb8a..3a44091e8 100644 --- a/tests/testthat/helper-functions.R +++ b/tests/testthat/helper-functions.R @@ -12,11 +12,12 @@ copyTrainData <- function(trainData) { } # create tiny dataset with subset of covariates based on lasso fit -createTinyPlpData <- function(plpData, plpResult) { +createTinyPlpData <- function(plpData, plpResult, n= 20) { covariates <- plpResult$model$covariateImportance %>% - dplyr::slice_max(order_by = abs(covariateValue),n = 20, with_ties=F) %>% - dplyr::pull(covariateId) + dplyr::slice_max(order_by = abs(.data$covariateValue), + n = n, with_ties = F) %>% + dplyr::pull(.data$covariateId) tinyPlpData <- plpData tinyPlpData$covariateData <- Andromeda::copyAndromeda(plpData$covariateData) diff --git a/tests/testthat/helper-objects.R b/tests/testthat/helper-objects.R index 29cf1b5c7..d3bd579cb 100644 --- a/tests/testthat/helper-objects.R +++ b/tests/testthat/helper-objects.R @@ -1,6 +1,6 @@ # this files contains the objects used in the tests: -if(Sys.getenv('GITHUB_ACTIONS') == 'true'){ +if (Sys.getenv('GITHUB_ACTIONS') == 'true') { # Download the PostreSQL driver --------------------------- # If DATABASECONNECTOR_JAR_FOLDER exists, assume driver has been downloaded jarFolder <- Sys.getenv("DATABASECONNECTOR_JAR_FOLDER", unset = "") @@ -21,7 +21,7 @@ if(Sys.getenv('GITHUB_ACTIONS') == 'true'){ PatientLevelPrediction::setPythonEnvironment(envname = 'r-reticulate', envtype = "conda") # if mac install nomkl -- trying to fix github actions - if(ifelse(is.null(Sys.info()), F, Sys.info()['sysname'] == 'Darwin')){ + if (ifelse(is.null(Sys.info()), F, Sys.info()['sysname'] == 'Darwin')){ reticulate::conda_install(envname = 'r-reticulate', packages = c('nomkl'), forge = TRUE, pip = FALSE, pip_ignore_installed = TRUE, conda = "auto") @@ -36,12 +36,34 @@ dir.create(saveLoc) #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ # simulated data Tests #@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - -data(plpDataSimulationProfile, envir = environment()) +data("plpDataSimulationProfile") # PLPDATA -sampleSize <- 1500+sample(300,1) -plpData <- simulatePlpData(plpDataSimulationProfile, n = sampleSize) +connectionDetails <- Eunomia::getEunomiaConnectionDetails() +Eunomia::createCohorts(connectionDetails) +outcomeId <- 3 # GIbleed + +databaseDetails <- createDatabaseDetails( + connectionDetails = connectionDetails, + cdmDatabaseSchema = "main", + cdmDatabaseName = "main", + cohortDatabaseSchema = "main", + cohortTable = "cohort", + outcomeDatabaseSchema = "main", + outcomeTable = "cohort", + targetId = 1, + outcomeIds = outcomeId, + cdmVersion = 5) + +covariateSettings <- FeatureExtraction::createCovariateSettings( + useDemographicsAge = TRUE, + useDemographicsGender = TRUE, + useConditionOccurrenceAnyTimePrior = TRUE +) + +plpData <- getPlpData(databaseDetails = databaseDetails, + covariateSettings = covariateSettings, + restrictPlpDataSettings = createRestrictPlpDataSettings()) # POPULATION populationSettings <- createStudyPopulationSettings( @@ -50,7 +72,7 @@ populationSettings <- createStudyPopulationSettings( removeSubjectsWithPriorOutcome = FALSE, priorOutcomeLookback = 99999, requireTimeAtRisk = T, - 
minTimeAtRisk=10, + minTimeAtRisk = 10, riskWindowStart = 0, startAnchor = 'cohort start', riskWindowEnd = 365, @@ -59,12 +81,12 @@ populationSettings <- createStudyPopulationSettings( # MODEL SETTINGS -lrSet <- setLassoLogisticRegression() +lrSet <- setLassoLogisticRegression(seed = 42) # RUNPLP - LASSO LR plpResult <- runPlp( plpData = plpData, - outcomeId = 2, + outcomeId = outcomeId, analysisId = 'Test', analysisName = 'Testing analysis', populationSettings = populationSettings, @@ -80,7 +102,7 @@ plpResult <- runPlp( # now diagnose diagnoseResult <- diagnosePlp( plpData = plpData, - outcomeId = 2, + outcomeId = outcomeId, analysisId = 'Test', populationSettings = populationSettings, splitSettings = createDefaultSplitSetting(splitSeed = 12), @@ -101,12 +123,13 @@ diagnoseResult <- diagnosePlp( population <- createStudyPopulation( plpData = plpData, - outcomeId = 2, + outcomeId = outcomeId, populationSettings = populationSettings ) createTrainData <- function(plpData, population){ - data <- PatientLevelPrediction::splitData(plpData = plpData, population=population, + data <- PatientLevelPrediction::splitData(plpData = plpData, + population = population, splitSettings = PatientLevelPrediction::createDefaultSplitSetting(splitSeed = 12)) trainData <- data$Train return(trainData) @@ -115,7 +138,8 @@ createTrainData <- function(plpData, population){ trainData <- createTrainData(plpData, population) createTestData <- function(plpData, population){ - data <- PatientLevelPrediction::splitData(plpData = plpData, population=population, + data <- PatientLevelPrediction::splitData(plpData = plpData, + population = population, splitSettings = PatientLevelPrediction::createDefaultSplitSetting(splitSeed = 12)) testData <- data$Test return(testData) @@ -123,11 +147,11 @@ createTestData <- function(plpData, population){ testData <- createTestData(plpData, population) -# reduced trainData to only use 10 most important features (as decided by LR) -reduceTrainData <- function(trainData) { +# reduced trainData to only use n most important features (as decided by LR) +reduceTrainData <- function(trainData, n=20) { covariates <- plpResult$model$covariateImportance %>% - dplyr::slice_max(order_by = abs(covariateValue),n = 20, with_ties = F) %>% - dplyr::pull(covariateId) + dplyr::slice_max(order_by = abs(.data$covariateValue),n = n, with_ties = F) %>% + dplyr::pull(.data$covariateId) reducedTrainData <- list(labels = trainData$labels, folds = trainData$folds, @@ -151,3 +175,16 @@ tinyTrainData <- reduceTrainData(trainData) tinyPlpData <- createTinyPlpData(plpData, plpResult) +nanoData <- createTinyPlpData(plpData, plpResult, n = 2) +tinyResults <- runPlp(plpData = nanoData, + populationSettings = populationSettings, + outcomeId = outcomeId, + analysisId = 'tinyFit', + executeSettings = createExecuteSettings( + runSplitData = T, + runSampleData = F, + runfeatureEngineering = F, + runPreprocessData = T, + runModelDevelopment = T, + runCovariateSummary = F + )) diff --git a/tests/testthat/test-KNN.R b/tests/testthat/test-KNN.R index 421a55fc2..8383b451a 100644 --- a/tests/testthat/test-KNN.R +++ b/tests/testthat/test-KNN.R @@ -1,16 +1,20 @@ test_that('KNN fit works', { - modelSettings = setKNN(k=5) - + modelSettings = setKNN(k = 2) + nanoTrainData <- reduceTrainData(tinyTrainData, n = 2) + subjectToKeep <- nanoTrainData$labels[sample.int(nrow(nanoTrainData$labels), 50),"rowId"] + nanoTrainData$labels <- nanoTrainData$labels[nanoTrainData$labels$rowId %in% subjectToKeep,] + nanoTrainData$folds <- 
nanoTrainData$folds[nanoTrainData$folds$rowId %in% subjectToKeep,] + nanoTrainData$covariateData$covariates <- nanoTrainData$covariateData$covariates %>% dplyr::filter(.data$rowId %in% subjectToKeep) plpModel <- fitPlp( - trainData = tinyTrainData, + trainData = nanoTrainData, modelSettings = modelSettings, analysisId = 'KNN', analysisPath = tempdir() ) - expect_correct_fitPlp(plpModel, tinyTrainData) + expect_correct_fitPlp(plpModel, nanoTrainData) }) diff --git a/tests/testthat/test-LightGBM.R b/tests/testthat/test-LightGBM.R index f3ddef8f6..35e742cfb 100644 --- a/tests/testthat/test-LightGBM.R +++ b/tests/testthat/test-LightGBM.R @@ -111,7 +111,7 @@ test_that("LightGBM working checks", { expect_lte(nrow(fitModel$covariateImportance), trainData$covariateData$covariateRef %>% dplyr::tally() %>% dplyr::pull()) - expect_equal(fitModel$modelDesign$outcomeId, 2) + expect_equal(fitModel$modelDesign$outcomeId, outcomeId) expect_equal(fitModel$modelDesign$targetId, 1) # TODO check other model design values? diff --git a/tests/testthat/test-UploadToDatabase.R b/tests/testthat/test-UploadToDatabase.R index 3184ddf13..ca30694c6 100644 --- a/tests/testthat/test-UploadToDatabase.R +++ b/tests/testthat/test-UploadToDatabase.R @@ -129,7 +129,7 @@ test_that("results uploaded to database", { } plpResult$model$validationDetails <- list( targetId = 1, - outcomeId = 3, + outcomeId = outcomeId, developmentDatabase = 'test', validationDatabase = 'test', populationSettings = plpResult$model$modelDesign$populationSettings, diff --git a/tests/testthat/test-covariateExtras.R b/tests/testthat/test-covariateExtras.R index a454761d7..17c9e04ec 100644 --- a/tests/testthat/test-covariateExtras.R +++ b/tests/testthat/test-covariateExtras.R @@ -18,10 +18,6 @@ library("testthat") context("CovariateExtras") -connectionDetails <- Eunomia::getEunomiaConnectionDetails() -Eunomia::createCohorts(connectionDetails) - - test_that("settings creation", { covSet <- createCohortCovariateSettings( @@ -113,7 +109,7 @@ covs <- FeatureExtraction::getDbCovariateData( cohortTable = "cohort", cohortDatabaseSchema = "main", cohortTableIsTemp = F, - cohortId = 1, + cohortIds = c(1), rowIdField = 'rowId', covariateSettings = covSet, aggregated = F diff --git a/tests/testthat/test-dataSplitting.R b/tests/testthat/test-dataSplitting.R index bc05e4729..b8ce628bb 100644 --- a/tests/testthat/test-dataSplitting.R +++ b/tests/testthat/test-dataSplitting.R @@ -22,7 +22,7 @@ context("Data splitting") populationT <- plpData$cohorts populationT$outcomeCount <- sample(c(0,1), nrow(populationT), replace = T) -attr(populationT, "metaData")$outcomeId <- 2 +attr(populationT, "metaData")$outcomeId <- outcomeId attr(populationT, "metaData")$populationSettings <- list(madeup = T) attr(populationT, "metaData")$restrictPlpDataSettings <- list(madeup = T) attr(populationT, "metaData")$attrition <- c(1,2,3) diff --git a/tests/testthat/test-diagnostic.R b/tests/testthat/test-diagnostic.R index 3acb2f5ae..2c261dd16 100644 --- a/tests/testthat/test-diagnostic.R +++ b/tests/testthat/test-diagnostic.R @@ -62,8 +62,8 @@ test_that("getMaxEndDaysFromCovariates works", { test_that("test diagnosePlp works", { test <- diagnosePlp( - plpData = plpData, - outcomeId = 2, + plpData = tinyPlpData, + outcomeId = outcomeId, analysisId = 'diagnoseTest', populationSettings = createStudyPopulationSettings( riskWindowStart = 1, @@ -112,28 +112,14 @@ test_that("test diagnosePlp works", { test_that("test diagnoseMultiplePlp works", { - connectionDetails <- 
Eunomia::getEunomiaConnectionDetails() - Eunomia::createCohorts(connectionDetails) - - databaseDetails <- createDatabaseDetails( - connectionDetails = connectionDetails, - cdmDatabaseSchema = "main", - cdmDatabaseName = "main", - cohortDatabaseSchema = "main", - cohortTable = "cohort", - outcomeDatabaseSchema = "main", - outcomeTable = "cohort", - targetId = 1, - outcomeIds = 3, #make this ids - cdmVersion = 5 - ) - analysis1 <- createModelDesign( targetId = 1, - outcomeId = 3, - restrictPlpDataSettings = createRestrictPlpDataSettings(firstExposureOnly = F, washoutPeriod = 0), + outcomeId = outcomeId, + restrictPlpDataSettings = createRestrictPlpDataSettings(firstExposureOnly = F, + washoutPeriod = 0, + sampleSize = 100), populationSettings = createStudyPopulationSettings(), - covariateSettings = FeatureExtraction::createDefaultCovariateSettings(), + covariateSettings = covariateSettings, featureEngineeringSettings = NULL, sampleSettings = NULL, splitSettings = createDefaultSplitSetting(), @@ -143,10 +129,12 @@ test_that("test diagnoseMultiplePlp works", { analysis2 <- createModelDesign( targetId = 1, - outcomeId = 3, - restrictPlpDataSettings = createRestrictPlpDataSettings(firstExposureOnly = F, washoutPeriod = 0), + outcomeId = outcomeId, + restrictPlpDataSettings = createRestrictPlpDataSettings(firstExposureOnly = F, + washoutPeriod = 0, + sampleSize = 100), populationSettings = createStudyPopulationSettings(washoutPeriod = 400), - covariateSettings = FeatureExtraction::createDefaultCovariateSettings(), + covariateSettings = covariateSettings, featureEngineeringSettings = NULL, sampleSettings = NULL, splitSettings = createDefaultSplitSetting(), @@ -161,7 +149,7 @@ test_that("test diagnoseMultiplePlp works", { analysis2 ), cohortDefinitions = data.frame( - cohortId = c(1,3), + cohortId = c(1, outcomeId), cohortName = c('target', 'outcome') ), saveDirectory = file.path(saveLoc, 'diagnosticsMultiple') diff --git a/tests/testthat/test-evaluation.R b/tests/testthat/test-evaluation.R index 0133caed5..6669aa488 100644 --- a/tests/testthat/test-evaluation.R +++ b/tests/testthat/test-evaluation.R @@ -38,15 +38,15 @@ test_that("modelBasedConcordance", { }) test_that("evaluatePlp_survival", { - + N <- 100 plpResultSurvivalPred <- data.frame( - rowId = 1:300, - ageYear = sample(100, 300, replace = T), - gender = sample(c('8507','8532'), 300, replace = T), - outcomeCount = c(rep(1,40), rep(0,260)), - value = runif(300), - evaluationType = rep('Train', 300), - survivalTime = sample(2000, 300, replace = T) + rowId = 1:N, + ageYear = sample(100, N, replace = T), + gender = sample(c('8507','8532'), N, replace = T), + outcomeCount = c(rep(1,N*0.1), rep(0,N*0.9)), + value = runif(N, max=0.1), + evaluationType = rep('Train', N), + survivalTime = sample(2000, N, replace = T) ) attr(plpResultSurvivalPred, "metaData")$modelType <- 'survival' attr(plpResultSurvivalPred, 'metaData')$timepoint <- 365 diff --git a/tests/testthat/test-extractData.R b/tests/testthat/test-extractData.R index 45596858f..0e15eb37a 100644 --- a/tests/testthat/test-extractData.R +++ b/tests/testthat/test-extractData.R @@ -17,7 +17,7 @@ context("extractPlp") test_that("summary.plpData", { - attr(plpData$outcomes, "metaData")$outcomeIds <- c(2,3) + attr(plpData$outcomes, "metaData")$outcomeIds <- c(outcomeId) sum <- summary.plpData(plpData) testthat::expect_equal(class(sum),'summary.plpData') }) @@ -55,7 +55,7 @@ test_that("createDatabaseDetails with NULL cdmDatabaseId errors", { cdmDatabaseSchema = 'main', cdmDatabaseId = NULL, targetId 
= 1, - outcomeIds = 2 + outcomeIds = outcomeId )) }) diff --git a/tests/testthat/test-featureImportance.R b/tests/testthat/test-featureImportance.R index 193f43b41..d0140adda 100644 --- a/tests/testthat/test-featureImportance.R +++ b/tests/testthat/test-featureImportance.R @@ -21,21 +21,24 @@ context("FeatureImportance") test_that("pfi feature importance returns data.frame", { - # limit to a sample of 10 covariates for faster test + # limit to a sample of 2 covariates for faster test covariates <- plpResult$model$covariateImportance %>% dplyr::filter("covariateValue" != 0) %>% dplyr::select("covariateId") %>% + dplyr::arrange(desc("covariateValue")) %>% dplyr::pull() # if the model had non-zero covariates if(length(covariates) > 0){ - covariates <- sample(covariates, min(10,length(covariates))) + if (length(covariates) > 2) { + covariates <- covariates[1:2] + } pfiTest <- pfi(plpResult, population, plpData, repeats = 1, - covariates = covariates, cores = NULL, log = NULL, + covariates = covariates, cores = 1, log = NULL, logthreshold = "INFO") expect_equal(class(pfiTest), 'data.frame') - expect_equal(sum(names(pfiTest)%in%c("covariateId", "pfi")), 2) + expect_equal(sum(names(pfiTest) %in% c("covariateId", "pfi")), 2) expect_true(all(!is.nan(pfiTest$pfi))) } @@ -43,25 +46,12 @@ test_that("pfi feature importance returns data.frame", { }) test_that('pfi feature importance works with logger or without covariates', { - tinyResults <- runPlp(plpData = tinyPlpData, - populationSettings = populationSettings, - outcomeId = 2, - analysisId = 'tinyFit', - featureEngineeringSettings = createUnivariateFeatureSelection(k=20), - executeSettings = createExecuteSettings( - runSplitData = T, - runSampleData = F, - runfeatureEngineering = T, - runPreprocessData = T, - runModelDevelopment = T, - runCovariateSummary = F - )) - - pfiTest <- pfi(tinyResults, population, tinyPlpData, + + pfiTest <- pfi(tinyResults, population, nanoData, cores = 1, covariates = NULL, log = file.path(tempdir(), 'pfiLog')) expect_equal(class(pfiTest), 'data.frame') - expect_equal(sum(names(pfiTest)%in%c("covariateId", "pfi")), 2) + expect_equal(sum(names(pfiTest) %in% c("covariateId", "pfi")), 2) expect_true(all(!is.nan(pfiTest$pfi))) }) diff --git a/tests/testthat/test-formatting.R b/tests/testthat/test-formatting.R index 63e47dd60..0e3ac0121 100644 --- a/tests/testthat/test-formatting.R +++ b/tests/testthat/test-formatting.R @@ -115,7 +115,7 @@ test_that("toSparseM", { outcomes=2)) outcomes <- data.frame(rowId=c(1,2), - outcomeId=rep(2,2), + outcomeId=rep(outcomeId,2), daysToEvent=c(150,40)) FplpData <- list(cohorts=cohorts, diff --git a/tests/testthat/test-learningCurves.R b/tests/testthat/test-learningCurves.R index a51537209..15da60bcf 100644 --- a/tests/testthat/test-learningCurves.R +++ b/tests/testthat/test-learningCurves.R @@ -19,7 +19,7 @@ context("LearningCurves") # learningCurve learningCurve <- PatientLevelPrediction::createLearningCurve( plpData = plpData, - outcomeId = 2, parallel = F, cores = -1, + outcomeId = outcomeId, parallel = F, cores = -1, modelSettings = setLassoLogisticRegression(), saveDirectory = file.path(saveLoc, 'lcc'), splitSettings = createDefaultSplitSetting(testFraction = 0.2, nfold=2), @@ -68,10 +68,11 @@ test_that("getTrainFractions works", { learningCurve <- PatientLevelPrediction::createLearningCurve( plpData = tinyPlpData, - outcomeId = 2, parallel = F, cores = -1, - modelSettings = setLassoLogisticRegression(), + outcomeId = outcomeId, parallel = F, cores = -1, + modelSettings = 
setLassoLogisticRegression(seed = 42), saveDirectory = file.path(saveLoc, 'lcc'), - splitSettings = createDefaultSplitSetting(testFraction = 0.33, nfold=2), + splitSettings = createDefaultSplitSetting(testFraction = 0.33, nfold = 2, + splitSeed = 42), trainEvents = c(150,200), preprocessSettings = createPreprocessSettings( minFraction = 0.001, @@ -79,7 +80,7 @@ test_that("getTrainFractions works", { ) ) testthat::expect_true(is.data.frame(learningCurve)) - testthat::expect_equal(sum(colnames(learningCurve)%in%c( + testthat::expect_equal(sum(colnames(learningCurve) %in% c( "trainFraction", "Train_AUROC", "nPredictors", diff --git a/tests/testthat/test-multiplePlp.R b/tests/testthat/test-multiplePlp.R index 8b4a0aad6..14cf651dc 100644 --- a/tests/testthat/test-multiplePlp.R +++ b/tests/testthat/test-multiplePlp.R @@ -17,29 +17,12 @@ library("testthat") context("MultiplePlp") -connectionDetails <- Eunomia::getEunomiaConnectionDetails() -Eunomia::createCohorts(connectionDetails) - -databaseDetails <- createDatabaseDetails( - connectionDetails = connectionDetails, - cdmDatabaseSchema = "main", - cdmDatabaseName = "main", - cohortDatabaseSchema = "main", - cohortTable = "cohort", - outcomeDatabaseSchema = "main", - outcomeTable = "cohort", - targetId = 1, - outcomeIds = 3, #make this ids - cdmVersion = 5 - ) - - analysis1 <- createModelDesign( targetId = 1, - outcomeId = 3, + outcomeId = outcomeId, restrictPlpDataSettings = createRestrictPlpDataSettings(firstExposureOnly = F, washoutPeriod = 0), populationSettings = createStudyPopulationSettings(), - covariateSettings = FeatureExtraction::createDefaultCovariateSettings(), + covariateSettings = covariateSettings, featureEngineeringSettings = NULL, sampleSettings = NULL, splitSettings = createDefaultSplitSetting(splitSeed = 1), @@ -50,9 +33,9 @@ analysis1 <- createModelDesign( test_that("createModelDesign - test working", { expect_equal(analysis1$targetId, 1) - expect_equal(analysis1$outcomeId, 3) + expect_equal(analysis1$outcomeId, outcomeId) expect_equal(analysis1$restrictPlpDataSettings, createRestrictPlpDataSettings(firstExposureOnly = F, washoutPeriod = 0)) - expect_equal(analysis1$covariateSettings, FeatureExtraction::createDefaultCovariateSettings()) + expect_equal(analysis1$covariateSettings, covariateSettings) expect_equal(analysis1$featureEngineeringSettings, list(createFeatureEngineeringSettings(type= "none"))) expect_equal(analysis1$sampleSettings, list(createSampleSettings(type = 'none'))) expect_equal(analysis1$preprocessSettings, createPreprocessSettings()) @@ -103,27 +86,14 @@ test_that("loading analyses settings", { } ) -analysis2 <- createModelDesign( - targetId = 10, - outcomeId = 2, - restrictPlpDataSettings = createRestrictPlpDataSettings(firstExposureOnly = F, washoutPeriod = 9999), - populationSettings = createStudyPopulationSettings(), - covariateSettings = FeatureExtraction::createCovariateSettings(useDemographicsAge = T), - featureEngineeringSettings = NULL, - sampleSettings = NULL, - preprocessSettings = createPreprocessSettings(), - modelSettings = setLassoLogisticRegression(seed = 12) -) - - test_that("test run multiple", { analysis3 <- createModelDesign( targetId = 1, - outcomeId = 3, + outcomeId = outcomeId, restrictPlpDataSettings = createRestrictPlpDataSettings(firstExposureOnly = F, washoutPeriod = 0), populationSettings = createStudyPopulationSettings(), - covariateSettings = FeatureExtraction::createDefaultCovariateSettings(), + covariateSettings = covariateSettings, featureEngineeringSettings = 
createFeatureEngineeringSettings(), sampleSettings = createSampleSettings(), preprocessSettings = createPreprocessSettings(), @@ -134,7 +104,8 @@ test_that("test run multiple", { trainFraction = 0.75, splitSeed = 123, nfold = 3 - ) + ), + runCovariateSummary = FALSE ) runMultiplePlp( diff --git a/tests/testthat/test-plotting.R b/tests/testthat/test-plotting.R index a3989c9b6..17039166e 100644 --- a/tests/testthat/test-plotting.R +++ b/tests/testthat/test-plotting.R @@ -38,7 +38,7 @@ test_that("plots", { test <- plotF1Measure(plpResult, typeColumn = 'evaluation') testthat::expect_s3_class(test, 'arrangelist') - if(!is.null(plpResult$performanceEvaluation$demographicSummary)){ + if (!is.null(plpResult$performanceEvaluation$demographicSummary)) { test <- plotDemographicSummary(plpResult, typeColumn = 'evaluation') testthat::expect_s3_class(test, 'arrangelist') } @@ -52,7 +52,7 @@ test_that("plots", { test <- plotVariableScatterplot(plpResult$covariateSummary) testthat::expect_s3_class(test, 'ggplot') - test <- plotGeneralizability(plpResult$covariateSummary, fileName=NULL) + test <- plotGeneralizability(plpResult$covariateSummary, fileName = NULL) testthat::expect_s3_class(test, 'grob') }) @@ -61,7 +61,7 @@ test_that("plots", { test_that("outcomeSurvivalPlot", { # test the plot works - test <- outcomeSurvivalPlot(plpData = plpData, outcomeId = 2) + test <- outcomeSurvivalPlot(plpData = plpData, outcomeId = outcomeId) testthat::expect_s3_class(test, 'ggsurvplot') testthat::expect_error(outcomeSurvivalPlot()) @@ -82,7 +82,7 @@ test_that("plotPlp", { testthat::expect_equal(dir.exists(file.path(saveLoc,'plots')), T) # expect plots to be there - expect_true(length(dir(file.path(saveLoc,'plots')))>0) + expect_true(length(dir(file.path(saveLoc,'plots'))) > 0) }) @@ -134,10 +134,4 @@ test_that("plotSmoothCalibration", { ) ) -}) - - - - - - +}) \ No newline at end of file diff --git a/tests/testthat/test-population.R b/tests/testthat/test-population.R index 91f533c6e..6add83fee 100644 --- a/tests/testthat/test-population.R +++ b/tests/testthat/test-population.R @@ -293,7 +293,7 @@ test_that("population creation parameters", { studyPopulation <- createStudyPopulation( plpData = plpData, - outcomeId = 3, + outcomeId = outcomeId, populationSettings = defaultSettings() ) @@ -306,7 +306,7 @@ test_that("population creation parameters", { #firstExposureOnly test (should have no effect on simulated data) studyPopulation <- createStudyPopulation( plpData = plpData, - outcomeId = 3, + outcomeId = outcomeId, populationSettings = defaultSettings(firstExposureOnly = T) ) @@ -317,7 +317,7 @@ test_that("population creation parameters", { #requireTimeAtRisk studyPopulation <- createStudyPopulation( plpData = plpData, - outcomeId = 3, + outcomeId = outcomeId, populationSettings = defaultSettings(requireTimeAtRisk = T) ) @@ -330,7 +330,7 @@ test_that("population creation parameters", { expect_warning( createStudyPopulation( plpData = plpData, - outcomeId = 3, + outcomeId = outcomeId, populationSettings = defaultSettings(requireTimeAtRisk = T, minTimeAtRisk = 99999) ) ) @@ -338,7 +338,7 @@ test_that("population creation parameters", { #washoutPeriod = 365, studyPopulation <- createStudyPopulation( plpData = plpData, - outcomeId = 3, + outcomeId = outcomeId, populationSettings = defaultSettings(washoutPeriod = 365) ) nrOutcomes4 <- sum(studyPopulation$outcomeCount) @@ -349,7 +349,7 @@ test_that("population creation parameters", { expect_error( createStudyPopulation( plpData = plpData, - outcomeId = 3, + outcomeId = 
outcomeId, populationSettings = defaultSettings(washoutPeriod = -1) ) ) @@ -358,7 +358,7 @@ expect_error( createStudyPopulation( plpData = plpData, - outcomeId = 3, + outcomeId = outcomeId, populationSettings = defaultSettings( priorOutcomeLookback = -1, removeSubjectsWithPriorOutcome = T ) ) ) @@ -370,7 +370,7 @@ expect_error( createStudyPopulation( plpData = plpData, - outcomeId = 3, + outcomeId = outcomeId, populationSettings = defaultSettings( minTimeAtRisk = -1, requireTimeAtRisk = T ) ) ) @@ -382,7 +382,7 @@ expect_error( createStudyPopulation( plpData = plpData, - outcomeId = 3, + outcomeId = outcomeId, populationSettings = defaultSettings( startAnchor = 'cohort stard' ) ) ) @@ -394,7 +394,7 @@ expect_error( createStudyPopulation( plpData = plpData, - outcomeId = 3, + outcomeId = outcomeId, populationSettings = defaultSettings( endAnchor = 'cohort ent' ) ) ) @@ -495,4 +495,31 @@ }) +testthat::test_that("Providing an existing population and skipping population creation works", { + popSize <- 400 + newPopulation <- population[sample.int(nrow.default(population), popSize), ] + + tinyPlpData$population <- newPopulation + + plpResults <- runPlp( + plpData = tinyPlpData, + outcomeId = outcomeId, + analysisId = "1", + analysisName = "existing population", + populationSettings = createStudyPopulationSettings(), + splitSettings = createDefaultSplitSetting(), + modelSettings = setLassoLogisticRegression(), + executeSettings = createExecuteSettings( + runSplitData = TRUE, + runPreprocessData = FALSE, + runModelDevelopment = TRUE + ) + ) + + trainPredictions <- plpResults$prediction %>% + dplyr::filter(.data$evaluationType == "Train") %>% nrow.default() + testPredictions <- plpResults$prediction %>% + dplyr::filter(.data$evaluationType == "Test") %>% nrow.default() + expect_equal(popSize, trainPredictions + testPredictions) +}) diff --git a/tests/testthat/test-rclassifier.R b/tests/testthat/test-rclassifier.R index 54195ca08..5a0dadc31 100644 --- a/tests/testthat/test-rclassifier.R +++ b/tests/testthat/test-rclassifier.R @@ -108,7 +108,7 @@ test_that("GBM working checks", { expect_lte(nrow(fitModel$covariateImportance), trainData$covariateData$covariateRef %>% dplyr::tally() %>% dplyr::pull()) - expect_equal(fitModel$modelDesign$outcomeId, 2) + expect_equal(fitModel$modelDesign$outcomeId, outcomeId) expect_equal(fitModel$modelDesign$targetId, 1) # TODO check other model design values?
diff --git a/tests/testthat/test-recalibration.R b/tests/testthat/test-recalibration.R index 2cc4a9257..33e717197 100644 --- a/tests/testthat/test-recalibration.R +++ b/tests/testthat/test-recalibration.R @@ -32,7 +32,7 @@ prediction <- data.frame( metaData <- list( modelType = "binary", targetId = 1, - outcomeId = 2, + outcomeId = outcomeId, timepoint = 365 ) @@ -66,7 +66,7 @@ test_that("recalibratePlpRefit", { newPop <- plpResult$prediction %>% dplyr::select(-"value") %>% dplyr::filter(.data$evaluationType %in% c('Test','Train')) attr(newPop, 'metaData') <- list( targetId = 1, - outcomeId = 2, + outcomeId = outcomeId, restrictPlpDataSettings = PatientLevelPrediction::createRestrictPlpDataSettings(), populationSettings = PatientLevelPrediction::createStudyPopulationSettings() ) @@ -95,7 +95,7 @@ test_that("survival", { metaData <- list( modelType = "survival", targetId = 1, - outcomeId = 2, + outcomeId = outcomeId, timepoint = 365 ) diff --git a/tests/testthat/test-runPlpHelpers.R b/tests/testthat/test-runPlpHelpers.R index 2256a6450..4d361410d 100644 --- a/tests/testthat/test-runPlpHelpers.R +++ b/tests/testthat/test-runPlpHelpers.R @@ -22,7 +22,7 @@ test_that("check printHeader runs", { header <- printHeader( plpData = plpData, targetId = 1, - outcomeId = 2, + outcomeId = outcomeId, analysisId = 123, analysisName = 'test', ExecutionDateTime = Sys.time() @@ -36,7 +36,7 @@ test_that("checkInputs", { check <- checkInputs( list( plpData = plpData, - outcomeId = 2, + outcomeId = outcomeId, populationSettings = populationSettings ) ) @@ -49,7 +49,7 @@ test_that("checkInputs", { checkInputs( list( plpData = NULL, - outcomeId = 2, + outcomeId = outcomeId, populationSettings = populationSettings ) ) @@ -71,7 +71,7 @@ test_that("checkInputs", { checkInputs( list( plpData = plpData, - outcomeId = 2, + outcomeId = outcomeId, populationSettings = 'populationSettings' ) ) diff --git a/tests/testthat/test-validation.R b/tests/testthat/test-validation.R index db3480392..84ee78516 100644 --- a/tests/testthat/test-validation.R +++ b/tests/testthat/test-validation.R @@ -17,22 +17,6 @@ context("Validation") # Test unit for the creation of the study externalValidatePlp - -connectionDetails <- Eunomia::getEunomiaConnectionDetails() -Eunomia::createCohorts(connectionDetails) - -databaseDetails <- createDatabaseDetails( - connectionDetails = connectionDetails, - cdmDatabaseSchema = "main", - cdmDatabaseName = "main", - cohortDatabaseSchema = "main", - cohortTable = "cohort", - outcomeDatabaseSchema = "main", - outcomeTable = "cohort", - targetId = 1, - outcomeIds = 3, #make this ids - cdmVersion = 5) - modelVal <- loadPlpModel(file.path(saveLoc, 'Test', 'plpResult', 'model')) validationDatabaseDetailsVal <- databaseDetails # from run multiple tests validationRestrictPlpDataSettingsVal <- createRestrictPlpDataSettings(washoutPeriod = 0, sampleSize = NULL) diff --git a/vignettes/.gitignore b/vignettes/.gitignore new file mode 100644 index 000000000..097b24163 --- /dev/null +++ b/vignettes/.gitignore @@ -0,0 +1,2 @@ +*.html +*.R diff --git a/vignettes/BenchmarkTasks.Rmd b/vignettes/BenchmarkTasks.Rmd index 13eb511ab..4c452aa99 100644 --- a/vignettes/BenchmarkTasks.Rmd +++ b/vignettes/BenchmarkTasks.Rmd @@ -25,7 +25,7 @@ output: --- ## Benchmark Tasks For Large-Scale Empirical Analyses diff --git a/vignettes/ConstrainedPredictors.Rmd b/vignettes/ConstrainedPredictors.Rmd index 2776122cb..8c1daadec 100644 --- a/vignettes/ConstrainedPredictors.Rmd +++ b/vignettes/ConstrainedPredictors.Rmd @@ -25,7 +25,7 @@ output: 
--- ## Constrained Predictors @@ -36,20 +36,20 @@ Here we provide a set of phenotypes that can be used as predictors in prediction These phenotypes can be extracted from the PhenotypeLibrary R package. To install the R package run: -```{r echo = T} +```{r, echo = TRUE, message = FALSE, warning = FALSE, tidy = FALSE, eval=FALSE} remotes::install_github('ohdsi/PhenotypeLibrary') ``` To extract the cohort definition for Alcoholism with an id of 1165, just run: -```{r echo = T} +```{r echo = TRUE, message = FALSE, warning = FALSE, tidy = FALSE, eval = FALSE} PhenotypeLibrary::getPlCohortDefinitionSet(1165) ``` in general you can extract all the cohorts by running: -```{r echo = T} +```{r echo = TRUE, message = FALSE, warning = FALSE, tidy = FALSE, eval = FALSE} phenotypeDefinitions <- PhenotypeLibrary::getPlCohortDefinitionSet(1152:1215) ```