Skip to content

Commit

Permalink
downloadTasks and add tests
Browse files Browse the repository at this point in the history
  • Loading branch information
egillax committed Oct 17, 2024
1 parent 6137760 commit 16f8d73
Show file tree
Hide file tree
Showing 2 changed files with 161 additions and 17 deletions.
36 changes: 19 additions & 17 deletions R/ExternalValidatePlp.R
Original file line number Diff line number Diff line change
Expand Up @@ -477,7 +477,7 @@ validateExternal <- function(validationDesignList,
ParallelLogger::registerLogger(logger)
on.exit(closeLog(logger))

downloadTasks <- extractUniqueCombinations(validationDesignList)
downloadTasks <- createDownloadTasks(validationDesignList)

results <- NULL
for (design in validationDesignList) {
Expand All @@ -488,12 +488,10 @@ validateExternal <- function(validationDesignList,
"with targetId:", design$targetId, "and outcomeId:", design$outcomeId))

modelDesigns <- extractModelDesigns(design$plpModelList)
allCovSettings <- lapply(modelDesigns, function(x) x$covariateSettings)
design <- fromDesignOrModel(design, modelDesigns, "restrictPlpDataSettings")
# checkAllSameInModels(allCovSettings, "covariateSettings")

# get plpData
plpData <- getData(design, database, outputFolder, allCovSettings, downloadTasks)
plpData <- getData(design, database, outputFolder, downloadTasks)
if (is.null(plpData)) {
ParallelLogger::logInfo("Couldn't extract plpData for the given design and database, proceeding to the next one.")
next
Expand Down Expand Up @@ -608,7 +606,7 @@ checkAllSameInModels <- function(settingsList, settingName) {
identical(y, settingsList[[1]])},
settingsList[-1],
init = TRUE)) {
stop(paste0(settingName, "are not the same across models which is not supported yet"))
stop(paste0(settingName, "are not the same across models which is not supported"))
}
}

Expand Down Expand Up @@ -688,12 +686,11 @@ fromDesignOrModel <- function(validationDesign, modelDesigns, settingName) {
#' @param design The validationDesign object
#' @param database The databaseDetails object
#' @param outputFolder The directory to save the validation results to
#' @param allCovSettings A list of covariateSettings from the models
#' @param downloadTasks A list of download tasks determined by unique
#' combinations of targetId and restrictPlpDataSettings
#' @return The plpData object
#' @keywords internal
getData <- function(design, database, outputFolder, allCovSettings, downloadTasks) {
getData <- function(design, database, outputFolder, downloadTasks) {
# find task associated with design and the index of the task in downloadTasks
task <- downloadTasks %>%
dplyr::filter(.data$targetId == design$targetId) %>%
Expand Down Expand Up @@ -763,26 +760,31 @@ getPopulation <- function(validationDesign, modelDesigns, plpData) {
return(population)
}

#' extractUniqueCombinations
#' createDownloadTasks
#' create download tasks based on unique combinations of targetId and
#' restrictPlpDataSettings. This is used to avoid downloading the same data
#' multiple times.
#' restrictPlpDataSettings. It adds all covariateSettings and outcomes that
#' have that targetId and restrictPlpDataSettings. This is used to avoid
#' downloading the same data multiple times.
#' @param validationDesignList A list of validationDesign objects
#' @return A list of download tasks
#' @return A dataframe where each row is a downloadTask
#' @keywords internal
extractUniqueCombinations <- function(validationDesignList) {
# TODO currentl works for list of modelPaths, not with objects.
# TODO where restrictPlpDatasettings is empty, take from model and take that into account when creating tasks

createDownloadTasks <- function(validationDesignList) {
ParallelLogger::logInfo("Extracting unique combinations of targetId, \
restrictPlpDataSettings and covariateSettings for extracting data")

rowsList <- list()
modelCache <- list()

for (design in validationDesignList) {
targetId <- design$targetId
outcomeId <- design$outcomeId
restrictPlpDataSettings <- design$restrictPlpDataSettings
if (is.null(restrictPlpDataSettings)) {
restrictList <- lapply(design$plpModelList,
function(x) x$modelDesign$restrictPlpDataSettings)
checkAllSameInModels(restrictList, "restrictPlpDataSettings")
restrictPlpDataSettings <- restrictList[[1]]
}
plpModelList <- design$plpModelList
for (model in plpModelList) {
if (is.character(model)) {
Expand Down Expand Up @@ -823,7 +825,7 @@ extractUniqueCombinations <- function(validationDesignList) {
return(uniqueSettings)
}

uniqueCombinations <- rowsDf %>%
downloadTasks <- rowsDf %>%
dplyr::group_by(.data$targetId, .data$restrictKey) %>%
dplyr::summarise(
outcomeIds = list(unique(.data$outcomeIds)),
Expand All @@ -833,5 +835,5 @@ extractUniqueCombinations <- function(validationDesignList) {
dplyr::select(c("targetId", "outcomeIds", "restrictPlpDataSettings",
"covariateSettings"))

return(uniqueCombinations)
return(downloadTasks)
}
142 changes: 142 additions & 0 deletions tests/testthat/test-validation.R
Original file line number Diff line number Diff line change
Expand Up @@ -205,3 +205,145 @@ test_that("createValidationSettings errors with <10 outcomes", {
"skipping validation for design and database")

})

test_that("createDownloadTasks handles single design correctly", {
design <- createValidationDesign(
targetId = 1,
outcomeId = 2,
plpModelList = list(modelVal),
restrictPlpDataSettings = createRestrictPlpDataSettings()
)
result <- createDownloadTasks(list(design))
expect_s3_class(result, "data.frame")
expect_equal(nrow(result), 1)
expect_equal(ncol(result), 4)
})

test_that("createDownloadTasks handles multiple designs correctly", {
design1 <- createValidationDesign(
targetId = 1,
outcomeId = 2,
plpModelList = list(modelVal),
restrictPlpDataSettings = createRestrictPlpDataSettings()
)
design2 <- createValidationDesign(
targetId = 3,
outcomeId = 4,
plpModelList = list(modelVal),
restrictPlpDataSettings = createRestrictPlpDataSettings()
)
result <- createDownloadTasks(list(design1, design2))
expect_s3_class(result, "data.frame")
expect_equal(nrow(result), 2)
expect_equal(ncol(result), 4)
})

test_that("createDownloadTasks handles duplicated designs correctly", {
design <- createValidationDesign(
targetId = 1,
outcomeId = 2,
plpModelList = list(modelVal),
restrictPlpDataSettings = createRestrictPlpDataSettings()
)
result <- createDownloadTasks(list(design, design))
expect_s3_class(result, "data.frame")
expect_equal(nrow(result), 1)

design2 <- createValidationDesign(
targetId = 3,
outcomeId = 4,
plpModelList = list(modelVal),
restrictPlpDataSettings = createRestrictPlpDataSettings()
)

results <- createDownloadTasks(list(design, design2, design))
expect_s3_class(results, "data.frame")
expect_equal(nrow(results), 2)
})

test_that("createDownloadTasks with different restrictSettings", {
design <- createValidationDesign(
targetId = 1,
outcomeId = 2,
plpModelList = list(modelVal),
restrictPlpDataSettings = createRestrictPlpDataSettings()
)
design2 <- createValidationDesign(
targetId = 3,
outcomeId = 4,
plpModelList = list(modelVal),
restrictPlpDataSettings = createRestrictPlpDataSettings()
)
design3 <- createValidationDesign(
targetId = 1,
outcomeId = 2,
plpModelList = list(modelVal),
restrictPlpDataSettings = createRestrictPlpDataSettings(sampleSize = 100)
)

result <- createDownloadTasks(list(design, design2, design3))

expect_s3_class(result, "data.frame")
expect_equal(nrow(result), 3)
})

test_that("createDownloadTasks works with multiple outcomeIds", {
design1 <- createValidationDesign(
targetId = 1,
outcomeId = 2,
plpModelList = list(modelVal),
restrictPlpDataSettings = createRestrictPlpDataSettings()
)
design2 <- createValidationDesign(
targetId = 1,
outcomeId = 3,
plpModelList = list(modelVal),
restrictPlpDataSettings = createRestrictPlpDataSettings()
)
result <- createDownloadTasks(list(design1, design2))
expect_s3_class(result, "data.frame")
expect_equal(nrow(result), 1)
expect_equal(length(result[1, ]$outcomeIds[[1]]), 2)

design3 <- createValidationDesign(
targetId = 1,
outcomeId = 3,
plpModelList = list(modelVal),
restrictPlpDataSettings = createRestrictPlpDataSettings(sampleSize = 100)
)
result <- createDownloadTasks(list(design1, design2, design3))
expect_equal(nrow(result), 2)
})

test_that("createDownloadTasks with multiple covSettings", {
modelVal2 <- modelVal
design1 <- createValidationDesign(
targetId = 1,
outcomeId = 2,
plpModelList = list(modelVal),
restrictPlpDataSettings = createRestrictPlpDataSettings()
)
modelVal2$modelDesign$covariateSettings <-
FeatureExtraction::createCovariateSettings(useChads2 = TRUE)
design2 <- createValidationDesign(
targetId = 1,
outcomeId = 2,
plpModelList = list(modelVal2),
restrictPlpDataSettings = createRestrictPlpDataSettings()
)
result <- createDownloadTasks(list(design1, design2))
expect_equal(nrow(result), 1)
expect_equal(length(result[1, ]$covariateSettings)[[1]], 2)

})

test_that("createDownloadTasks when restrictSettings come from models", {
design1 <- createValidationDesign(
targetId = 1,
outcomeId = 2,
plpModelList = list(modelVal)
)
result <- createDownloadTasks(list(design1))
expect_s3_class(result[1, ]$restrictPlpDataSettings[[1]], "restrictPlpDataSettings")

})

0 comments on commit 16f8d73

Please sign in to comment.