diff --git a/.Rbuildignore b/.Rbuildignore index ac33fdc..307ed11 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -8,3 +8,4 @@ ^extras$ ^deploy.sh$ ^compare_versions$ +^.mypy_cache$ \ No newline at end of file diff --git a/.gitignore b/.gitignore index 94cb359..feec6ee 100644 --- a/.gitignore +++ b/.gitignore @@ -8,4 +8,6 @@ docs .idea/ renv.lock extras/ -.Renviron \ No newline at end of file +.Renviron +inst/python/__pycache__ +.mypy_cache diff --git a/DESCRIPTION b/DESCRIPTION index d4c1a53..91bb9a3 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: DeepPatientLevelPrediction Type: Package Title: Deep Learning For Patient Level Prediction Using Data In The OMOP Common Data Model -Version: 2.0.1 +Version: 2.0.2 Date: 18-04-2023 Authors@R: c( person("Egill", "Fridgeirsson", email = "e.fridgeirsson@erasmusmc.nl", role = c("aut", "cre")), @@ -35,7 +35,8 @@ Suggests: testthat, PRROC, ResultModelManager (>= 0.2.0), - DatabaseConnector (>= 6.0.0) + DatabaseConnector (>= 6.0.0), + Andromeda Remotes: ohdsi/PatientLevelPrediction, ohdsi/FeatureExtraction, @@ -51,7 +52,6 @@ Config/reticulate: list(package = "polars"), list(package = "tqdm"), list(package = "connectorx"), - list(package = "scikit-learn"), list(package = "pyarrow") ) ) diff --git a/NAMESPACE b/NAMESPACE index 4806f0d..5f8c13e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,6 +1,5 @@ # Generated by roxygen2: do not edit by hand -export(TrainingCache) export(fitEstimator) export(gridCvDeep) export(predictDeepEstimator) @@ -10,6 +9,7 @@ export(setEstimator) export(setMultiLayerPerceptron) export(setResNet) export(setTransformer) +export(trainingCache) importFrom(dplyr,"%>%") importFrom(reticulate,py_to_r) importFrom(reticulate,r_to_py) diff --git a/NEWS.md b/NEWS.md index 70ccf12..961ac75 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,13 @@ +DeepPatientLevelPrediction 2.0.2 +====================== + - Ensure output from predict_proba is numeric instead of a 1d array + - Refactoring: Move cross-validation to a separate function + - Refactoring: Move paramsToTune to a separate function + - Linting: Enforce HADES style + - Calculate AUC ourselves with torch, getting rid of the scikit-learn dependency + - Added Andromeda to dev dependencies + + DeepPatientLevelPrediction 2.0.1 ====================== - Connection parameter fixed to be in line with newest polars diff --git a/R/Dataset.R b/R/Dataset.R index f4b5170..4ba4e81 100644 --- a/R/Dataset.R +++ b/R/Dataset.R @@ -15,21 +15,22 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License.
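The 2.0.2 changelog entry above swaps sklearn.metrics.roc_auc_score for a torch-based compute_auc; that function is called in Estimator.py further down in this diff, but its definition sits outside the hunks shown here. As a rough guide to what such a function computes, a minimal rank-statistic AUC sketch in plain R (computeAucSketch is a hypothetical name, not part of the package):

```r
# AUC via the Mann-Whitney U statistic: with average ranks on ties,
# AUC = (sum of ranks of positives - nPos * (nPos + 1) / 2) / (nPos * nNeg)
computeAucSketch <- function(labels, predictions) {
  ranks <- rank(predictions) # average ranks handle tied predictions
  nPos <- sum(labels == 1)
  nNeg <- sum(labels == 0)
  (sum(ranks[labels == 1]) - nPos * (nPos + 1) / 2) / (nPos * nNeg)
}

computeAucSketch(c(0, 0, 1, 1), c(0.1, 0.4, 0.35, 0.8)) # 0.75
```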
-createDataset <- function(data, labels, plpModel=NULL) { +createDataset <- function(data, labels, plpModel = NULL) { path <- system.file("python", package = "DeepPatientLevelPrediction") - Dataset <- reticulate::import_from_path("Dataset", path = path)$Data + dataset <- reticulate::import_from_path("Dataset", path = path)$Data if (is.null(attributes(data)$path)) { # sqlite object attributes(data)$path <- attributes(data)$dbname } if (is.null(plpModel)) { - data <- Dataset(r_to_py(normalizePath(attributes(data)$path)), - r_to_py(labels$outcomeCount)) + data <- dataset(r_to_py(normalizePath(attributes(data)$path)), + r_to_py(labels$outcomeCount)) + } else { + numericalFeatures <- + r_to_py(as.array(which(plpModel$covariateImportance$isNumeric))) + data <- dataset(r_to_py(normalizePath(attributes(data)$path)), + numerical_features = numericalFeatures) } - else { - data <- Dataset(r_to_py(normalizePath(attributes(data)$path)), - numerical_features = r_to_py(as.array(which(plpModel$covariateImportance$isNumeric))) ) - } - - return(data) -} \ No newline at end of file + + return(data) +} diff --git a/R/DeepPatientLevelPrediction.R b/R/DeepPatientLevelPrediction.R index e77812e..137b53a 100644 --- a/R/DeepPatientLevelPrediction.R +++ b/R/DeepPatientLevelPrediction.R @@ -18,7 +18,8 @@ #' DeepPatientLevelPrediction #' -#' @description A package containing deep learning extensions for developing prediction models using data in the OMOP CDM +#' @description A package containing deep learning extensions for developing +#' prediction models using data in the OMOP CDM #' #' @docType package #' @name DeepPatientLevelPrediction @@ -28,9 +29,7 @@ NULL .onLoad <- function(libname, pkgname) { - # use superassignment to update global reference + # use superassignment to update global reference reticulate::configure_environment(pkgname) torch <<- reticulate::import("torch", delay_load = TRUE) } - - diff --git a/R/Estimator.R b/R/Estimator.R index 7705279..4ef2047 100644 --- a/R/Estimator.R +++ b/R/Estimator.R @@ -26,37 +26,41 @@ #' @param weightDecay what weight_decay to use #' @param batchSize batchSize to use #' @param epochs how many epochs to train for -#' @param device what device to train on, can be a string or a function to that evaluates +#' @param device what device to train on, can be a string or a function that +#' evaluates #' to the device during runtime #' @param optimizer which optimizer to use #' @param scheduler which learning rate scheduler to use #' @param criterion loss function to use -#' @param earlyStopping If earlyStopping should be used which stops the training of your metric is not improving -#' @param metric either `auc` or `loss` or a custom metric to use. This is the metric used for scheduler and earlyStopping. -#' Needs to be a list with function `fun`, mode either `min` or `max` and a `name`, -#' `fun` needs to be a function that takes in prediction and labels and outputs a score. +#' @param earlyStopping If earlyStopping should be used, which stops the +#' training if your metric is not improving +#' @param metric either `auc` or `loss` or a custom metric to use. This is the +#' metric used for scheduler and earlyStopping. +#' Needs to be a list with function `fun`, mode either `min` or `max` and a +#' `name`, +#' `fun` needs to be a function that takes in prediction and labels and +#' outputs a score.
#' @param seed seed to initialize weights of model with #' @export -setEstimator <- function(learningRate='auto', - weightDecay = 0.0, - batchSize = 512, - epochs = 30, - device='cpu', - optimizer = torch$optim$AdamW, - scheduler = list(fun=torch$optim$lr_scheduler$ReduceLROnPlateau, - params=list(patience=1)), - criterion = torch$nn$BCEWithLogitsLoss, - earlyStopping = list(useEarlyStopping=TRUE, - params = list(patience=4)), - metric = "auc", - seed = NULL +setEstimator <- function(learningRate = "auto", + weightDecay = 0.0, + batchSize = 512, + epochs = 30, + device = "cpu", + optimizer = torch$optim$AdamW, + scheduler = list(fun = torch$optim$lr_scheduler$ReduceLROnPlateau, + params = list(patience = 1)), + criterion = torch$nn$BCEWithLogitsLoss, + earlyStopping = list(useEarlyStopping = TRUE, + params = list(patience = 4)), + metric = "auc", + seed = NULL ) { - + checkIsClass(learningRate, c("numeric", "character")) - if (inherits(learningRate, "character")) { - if (learningRate != "auto"){ - stop(paste0('Learning rate should be either a numeric or "auto", you provided: ', learningRate)) - } + if (inherits(learningRate, "character") && learningRate != "auto") { + stop(paste0('Learning rate should be either a numeric or "auto", + you provided: ', learningRate)) } checkIsClass(weightDecay, "numeric") checkHigherEqual(weightDecay, 0.0) @@ -67,58 +71,50 @@ setEstimator <- function(learningRate='auto', checkIsClass(earlyStopping, c("list", "NULL")) checkIsClass(metric, c("character", "list")) checkIsClass(seed, c("numeric", "integer", "NULL")) - - - if (length(learningRate)==1 && learningRate=='auto') {findLR <- TRUE} else {findLR <- FALSE} + + + if (length(learningRate) == 1 && learningRate == "auto") { + findLR <- TRUE + } else { + findLR <- FALSE + } if (is.null(seed)) { seed <- as.integer(sample(1e5, 1)) } - estimatorSettings <- list(learningRate=learningRate, - weightDecay=weightDecay, - batchSize=batchSize, - epochs=epochs, - device=device, - earlyStopping=earlyStopping, - findLR=findLR, - metric=metric, - seed=seed[1]) - - optimizer <- rlang::enquo(optimizer) + estimatorSettings <- list(learningRate = learningRate, + weightDecay = weightDecay, + batchSize = batchSize, + epochs = epochs, + device = device, + earlyStopping = earlyStopping, + findLR = findLR, + metric = metric, + seed = seed[1]) + + optimizer <- rlang::enquo(optimizer) estimatorSettings$optimizer <- function() rlang::eval_tidy(optimizer) - class(estimatorSettings$optimizer) <- c("delayed", class(estimatorSettings$optimizer)) - + class(estimatorSettings$optimizer) <- c("delayed", + class(estimatorSettings$optimizer)) + criterion <- rlang::enquo(criterion) estimatorSettings$criterion <- function() rlang::eval_tidy(criterion) - class(estimatorSettings$criterion) <- c("delayed", class(estimatorSettings$criterion)) - + class(estimatorSettings$criterion) <- c("delayed", + class(estimatorSettings$criterion)) + scheduler <- rlang::enquo(scheduler) estimatorSettings$scheduler <- function() rlang::eval_tidy(scheduler) - class(estimatorSettings$scheduler) <-c("delayed", class(estimatorSettings$scheduler)) + class(estimatorSettings$scheduler) <- c("delayed", + class(estimatorSettings$scheduler)) if (is.function(device)) { - class(estimatorSettings$device) <- c("delayed", class(estimatorSettings$device)) + class(estimatorSettings$device) <- c("delayed", + class(estimatorSettings$device)) } - - paramsToTune <- list() - for (name in names(estimatorSettings)) { - param <- estimatorSettings[[name]] - if (length(param) > 1 && 
is.atomic(param)) { - paramsToTune[[paste0('estimator.',name)]] <- param - } - if ("params" %in% names(param)) { - for (name2 in names(param[["params"]])) { - param2 <- param[["params"]][[name2]] - if (length(param2) > 1) { - paramsToTune[[paste0('estimator.',name,'.',name2)]] <- param2 - } - } - } - } - estimatorSettings$paramsToTune <- paramsToTune + estimatorSettings$paramsToTune <- extractParamsToTune(estimatorSettings) return(estimatorSettings) } - + #' fitEstimator #' #' @description @@ -137,12 +133,12 @@ fitEstimator <- function(trainData, analysisPath, ...) { start <- Sys.time() - + # check covariate data if (!FeatureExtraction::isCovariateData(trainData$covariateData)) { stop("Needs correct covariateData") } - + if (!is.null(trainData$folds)) { trainData$labels <- merge(trainData$labels, trainData$fold, by = "rowId") } @@ -150,9 +146,9 @@ fitEstimator <- function(trainData, covariateData = trainData$covariateData, cohort = trainData$labels ) - + covariateRef <- mappedCovariateData$covariateRef - + outLoc <- PatientLevelPrediction::createTempModelLoc() cvResult <- do.call( what = gridCvDeep, @@ -164,10 +160,12 @@ fitEstimator <- function(trainData, analysisPath = analysisPath ) ) - - hyperSummary <- do.call(rbind, lapply(cvResult$paramGridSearch, function(x) x$hyperSummary)) + + hyperSummary <- do.call(rbind, lapply(cvResult$paramGridSearch, + function(x) x$hyperSummary)) prediction <- cvResult$prediction - incs <- rep(1, covariateRef %>% dplyr::tally() %>% + incs <- rep(1, covariateRef %>% + dplyr::tally() %>% dplyr::collect() %>% as.integer()) covariateRef <- covariateRef %>% @@ -178,25 +176,30 @@ fitEstimator <- function(trainData, covariateValue = 0, isNumeric = .data$columnId %in% cvResult$numericalIndex ) - + comp <- start - Sys.time() result <- list( - model = cvResult$estimator, # file.path(outLoc), - + model = cvResult$estimator, + preprocessing = list( - featureEngineering = attr(trainData$covariateData, "metaData")$featureEngineering, - tidyCovariates = attr(trainData$covariateData, "metaData")$tidyCovariateDataSettings, - requireDenseMatrix = F + featureEngineering = attr(trainData$covariateData, + "metaData")$featureEngineering, + tidyCovariates = attr(trainData$covariateData, + "metaData")$tidyCovariateDataSettings, + requireDenseMatrix = FALSE ), prediction = prediction, modelDesign = PatientLevelPrediction::createModelDesign( targetId = attr(trainData, "metaData")$targetId, outcomeId = attr(trainData, "metaData")$outcomeId, - restrictPlpDataSettings = attr(trainData, "metaData")$restrictPlpDataSettings, + restrictPlpDataSettings = attr(trainData, + "metaData")$restrictPlpDataSettings, covariateSettings = attr(trainData, "metaData")$covariateSettings, populationSettings = attr(trainData, "metaData")$populationSettings, - featureEngineeringSettings = attr(trainData$covariateData, "metaData")$featureEngineeringSettings, - preprocessSettings = attr(trainData$covariateData, "metaData")$preprocessSettings, + featureEngineeringSettings = attr(trainData$covariateData, + "metaData")$featureEngineeringSettings, + preprocessSettings = attr(trainData$covariateData, + "metaData")$preprocessSettings, modelSettings = modelSettings, splitSettings = attr(trainData, "metaData")$splitSettings, sampleSettings = attr(trainData, "metaData")$sampleSettings @@ -214,12 +217,12 @@ fitEstimator <- function(trainData, ), covariateImportance = covariateRef ) - + class(result) <- "plpModel" attr(result, "predictionFunction") <- "predictDeepEstimator" attr(result, "modelType") <- "binary" 
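The class and attributes set here are what let PatientLevelPrediction route scoring back into this package: PLP stores the prediction function's name on the fitted model and calls it by name on new data. A minimal sketch of that dispatch, where plpData and population stand in for PLP objects not defined in this diff (hypothetical names):

```r
# plpModel is the object assembled above by fitEstimator()
predFun <- attr(plpModel, "predictionFunction") # "predictDeepEstimator"
prediction <- do.call(predFun, list(plpModel = plpModel,
                                    data = plpData,
                                    cohort = population))
```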
attr(result, "saveType") <- modelSettings$saveType - + return(result) } @@ -242,32 +245,31 @@ predictDeepEstimator <- function(plpModel, } if ("plpData" %in% class(data)) { mappedData <- PatientLevelPrediction::MapIds(data$covariateData, - cohort = cohort, - mapping = plpModel$covariateImportance %>% - dplyr::select( - "columnId", - "covariateId" - ) + cohort = cohort, + mapping = plpModel$covariateImportance %>% + dplyr::select("columnId", "covariateId") ) - data <- createDataset(mappedData, plpModel=plpModel) + data <- createDataset(mappedData, plpModel = plpModel) } - + # get predictions prediction <- cohort if (is.character(plpModel$model)) { modelSettings <- plpModel$modelDesign$modelSettings - model <- torch$load(file.path(plpModel$model, "DeepEstimatorModel.pt"), map_location = "cpu") - estimator <- createEstimator(modelType=modelSettings$modelType, - modelParameters=model$model_parameters, - estimatorSettings=model$estimator_settings) + model <- torch$load(file.path(plpModel$model, + "DeepEstimatorModel.pt"), + map_location = "cpu") + estimator <- createEstimator(modelType = modelSettings$modelType, + modelParameters = model$model_parameters, + estimatorSettings = model$estimator_settings) estimator$model$load_state_dict(model$model_state_dict) prediction$value <- estimator$predict_proba(data) } else { prediction$value <- plpModel$model$predict_proba(data) } - + prediction$value <- as.numeric(prediction$value) attr(prediction, "metaData")$modelType <- attr(plpModel, "modelType") - + return(prediction) } @@ -289,15 +291,16 @@ gridCvDeep <- function(mappedData, modelSettings, modelLocation, analysisPath) { - ParallelLogger::logInfo(paste0("Running hyperparameter search for ", modelSettings$modelType, " model")) - + ParallelLogger::logInfo(paste0("Running hyperparameter search for ", + modelSettings$modelType, " model")) + ########################################################################### - + paramSearch <- modelSettings$param - + # TODO below chunk should be in a setupCache function - trainCache <- TrainingCache$new(analysisPath) - if (trainCache$isParamGridIdentical(paramSearch)) { + trainCache <- trainingCache$new(analysisPath) + if (trainCache$isParamGridIdentical(paramSearch)) { gridSearchPredictons <- trainCache$getGridSearchPredictions() } else { gridSearchPredictons <- list() @@ -305,123 +308,114 @@ gridCvDeep <- function(mappedData, trainCache$saveGridSearchPredictions(gridSearchPredictons) trainCache$saveModelParams(paramSearch) } - - dataset <- createDataset(data=mappedData, labels=labels) - - fitParams <- names(paramSearch[[1]])[grepl("^estimator", names(paramSearch[[1]]))] + + dataset <- createDataset(data = mappedData, labels = labels) + + fitParams <- names(paramSearch[[1]])[grepl("^estimator", + names(paramSearch[[1]]))] findLR <- modelSettings$estimatorSettings$findLR if (!trainCache$isFull()) { for (gridId in trainCache$getLastGridSearchIndex():length(paramSearch)) { - ParallelLogger::logInfo(paste0("Running hyperparameter combination no ", gridId)) - ParallelLogger::logInfo(paste0("HyperParameters: ")) - ParallelLogger::logInfo(paste(names(paramSearch[[gridId]]), paramSearch[[gridId]], collapse = " | ")) - currentModelParams <- paramSearch[[gridId]][modelSettings$modelParamNames] - - currentEstimatorSettings <- fillEstimatorSettings(modelSettings$estimatorSettings, fitParams, - paramSearch[[gridId]]) - - # initiate prediction - prediction <- NULL - - fold <- labels$index - ParallelLogger::logInfo(paste0("Max fold: ", max(fold))) - 
currentModelParams$catFeatures <- dataset$get_cat_features()$shape[[1]] - currentModelParams$numFeatures <- dataset$get_numerical_features()$shape[[1]] - if (findLR) { - LrFinder <- createLRFinder(modelType = modelSettings$modelType, - modelParameters = currentModelParams, - estimatorSettings = currentEstimatorSettings - ) - lr <- LrFinder$get_lr(dataset) - ParallelLogger::logInfo(paste0("Auto learning rate selected as: ", lr)) - currentEstimatorSettings$learningRate <- lr - } - - learnRates <- list() - for (i in 1:max(fold)) { - ParallelLogger::logInfo(paste0("Fold ", i)) - trainDataset <- torch$utils$data$Subset(dataset, indices = as.integer(which(fold != i) - 1)) # -1 for python 0-based indexing - testDataset <- torch$utils$data$Subset(dataset, indices = as.integer(which(fold == i) -1)) # -1 for python 0-based indexing - - estimator <- createEstimator(modelType=modelSettings$modelType, - modelParameters=currentModelParams, - estimatorSettings=currentEstimatorSettings) - estimator$fit(trainDataset, testDataset) - - ParallelLogger::logInfo("Calculating predictions on left out fold set...") - - prediction <- rbind( - prediction, - predictDeepEstimator( - plpModel = estimator, - data = testDataset, - cohort = labels[fold == i, ] + ParallelLogger::logInfo(paste0("Running hyperparameter combination no ", + gridId)) + ParallelLogger::logInfo(paste0("HyperParameters: ")) + ParallelLogger::logInfo(paste(names(paramSearch[[gridId]]), + paramSearch[[gridId]], collapse = " | ")) + currentModelParams <- paramSearch[[gridId]][modelSettings$modelParamNames] + + currentEstimatorSettings <- + fillEstimatorSettings(modelSettings$estimatorSettings, + fitParams, + paramSearch[[gridId]]) + currentEstimatorSettings$modelType <- modelSettings$modelType + currentModelParams$catFeatures <- dataset$get_cat_features()$max() + currentModelParams$numFeatures <- + dataset$get_numerical_features()$max() + if (findLR) { + lrFinder <- createLRFinder(modelType = modelSettings$modelType, + modelParameters = currentModelParams, + estimatorSettings = currentEstimatorSettings ) + lr <- lrFinder$get_lr(dataset) + ParallelLogger::logInfo(paste0("Auto learning rate selected as: ", lr)) + currentEstimatorSettings$learningRate <- lr + } + + crossValidationResults <- + doCrossvalidation(dataset, + labels = labels, + modelSettings = currentModelParams, + estimatorSettings = currentEstimatorSettings) + learnRates <- crossValidationResults$learnRates + prediction <- crossValidationResults$prediction + + gridPerformance <- + PatientLevelPrediction::computeGridPerformance(prediction, + paramSearch[[gridId]]) + maxIndex <- which.max(unlist(sapply(learnRates, `[`, 2))) + gridSearchPredictons[[gridId]] <- list( + prediction = prediction, + param = paramSearch[[gridId]], + gridPerformance = gridPerformance ) - learnRates[[i]] <- list( - LRs = estimator$learn_rate_schedule, - bestEpoch = estimator$best_epoch - ) - } - maxIndex <- which.max(unlist(sapply(learnRates, `[`, 2))) - gridSearchPredictons[[gridId]] <- list( - prediction = prediction, - param = paramSearch[[gridId]], - gridPerformance = PatientLevelPrediction::computeGridPerformance(prediction, paramSearch[[gridId]]) - ) - gridSearchPredictons[[gridId]]$gridPerformance$hyperSummary$learnRates <- rep(list(unlist(learnRates[[maxIndex]]$LRs)), - nrow(gridSearchPredictons[[gridId]]$gridPerformance$hyperSummary)) - gridSearchPredictons[[gridId]]$param$learnSchedule <- learnRates[[maxIndex]] - - - # remove all predictions that are not the max performance - indexOfMax <- 
which.max(unlist(lapply(gridSearchPredictons, function(x) x$gridPerformance$cvPerformance))) - for (i in seq_along(gridSearchPredictons)) { - if (!is.null(gridSearchPredictons[[i]])) { - if (i != indexOfMax) { + gridSearchPredictons[[gridId]]$gridPerformance$hyperSummary$learnRates <- + rep(list(unlist(learnRates[[maxIndex]]$LRs)), + nrow(gridSearchPredictons[[gridId]]$gridPerformance$hyperSummary)) + gridSearchPredictons[[gridId]]$param$learnSchedule <- + learnRates[[maxIndex]] + # remove all predictions that are not the max performance + indexOfMax <- + which.max(unlist(lapply(gridSearchPredictons, + function(x) x$gridPerformance$cvPerformance))) + for (i in seq_along(gridSearchPredictons)) { + if (!is.null(gridSearchPredictons[[i]]) && i != indexOfMax) { gridSearchPredictons[[i]]$prediction <- list(NULL) } } + ParallelLogger::logInfo(paste0("Caching all grid search results and + prediction for best combination ", + indexOfMax)) + trainCache$saveGridSearchPredictions(gridSearchPredictons) } - ParallelLogger::logInfo(paste0("Caching all grid search results and prediction for best combination ", indexOfMax)) - trainCache$saveGridSearchPredictions(gridSearchPredictons) - } } paramGridSearch <- lapply(gridSearchPredictons, function(x) x$gridPerformance) # get best params - indexOfMax <- which.max(unlist(lapply(gridSearchPredictons, function(x) x$gridPerformance$cvPerformance))) + indexOfMax <- + which.max(unlist(lapply(gridSearchPredictons, + function(x) x$gridPerformance$cvPerformance))) finalParam <- gridSearchPredictons[[indexOfMax]]$param paramGridSearch <- lapply(gridSearchPredictons, function(x) x$gridPerformance) - + # get best CV prediction cvPrediction <- gridSearchPredictons[[indexOfMax]]$prediction cvPrediction$evaluationType <- "CV" - + ParallelLogger::logInfo("Training final model using optimal parameters") # get the params modelParams <- finalParam[modelSettings$modelParamNames] - - + + # create the dir if (!dir.exists(file.path(modelLocation))) { - dir.create(file.path(modelLocation), recursive = T) + dir.create(file.path(modelLocation), recursive = TRUE) } - - modelParams$catFeatures <- dataset$get_cat_features()$shape[[1]] - modelParams$numFeatures <- dataset$get_numerical_features()$shape[[1]] - - - estimatorSettings <- fillEstimatorSettings(modelSettings$estimatorSettings, fitParams, + + modelParams$catFeatures <- dataset$get_cat_features()$max() + modelParams$numFeatures <- dataset$get_numerical_features()$max() + + + estimatorSettings <- fillEstimatorSettings(modelSettings$estimatorSettings, + fitParams, finalParam) estimatorSettings$learningRate <- finalParam$learnSchedule$LRs[[1]] estimator <- createEstimator(modelType = modelSettings$modelType, modelParameters = modelParams, estimatorSettings = estimatorSettings) - + numericalIndex <- dataset$get_numerical_features() estimator$fit_whole_training_set(dataset, finalParam$learnSchedule$LRs) - + ParallelLogger::logInfo("Calculating predictions on all train data...") prediction <- predictDeepEstimator( plpModel = estimator, @@ -429,7 +423,7 @@ gridCvDeep <- function(mappedData, cohort = labels ) prediction$evaluationType <- "Train" - + prediction <- rbind( prediction, cvPrediction @@ -438,9 +432,10 @@ gridCvDeep <- function(mappedData, prediction <- prediction %>% dplyr::select(-"index") - prediction$cohortStartDate <- as.Date(prediction$cohortStartDate, origin = "1970-01-01") - - + prediction$cohortStartDate <- as.Date(prediction$cohortStartDate, + origin = "1970-01-01") + + # save torch code here 
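For readers following the save/load round trip: estimator$save() below writes a single artifact whose contents are defined in Estimator.py later in this diff (a dict holding model_state_dict, model_parameters, estimator_settings and epoch), and predictDeepEstimator() above reloads that same file with torch$load(..., map_location = "cpu"). A small sketch of inspecting such an artifact from R, assuming the reticulate torch handle imported in .onLoad (modelLocation is whatever directory was passed to save):

```r
# map_location = "cpu" mirrors predictDeepEstimator(), so a GPU-trained
# model can be opened on a CPU-only machine.
saved <- torch$load(file.path(modelLocation, "DeepEstimatorModel.pt"),
                    map_location = "cpu")
names(saved) # expected keys: model_state_dict, model_parameters,
             # estimator_settings, epoch
```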
estimator$save(modelLocation, "DeepEstimatorModel.pt") return( @@ -454,23 +449,25 @@ gridCvDeep <- function(mappedData, ) } -# utility function to add instances of parameters to estimatorSettings during grid search +# utility function to add instances of parameters to estimatorSettings +# during grid search fillEstimatorSettings <- function(estimatorSettings, fitParams, paramSearch) { for (fp in fitParams) { components <- strsplit(fp, "[.]")[[1]] - - if (length(components)==2) { + + if (length(components) == 2) { estimatorSettings[[components[[2]]]] <- paramSearch[[fp]] } else { - estimatorSettings[[components[[2]]]]$params[[components[[3]]]] <- paramSearch[[fp]] + estimatorSettings[[components[[2]]]]$params[[components[[3]]]] <- + paramSearch[[fp]] } } return(estimatorSettings) } -# utility function to evaluate any expressions or call functions passed as settings +# utility function to evaluate any expressions or call functions passed as +# settings evalEstimatorSettings <- function(estimatorSettings) { - for (set in names(estimatorSettings)) { if (inherits(estimatorSettings[[set]], "delayed")) { estimatorSettings[[set]] <- estimatorSettings[[set]]() @@ -484,16 +481,79 @@ createEstimator <- function(modelType, estimatorSettings) { path <- system.file("python", package = "DeepPatientLevelPrediction") - Model <- reticulate::import_from_path(modelType, path=path)[[modelType]] - - Estimator <- reticulate::import_from_path("Estimator", path=path)$Estimator - + model <- reticulate::import_from_path(modelType, path = path)[[modelType]] + estimator <- reticulate::import_from_path("Estimator", path = path)$Estimator + modelParameters <- camelCaseToSnakeCaseNames(modelParameters) estimatorSettings <- camelCaseToSnakeCaseNames(estimatorSettings) estimatorSettings <- evalEstimatorSettings(estimatorSettings) - - estimator <- Estimator(model = Model, + + estimator <- estimator(model = model, model_parameters = modelParameters, estimator_settings = estimatorSettings) return(estimator) -} \ No newline at end of file +} + +doCrossvalidation <- function(dataset, + labels, + modelSettings, + estimatorSettings) { + fold <- labels$index + ParallelLogger::logInfo(paste0("Max fold: ", max(fold))) + learnRates <- list() + prediction <- NULL + for (i in 1:max(fold)) { + ParallelLogger::logInfo(paste0("Fold ", i)) + + # -1 for python 0-based indexing + trainDataset <- torch$utils$data$Subset(dataset, + indices = + as.integer(which(fold != i) - 1)) + + # -1 for python 0-based indexing + testDataset <- torch$utils$data$Subset(dataset, + indices = + as.integer(which(fold == i) - 1)) + estimator <- createEstimator(modelType = estimatorSettings$modelType, + modelParameters = modelSettings, + estimatorSettings = estimatorSettings) + estimator$fit(trainDataset, testDataset) + + ParallelLogger::logInfo("Calculating predictions on left out fold set...") + + prediction <- rbind( + prediction, + predictDeepEstimator( + plpModel = estimator, + data = testDataset, + cohort = labels[fold == i, ] + ) + ) + learnRates[[i]] <- list( + LRs = estimator$learn_rate_schedule, + bestEpoch = estimator$best_epoch + ) + } + return(results = list(prediction = prediction, + learnRates = learnRates)) + +} + +extractParamsToTune <- function(estimatorSettings) { + paramsToTune <- list() + for (name in names(estimatorSettings)) { + param <- estimatorSettings[[name]] + if (length(param) > 1 && is.atomic(param)) { + paramsToTune[[paste0("estimator.", name)]] <- param + } + if ("params" %in% names(param)) { + for (name2 in names(param[["params"]])) 
{ + param2 <- param[["params"]][[name2]] + if (length(param2) > 1) { + paramsToTune[[paste0("estimator.", name, ".", name2)]] <- param2 + } + } + } + } + return(paramsToTune) +} diff --git a/R/HelperFunctions.R b/R/HelperFunctions.R index 350baf9..a1aa43c 100644 --- a/R/HelperFunctions.R +++ b/R/HelperFunctions.R @@ -41,40 +41,40 @@ camelCaseToSnakeCaseNames <- function(object) { } #' helper function to check class of input -#' +#' @param parameter the input parameter to check #' @param classes which classes it should belong to (one or more) -checkIsClass<- function(parameter,classes) { - name = deparse(substitute(parameter)) +checkIsClass <- function(parameter, classes) { + name <- deparse(substitute(parameter)) if (!inherits(x = parameter, what = classes)) { - ParallelLogger::logError(paste0(name, ' should be of class:', classes, ' ')) - stop(paste0(name, ' is wrong class')) + ParallelLogger::logError(paste0(name, " should be of class:", classes, " ")) + stop(paste0(name, " is wrong class")) } return(TRUE) } #' helper function to check that input is higher than a certain value -#' +#' @param parameter the input parameter to check, can be a vector #' @param value which value it should be higher than -checkHigher <- function(parameter,value) { - name = deparse(substitute(parameter)) - if (!is.numeric(parameter) | all(parameter<=value)) { - ParallelLogger::logError(paste0(name, ' needs to be > ',value)) - stop(paste0(name, ' needs to be > ', value)) +checkHigher <- function(parameter, value) { + name <- deparse(substitute(parameter)) + if (!is.numeric(parameter) || all(parameter <= value)) { + ParallelLogger::logError(paste0(name, " needs to be > ", value)) + stop(paste0(name, " needs to be > ", value)) } return(TRUE) } #' helper function to check that input is higher or equal than a certain value -#' +#' @param parameter the input parameter to check, can be a vector #' @param value which value it should be higher or equal than -checkHigherEqual <- function(parameter,value) { - name = deparse(substitute(parameter)) - if (!is.numeric(parameter) | all(parameter<value)) { - ParallelLogger::logError(paste0(name, ' needs to be >= ',value)) - stop(paste0(name, ' needs to be >= ', value)) +checkHigherEqual <- function(parameter, value) { + name <- deparse(substitute(parameter)) + if (!is.numeric(parameter) || all(parameter < value)) { + ParallelLogger::logError(paste0(name, " needs to be >= ", value)) + stop(paste0(name, " needs to be >= ", value)) } return(TRUE) -} \ No newline at end of file +} diff --git a/R/LRFinder.R b/R/LRFinder.R index dfe8ee3..4ab006c 100644 --- a/R/LRFinder.R +++ b/R/LRFinder.R @@ -18,22 +18,26 @@ createLRFinder <- function(modelType, modelParameters, estimatorSettings, - lrSettings=NULL) { + lrSettings = NULL) { path <- system.file("python", package = "DeepPatientLevelPrediction") - LRFinderClass <- reticulate::import_from_path("LrFinder", path = path)$LrFinder + lrFinderClass <- + reticulate::import_from_path("LrFinder", path = path)$LrFinder - model <- reticulate::import_from_path(modelType, path=path)[[modelType]] + + model <- reticulate::import_from_path(modelType, path = path)[[modelType]] modelParameters <- camelCaseToSnakeCaseNames(modelParameters) estimatorSettings <- camelCaseToSnakeCaseNames(estimatorSettings) + estimatorSettings <- evalEstimatorSettings(estimatorSettings) + + # estimator <- createEstimator(modelType = estimatorSettings$modelType, + # modelParameters = modelParameters, + # estimatorSettings = estimatorSettings) if (!is.null(lrSettings)) { lrSettings <- camelCaseToSnakeCaseNames(lrSettings) } -
estimatorSettings <- evalEstimatorSettings(estimatorSettings) - - - lrFinder <- LRFinderClass(model=model, + lrFinder <- lrFinderClass(model = model, model_parameters = modelParameters, estimator_settings = estimatorSettings, lr_settings = lrSettings) diff --git a/R/MLP.R b/R/MLP.R index 6ad35e6..771e244 100644 --- a/R/MLP.R +++ b/R/MLP.R @@ -26,12 +26,15 @@ #' #' #' @param numLayers Number of layers in network, default: 1:8 -#' @param sizeHidden Amount of neurons in each default layer, default: 2^(6:9) (64 to 512) -#' @param dropout How much dropout to apply after first linear, default: seq(0, 0.3, 0.05) -#' @param sizeEmbedding Size of embedding layer, default: 2^(6:9) (64 to 512) +#' @param sizeHidden Amount of neurons in each default layer, +#' default: 2^(6:9) (64 to 512) +#' @param dropout How much dropout to apply after first linear, +#' default: seq(0, 0.3, 0.05) +#' @param sizeEmbedding Size of embedding layer, default: 2^(6:9) (64 to 512) #' @param estimatorSettings settings of Estimator created with `setEstimator` -#' @param hyperParamSearch Which kind of hyperparameter search to use random sampling or exhaustive grid search. default: 'random' -#' @param randomSample How many random samples from hyperparameter space to use +#' @param hyperParamSearch Which kind of hyperparameter search to use, random +#' sampling or exhaustive grid search. default: 'random' +#' @param randomSample How many random samples from hyperparameter space to use #' @param randomSampleSeed Random seed to sample hyperparameter combinations #' #' @export @@ -39,35 +42,34 @@ setMultiLayerPerceptron <- function(numLayers = c(1:8), sizeHidden = c(2^(6:9)), dropout = c(seq(0, 0.3, 0.05)), sizeEmbedding = c(2^(6:9)), - estimatorSettings = setEstimator( - learningRate = 'auto', - weightDecay = c(1e-6, 1e-3), - batchSize = 1024, - epochs = 30, - device="cpu"), + estimatorSettings = + setEstimator(learningRate = "auto", + weightDecay = c(1e-6, 1e-3), + batchSize = 1024, + epochs = 30, + device = "cpu"), hyperParamSearch = "random", randomSample = 100, randomSampleSeed = NULL) { - checkIsClass(numLayers, c("integer", "numeric")) checkHigherEqual(numLayers, 1) checkIsClass(sizeHidden, c("integer", "numeric")) checkHigherEqual(sizeHidden, 1) - checkIsClass(dropout, c("numeric")) checkHigherEqual(dropout, 0) - checkIsClass(sizeEmbedding, c("numeric", "integer")) checkHigherEqual(sizeEmbedding, 1) - checkIsClass(hyperParamSearch, "character") - checkIsClass(randomSample, c("numeric", "integer")) checkHigherEqual(randomSample, 1) - checkIsClass(randomSampleSeed, c("numeric", "integer", "NULL")) - paramGrid <- list( numLayers = numLayers, sizeHidden = sizeHidden, @@ -78,16 +80,20 @@ setMultiLayerPerceptron <- function(numLayers = c(1:8), paramGrid <- c(paramGrid, estimatorSettings$paramsToTune) param <- PatientLevelPrediction::listCartesian(paramGrid) - if (hyperParamSearch == "random" && randomSample>length(param)) { - stop(paste("\n Chosen amount of randomSamples is higher than the amount of possible hyperparameter combinations.", - "\n randomSample:", randomSample,"\n Possible hyperparameter combinations:", length(param), + if (hyperParamSearch == "random" && randomSample > length(param)) { + stop(paste("\n Chosen amount of randomSamples is higher than the + amount of possible hyperparameter combinations.", + "\n randomSample:", randomSample, "\n Possible hyperparameter + combinations:", length(param), "\n Please lower the amount of randomSamples")) } if (hyperParamSearch == "random") { -
suppressWarnings(withr::with_seed(randomSampleSeed, {param <- param[sample(length(param), randomSample)]})) + suppressWarnings(withr::with_seed(randomSampleSeed, + {param <- param[sample(length(param), + randomSample)]})) } - attr(param, 'settings')$modelType <- "MLP" + attr(param, "settings")$modelType <- "MLP" results <- list( fitFunction = "fitEstimator", diff --git a/R/ResNet.R b/R/ResNet.R index d20aa1d..32e7f3a 100644 --- a/R/ResNet.R +++ b/R/ResNet.R @@ -22,18 +22,19 @@ #' Creates settings for a default ResNet model #' #' @details -#' Model architecture from by https://arxiv.org/abs/2106.11959 . +#' Model architecture from https://arxiv.org/abs/2106.11959 . #' Hyperparameters chosen by experience on a few prediction problems. #' #' @param estimatorSettings created with ```setEstimator``` #' @export -setDefaultResNet <- function(estimatorSettings=setEstimator(learningRate='auto', - weightDecay=1e-6, - device='cpu', - batchSize=1024, - epochs=50, - seed=NULL)) { +setDefaultResNet <- function(estimatorSettings = + setEstimator(learningRate = "auto", + weightDecay = 1e-6, + device = "cpu", + batchSize = 1024, + epochs = 50, + seed = NULL)) { resnetSettings <- setResNet(numLayers = 6, sizeHidden = 512, hiddenFactor = 2, @@ -41,9 +42,9 @@ setDefaultResNet <- function(estimatorSettings=setEstimator(learningRate='auto', hiddenDropout = 0.4, sizeEmbedding = 256, estimatorSettings = estimatorSettings, - hyperParamSearch = 'random', + hyperParamSearch = "random", randomSample = 1) - attr(resnetSettings, 'settings')$name <- 'defaultResnet' + attr(resnetSettings, "settings")$name <- "defaultResnet" return(resnetSettings) } @@ -58,14 +59,21 @@ setDefaultResNet <- function(estimatorSettings=setEstimator(learningRate='auto', #' #' @param numLayers Number of layers in network, default: 1:16 -#' @param sizeHidden Amount of neurons in each default layer, default: 2^(6:10) (64 to 1024) -#' @param hiddenFactor How much to grow the amount of neurons in each ResLayer, default: 1:4 -#' @param residualDropout How much dropout to apply after last linear layer in ResLayer, default: seq(0, 0.3, 0.05) -#' @param hiddenDropout How much dropout to apply after first linear layer in ResLayer, default: seq(0, 0.3, 0.05) -#' @param sizeEmbedding Size of embedding layer, default: 2^(6:9) (64 to 512) +#' @param sizeHidden Amount of neurons in each default layer, default: +#' 2^(6:10) (64 to 1024) +#' @param hiddenFactor How much to grow the amount of neurons in each +#' ResLayer, default: 1:4 +#' @param residualDropout How much dropout to apply after last linear layer +#' in ResLayer, default: seq(0, 0.3, 0.05) +#' @param hiddenDropout How much dropout to apply after first linear layer +#' in ResLayer, default: seq(0, 0.3, 0.05) +#' @param sizeEmbedding Size of embedding layer, default: 2^(6:9) +#' (64 to 512) #' @param estimatorSettings created with ```setEstimator``` -#' @param hyperParamSearch Which kind of hyperparameter search to use random sampling or exhaustive grid search. default: 'random' -#' @param randomSample How many random samples from hyperparameter space to use +#' @param hyperParamSearch Which kind of hyperparameter search to use, random +#' sampling or exhaustive grid search.
default: 'random' +#' @param randomSample How many random samples from hyperparameter space +#' to use #' @param randomSampleSeed Random seed to sample hyperparameter combinations #' @export setResNet <- function(numLayers = c(1:8), @@ -74,61 +82,62 @@ setResNet <- function(numLayers = c(1:8), residualDropout = c(seq(0, 0.5, 0.05)), hiddenDropout = c(seq(0, 0.5, 0.05)), sizeEmbedding = c(2^(6:9)), - estimatorSettings = setEstimator(learningRate='auto', - weightDecay=c(1e-6, 1e-3), - device='cpu', - batchSize=1024, - epochs=30, - seed=NULL), + estimatorSettings = + setEstimator(learningRate = "auto", + weightDecay = c(1e-6, 1e-3), + device = "cpu", + batchSize = 1024, + epochs = 30, + seed = NULL), hyperParamSearch = "random", randomSample = 100, - randomSampleSeed = NULL) -{ + randomSampleSeed = NULL) { checkIsClass(numLayers, c("integer", "numeric")) checkHigherEqual(numLayers, 1) - + checkIsClass(sizeHidden, c("integer", "numeric")) checkHigherEqual(sizeHidden, 1) - + checkIsClass(residualDropout, "numeric") checkHigherEqual(residualDropout, 0) - + checkIsClass(hiddenDropout, "numeric") checkHigherEqual(hiddenDropout, 0) - + checkIsClass(sizeEmbedding, c("integer", "numeric")) checkHigherEqual(sizeEmbedding, 1) - + checkIsClass(hyperParamSearch, "character") - + checkIsClass(randomSample, c("numeric", "integer")) checkHigherEqual(randomSample, 1) - + checkIsClass(randomSampleSeed, c("numeric", "integer", "NULL")) - - paramGrid <- list( - numLayers = numLayers, - sizeHidden = sizeHidden, - hiddenFactor = hiddenFactor, - residualDropout = residualDropout, - hiddenDropout = hiddenDropout, - sizeEmbedding = sizeEmbedding) - + + paramGrid <- list(numLayers = numLayers, + sizeHidden = sizeHidden, + hiddenFactor = hiddenFactor, + residualDropout = residualDropout, + hiddenDropout = hiddenDropout, + sizeEmbedding = sizeEmbedding) paramGrid <- c(paramGrid, estimatorSettings$paramsToTune) - + param <- PatientLevelPrediction::listCartesian(paramGrid) - if (hyperParamSearch == "random" && randomSample>length(param)) { - stop(paste("\n Chosen amount of randomSamples is higher than the amount of possible hyperparameter combinations.", - "\n randomSample:", randomSample,"\n Possible hyperparameter combinations:", length(param), - "\n Please lower the amount of randomSamples")) + if (hyperParamSearch == "random" && randomSample > length(param)) { + stop(paste("\n Chosen amount of randomSamples is higher than the amount of + possible hyperparameter combinations.", "\n randomSample:", + randomSample, "\n Possible hyperparameter combinations:", + length(param), "\n Please lower the amount of randomSamples")) } - + if (hyperParamSearch == "random") { - suppressWarnings(withr::with_seed(randomSampleSeed, {param <- param[sample(length(param), randomSample)]})) + suppressWarnings(withr::with_seed(randomSampleSeed, + {param <- param[sample(length(param), + randomSample)]})) } - attr(param, 'settings')$modelType <- "ResNet" + attr(param, "settings")$modelType <- "ResNet" results <- list( fitFunction = "fitEstimator", param = param, diff --git a/R/TrainingCache-class.R b/R/TrainingCache-class.R index be626d5..fe7dfc7 100644 --- a/R/TrainingCache-class.R +++ b/R/TrainingCache-class.R @@ -2,9 +2,8 @@ #' @description #' Parameter caching for training persistence and continuity #' @export -TrainingCache <- R6::R6Class( +trainingCache <- R6::R6Class( "TrainingCache", - private = list( .paramPersistence = list( gridSearchPredictions = NULL, @@ -12,30 +11,27 @@ TrainingCache <- R6::R6Class( ), .paramContinuity = list(), 
.saveDir = NULL, - writeToFile = function() { saveRDS(private$.paramPersistence, file.path(private$.saveDir)) }, - readFromFile = function() { - private$.paramPersistence <- readRDS(file.path(private$.saveDir)) + private$.paramPersistence <- readRDS(file.path(private$.saveDir)) } ), - public = list( #' @description #' Creates a new training cache #' @param inDir Path to the analysis directory initialize = function(inDir) { private$.saveDir <- file.path(inDir, "paramPersistence.rds") - + if (file.exists(private$.saveDir)) { private$readFromFile() } else { private$writeToFile() } }, - + #' @description #' Checks whether the parameter grid in the model settings is identical to #' the cached parameters. @@ -44,7 +40,7 @@ TrainingCache <- R6::R6Class( isParamGridIdentical = function(inModelParams) { return(identical(inModelParams, private$.paramPersistence$modelParams)) }, - + #' @description #' Saves the grid search results to the training cache #' @param inGridSearchPredictions Grid search predictions @@ -53,7 +49,7 @@ TrainingCache <- R6::R6Class( inGridSearchPredictions private$writeToFile() }, - + #' @description #' Saves the parameter grid to the training cache #' @param inModelParams Parameter grid from the model settings @@ -61,20 +57,21 @@ TrainingCache <- R6::R6Class( private$.paramPersistence$modelParams <- inModelParams private$writeToFile() }, - + #' @description #' Gets the grid search results from the training cache #' @returns Grid search results from the training cache getGridSearchPredictions = function() { return(private$.paramPersistence$gridSearchPredictions) }, - + #' @description #' Check if cache is full #' @returns Boolean isFull = function() { - return(all(unlist(lapply(private$.paramPersistence$gridSearchPredictions, function(x) !is.null(x$gridPerformance))))) - }, + return(all(unlist(lapply(private$.paramPersistence$gridSearchPredictions, + function(x) !is.null(x$gridPerformance))))) + }, #' @description #' Gets the last index from the cached grid search @@ -88,11 +85,11 @@ TrainingCache <- R6::R6Class( return(1) } else { return(which(sapply(private$.paramPersistence$gridSearchPredictions, - is.null))[1]) + is.null))[1]) } } }, - + #' @description #' Remove the training cache from the analysis path dropCache = function() { diff --git a/R/Transformer.R b/R/Transformer.R index 8b0d0c4..cd4a968 100644 --- a/R/Transformer.R +++ b/R/Transformer.R @@ -24,13 +24,13 @@ #' @param estimatorSettings created with `setEstimator` #' #' @export -setDefaultTransformer <- function(estimatorSettings=setEstimator( - learningRate = 'auto', - weightDecay = 1e-4, - batchSize=512, - epochs=10, - seed=NULL, - device='cpu') +setDefaultTransformer <- function(estimatorSettings = + setEstimator(learningRate = "auto", + weightDecay = 1e-4, + batchSize = 512, + epochs = 10, + seed = NULL, + device = "cpu") ) { transformerSettings <- setTransformer(numBlocks = 3, dimToken = 192, @@ -40,10 +40,10 @@ setDefaultTransformer <- function(estimatorSettings=setEstimator( ffnDropout = 0.1, resDropout = 0.0, dimHidden = 256, - estimatorSettings=estimatorSettings, - hyperParamSearch = 'random', + estimatorSettings = estimatorSettings, + hyperParamSearch = "random", randomSample = 1) - attr(transformerSettings, 'settings')$name <- 'defaultTransformer' + attr(transformerSettings, "settings")$name <- "defaultTransformer" return(transformerSettings) } @@ -54,75 +54,81 @@ setDefaultTransformer <- function(estimatorSettings=setEstimator( #' #' @param numBlocks number of transformer blocks #' @param dimToken dimension of each token (embedding size) -#' @param dimOut dimension of output, usually 1 for binary problems +#' @param dimOut dimension of output, usually 1 for binary +#' problems #' @param numHeads number of attention heads #' @param attDropout dropout to use on attentions #' @param ffnDropout dropout to use in feedforward block #' @param resDropout dropout to use in residual connections #' @param dimHidden dimension of the feedforward block -#' @param dimHiddenRatio dimension of the feedforward block as a ratio of dimToken (embedding size) +#' @param dimHiddenRatio dimension of the feedforward block as a ratio +#' of dimToken (embedding size) #' @param estimatorSettings created with `setEstimator` -#' @param hyperParamSearch what kind of hyperparameter search to do, default 'random' -#' @param randomSample How many samples to use in hyperparameter search if random -#' @param randomSampleSeed Random seed to sample hyperparameter combinations +#' @param hyperParamSearch what kind of hyperparameter search to do, +#' default 'random' +#' @param randomSample How many samples to use in hyperparameter +#' search if random +#' @param randomSampleSeed Random seed to sample hyperparameter +#' combinations #' #' @export -setTransformer <- function(numBlocks = 3, - dimToken = 96, +setTransformer <- function(numBlocks = 3, + dimToken = 96, dimOut = 1, - numHeads = 8, - attDropout = 0.25, + numHeads = 8, + attDropout = 0.25, ffnDropout = 0.25, - resDropout = 0, - dimHidden = 512, + resDropout = 0, + dimHidden = 512, dimHiddenRatio = NULL, - estimatorSettings=setEstimator(weightDecay = 1e-6, - batchSize=1024, - epochs=10, - seed=NULL), + estimatorSettings = setEstimator(weightDecay = 1e-6, + batchSize = 1024, + epochs = 10, + seed = NULL), hyperParamSearch = "random", - randomSample = 1, + randomSample = 1, randomSampleSeed = NULL) { - + checkIsClass(numBlocks, c("integer", "numeric")) checkHigherEqual(numBlocks, 1) - + checkIsClass(dimToken, c("integer", "numeric")) checkHigherEqual(dimToken, 1) - + checkIsClass(dimOut, c("integer", "numeric")) checkHigherEqual(dimOut, 1) - + checkIsClass(numHeads, c("integer", "numeric")) checkHigherEqual(numHeads, 1) checkIsClass(attDropout, c("numeric")) checkHigherEqual(attDropout, 0) - + checkIsClass(ffnDropout, c("numeric")) checkHigherEqual(ffnDropout, 0) - + checkIsClass(resDropout, c("numeric")) checkHigherEqual(resDropout, 0) - + checkIsClass(dimHidden, c("integer", "numeric", "NULL")) if (!is.null(dimHidden)) { checkHigherEqual(dimHidden, 1) } - + checkIsClass(dimHiddenRatio, c("numeric", "NULL")) if (!is.null(dimHiddenRatio)) { checkHigher(dimHiddenRatio, 0) } - + checkIsClass(hyperParamSearch, "character") - + checkIsClass(randomSample, c("numeric", "integer")) checkHigherEqual(randomSample, 1) - + checkIsClass(randomSampleSeed, c("numeric", "integer", "NULL")) - - if (any(with(expand.grid(dimToken = dimToken, numHeads = numHeads), dimToken %% numHeads != 0))) { + + if (any(with(expand.grid(dimToken = dimToken, numHeads = numHeads), + dimToken %% numHeads != 0))) { stop(paste( "dimToken needs to be divisible by numHeads.
dimToken =", dimToken, "is not divisible by numHeads =", numHeads @@ -157,21 +163,24 @@ setTransformer <- function(numBlocks = 3, if (!is.null(dimHiddenRatio)) { param <- lapply(param, function(x) { - x$dimHidden <- round(x$dimToken*x$dimHidden, digits = 0) + x$dimHidden <- round(x$dimToken * x$dimHidden, digits = 0) return(x) }) } - - if (hyperParamSearch == "random" && randomSample>length(param)) { - stop(paste("\n Chosen amount of randomSamples is higher than the amount of possible hyperparameter combinations.", - "\n randomSample:", randomSample,"\n Possible hyperparameter combinations:", length(param), - "\n Please lower the amount of randomSample")) + + if (hyperParamSearch == "random" && randomSample > length(param)) { + stop(paste("\n Chosen amount of randomSamples is higher than the amount of + possible hyperparameter combinations.", "\n randomSample:", + randomSample, "\n Possible hyperparameter combinations:", + length(param), "\n Please lower the amount of randomSample")) } if (hyperParamSearch == "random") { - suppressWarnings(withr::with_seed(randomSampleSeed, {param <- param[sample(length(param), randomSample)]})) + suppressWarnings(withr::with_seed(randomSampleSeed, + {param <- param[sample(length(param), + randomSample)]})) } - attr(param, 'settings')$modelType <- "Transformer" + attr(param, "settings")$modelType <- "Transformer" results <- list( fitFunction = "fitEstimator", param = param, @@ -186,4 +195,4 @@ setTransformer <- function(numBlocks = 3, class(results) <- "modelSettings" return(results) -} \ No newline at end of file +} diff --git a/inst/python/Dataset.py b/inst/python/Dataset.py index 98d9ed3..b488520 100644 --- a/inst/python/Dataset.py +++ b/inst/python/Dataset.py @@ -8,29 +8,32 @@ class Data(Dataset): - def __init__(self, - data, - labels=None, - numerical_features=None): + def __init__(self, data, labels=None, numerical_features=None): """ data: path to a covariates dataframe either arrow dataset or sqlite object labels: a list of either 0 or 1, 1 if the patient got the outcome numerical_features: list of indices where the numerical features are """ start = time.time() - if pathlib.Path(data).suffix == '.sqlite': + if pathlib.Path(data).suffix == ".sqlite": data = urllib.parse.quote(data) - data = pl.read_database("SELECT * from covariates", - connection=f"sqlite://{data}").lazy() + data = pl.read_database( + "SELECT * from covariates", connection=f"sqlite://{data}" + ).lazy() else: - data = pl.scan_ipc(pathlib.Path(data).joinpath('covariates/*.arrow')) - observations = data.select(pl.col('rowId').max()).collect()[0, 0] + data = pl.scan_ipc(pathlib.Path(data).joinpath("covariates/*.arrow")) + observations = data.select(pl.col("rowId").max()).collect()[0, 0] # detect features are numeric if numerical_features is None: - self.numerical_features = data.groupby(by='columnId') \ - .n_unique().filter(pl.col('covariateValue') > 1).select('columnId').collect()['columnId'] + self.numerical_features = ( + data.groupby(by="columnId") + .n_unique() + .filter(pl.col("covariateValue") > 1) + .select("columnId") + .collect()["columnId"] + ) else: - self.numerical_features = pl.Series('num', numerical_features) + self.numerical_features = pl.Series("num", numerical_features) if labels: self.target = torch.as_tensor(labels) @@ -42,42 +45,61 @@ def __init__(self, # create newColumnId from 1 (or zero?) 
until # catColumns # select rowId and newColumnId # rename newColumnId to columnId and sort by it - data_cat = data.filter(~pl.col('columnId') - .is_in(self.numerical_features)) \ - .sort(by='columnId').with_row_count('newColumnId'). \ - with_columns(pl.col('newColumnId').first().over('columnId') - .rank(method="dense")) \ - .select(pl.col('rowId'), pl.col('newColumnId').alias('columnId')).sort('rowId') \ - .with_columns(pl.col('rowId') - 1).collect() + data_cat = ( + data.filter(~pl.col("columnId").is_in(self.numerical_features)) + .select(pl.col("rowId"), pl.col("columnId")) + .sort(["rowId", "columnId"]) + .with_columns(pl.col("rowId") - 1) + .collect() + ) cat_tensor = torch.as_tensor(data_cat.to_numpy()) - tensor_list = torch.split(cat_tensor[:, 1], torch.unique_consecutive(cat_tensor[:, 0], return_counts=True)[1]. - tolist()) + tensor_list = torch.split( + cat_tensor[:, 1], + torch.unique_consecutive(cat_tensor[:, 0], return_counts=True)[1].tolist(), + ) # because of subjects without cat features, I need to create a list with all zeroes and then insert # my tensorList. That way I can still index the dataset correctly. total_list = [torch.as_tensor((0,))] * observations - idx = data_cat['rowId'].unique().to_list() + idx = data_cat["rowId"].unique().to_list() for i, i2 in enumerate(idx): total_list[i2] = tensor_list[i] self.cat = torch.nn.utils.rnn.pad_sequence(total_list, batch_first=True) - self.cat_features = data_cat['columnId'].unique() + self.cat_features = data_cat["columnId"].unique() # numerical data, # N x C, dense matrix with values for N patients/visits for C numerical features if pl.count(self.numerical_features) == 0: self.num = None else: - numerical_data = data.filter(pl.col('columnId').is_in(self.numerical_features)).sort(by='columnId'). \ - with_row_count('newColumnId').with_columns(pl.col('newColumnId').first().over('columnId'). 
- rank(method="dense") - 1, pl.col('rowId') - 1) \ - .select(pl.col('rowId'), pl.col('newColumnId').alias('columnId'), pl.col('covariateValue')).collect() - indices = torch.as_tensor(numerical_data.select(['rowId', 'columnId']).to_numpy(), dtype=torch.long) - values = torch.as_tensor(numerical_data.select('covariateValue').to_numpy(), dtype=torch.float) - self.num = torch.sparse_coo_tensor(indices=indices.T, - values=values.squeeze(), - size=(observations, pl.count(self.numerical_features))).to_dense() + map_numerical = dict(zip(self.numerical_features.sort().to_list(), + list(range(len(self.numerical_features))))) + + numerical_data = ( + data.filter(pl.col("columnId").is_in(self.numerical_features)) + .with_columns(pl.col("columnId").replace(map_numerical), + pl.col("rowId") - 1) + .select( + pl.col("rowId"), + pl.col("columnId"), + pl.col("covariateValue"), + ) + .collect() + ) + indices = torch.as_tensor( + numerical_data.select(["rowId", "columnId"]).to_numpy(), + dtype=torch.long, + ) + values = torch.as_tensor( + numerical_data.select("covariateValue").to_numpy(), dtype=torch.float + ) + self.num = torch.sparse_coo_tensor( + indices=indices.T, + values=values.squeeze(), + size=(observations, pl.count(self.numerical_features)), + ).to_dense() delta = time.time() - start - print(f'Processed data in {delta:.2f} seconds') + print(f"Processed data in {delta:.2f} seconds") def get_numerical_features(self): return self.numerical_features @@ -90,13 +112,15 @@ def __len__(self): def __getitem__(self, item): if self.num is not None: - batch = {"cat": self.cat[item, :], - "num": self.num[item, :]} + batch = {"cat": self.cat[item, :], "num": self.num[item, :]} else: - batch = {"cat": self.cat[item, :].squeeze(), - "num": None} + batch = {"cat": self.cat[item, :].squeeze(), "num": None} if batch["cat"].dim() == 1 and not isinstance(item, list): batch["cat"] = batch["cat"].unsqueeze(0) - if batch["num"] is not None and batch["num"].dim() == 1 and not isinstance(item, list): + if ( + batch["num"] is not None + and batch["num"].dim() == 1 + and not isinstance(item, list) + ): batch["num"] = batch["num"].unsqueeze(0) return [batch, self.target[item].squeeze()] diff --git a/inst/python/Estimator.py b/inst/python/Estimator.py index f8a182f..983bde6 100644 --- a/inst/python/Estimator.py +++ b/inst/python/Estimator.py @@ -5,7 +5,6 @@ from torch.utils.data import DataLoader, BatchSampler, RandomSampler, SequentialSampler import torch.nn.functional as F from tqdm import tqdm -from sklearn.metrics import roc_auc_score class Estimator: @@ -13,10 +12,7 @@ class Estimator: A class that wraps around pytorch models. 
""" - def __init__(self, - model, - model_parameters, - estimator_settings): + def __init__(self, model, model_parameters, estimator_settings): self.seed = estimator_settings["seed"] if callable(estimator_settings["device"]): self.device = estimator_settings["device"]() @@ -37,31 +33,51 @@ def __init__(self, self.previous_epochs = int(estimator_settings.get("previous_epochs", 0)) self.model.to(device=self.device) - self.optimizer = estimator_settings["optimizer"](params=self.model.parameters(), - lr=self.learning_rate, - weight_decay=self.weight_decay) + self.optimizer = estimator_settings["optimizer"]( + params=self.model.parameters(), + lr=self.learning_rate, + weight_decay=self.weight_decay, + ) self.criterion = estimator_settings["criterion"]() - if "metric" in estimator_settings.keys() and estimator_settings["metric"] is not None: + if ( + "metric" in estimator_settings.keys() + and estimator_settings["metric"] is not None + ): self.metric = estimator_settings["metric"] - if type(self.metric) == str: + if isinstance(self.metric, str): if self.metric == "auc": - self.metric = {"name": "auc", - "mode": "max"} + self.metric = {"name": "auc", "mode": "max"} elif self.metric == "loss": - self.metric = {"name": "loss", - "mode": "min"} - if "scheduler" in estimator_settings.keys() and estimator_settings["scheduler"] is not None: + self.metric = {"name": "loss", "mode": "min"} + if ( + "scheduler" in estimator_settings.keys() + and estimator_settings["scheduler"] is not None + ): estimator_settings["scheduler"]["params"]["mode"] = self.metric["mode"] - if "early_stopping" in estimator_settings.keys() and estimator_settings["early_stopping"] is not None: - estimator_settings["early_stopping"]["params"]["mode"] = self.metric["mode"] - - if "scheduler" in estimator_settings.keys() and estimator_settings["scheduler"] is not None: - self.scheduler = estimator_settings["scheduler"]["fun"](self.optimizer, - **estimator_settings["scheduler"]["params"]) - - if "early_stopping" in estimator_settings.keys() and estimator_settings["early_stopping"] is not None: - self.early_stopper = EarlyStopping(**estimator_settings["early_stopping"]["params"]) + if ( + "early_stopping" in estimator_settings.keys() + and estimator_settings["early_stopping"] is not None + ): + estimator_settings["early_stopping"]["params"]["mode"] = self.metric[ + "mode" + ] + + if ( + "scheduler" in estimator_settings.keys() + and estimator_settings["scheduler"] is not None + ): + self.scheduler = estimator_settings["scheduler"]["fun"]( + self.optimizer, **estimator_settings["scheduler"]["params"] + ) + + if ( + "early_stopping" in estimator_settings.keys() + and estimator_settings["early_stopping"] is not None + ): + self.early_stopper = EarlyStopping( + **estimator_settings["early_stopping"]["params"] + ) else: self.early_stopper = None @@ -70,21 +86,24 @@ def __init__(self, self.learn_rate_schedule = None def fit(self, dataset, test_dataset): - - train_dataloader = DataLoader(dataset=dataset, - batch_size=None, - sampler=BatchSampler( - sampler=RandomSampler(dataset), - batch_size=self.batch_size, - drop_last=True - )) - test_dataloader = DataLoader(dataset=test_dataset, - batch_size=None, - sampler=BatchSampler( - sampler=SequentialSampler(test_dataset), - batch_size=self.batch_size, - drop_last=False - )) + train_dataloader = DataLoader( + dataset=dataset, + batch_size=None, + sampler=BatchSampler( + sampler=RandomSampler(dataset), + batch_size=self.batch_size, + drop_last=True, + ), + ) + test_dataloader = DataLoader( + 
dataset=test_dataset, + batch_size=None, + sampler=BatchSampler( + sampler=SequentialSampler(test_dataset), + batch_size=self.batch_size, + drop_last=False, + ), + ) trained_epochs = dict() times = list() @@ -106,19 +125,25 @@ def fit(self, dataset, test_dataset): times.append(round(delta_time, 3)) if self.early_stopper: - self.early_stopper(scores['metric']) + self.early_stopper(scores["metric"]) if self.early_stopper.improved: model_state_dict[epoch] = self.model.state_dict() trained_epochs[epoch] = current_epoch if self.early_stopper.early_stop: print("Early stopping, validation metric stopped improving") - print(f'Average time per epoch was: {torch.mean(torch.as_tensor(times)).item():.2f} seconds') - self.finish_fit(all_scores, model_state_dict, trained_epochs, learning_rates) + print( + f"Average time per epoch was: {torch.mean(torch.as_tensor(times)).item():.2f} seconds" + ) + self.finish_fit( + all_scores, model_state_dict, trained_epochs, learning_rates + ) return else: model_state_dict[epoch] = self.model.state_dict() trained_epochs[epoch] = current_epoch - print(f'Average time per epoch was: {torch.mean(torch.as_tensor(times)).item()} seconds') + print( + f"Average time per epoch was: {torch.mean(torch.as_tensor(times)).item()} seconds" + ) self.finish_fit(all_scores, model_state_dict, trained_epochs, learning_rates) return @@ -155,8 +180,7 @@ def score(self, dataloader): mean_loss = loss.mean().item() predictions = torch.concat(predictions) targets = torch.concat(targets) - auc = roc_auc_score(targets.cpu(), predictions.cpu()) - # auc = compute_auc(predictions, targets) + auc = compute_auc(targets.cpu(), predictions.cpu()) scores = dict() if self.metric: if self.metric["name"] == "auc": @@ -172,48 +196,68 @@ def score(self, dataloader): def finish_fit(self, scores, model_state_dict, epoch, learning_rates): if self.metric["mode"] == "max": - best_epoch_index = torch.argmax(torch.as_tensor([x["metric"] for x in scores])).item() + best_epoch_index = torch.argmax( + torch.as_tensor([x["metric"] for x in scores]) + ).item() elif self.metric["mode"] == "min": - best_epoch_index = torch.argmin(torch.as_tensor([x["metric"] for x in scores])).item() + best_epoch_index = torch.argmin( + torch.as_tensor([x["metric"] for x in scores]) + ).item() best_model_state_dict = model_state_dict[best_epoch_index] self.model.load_state_dict(best_model_state_dict) self.best_epoch = epoch[best_epoch_index] - self.best_score = {"loss": scores[best_epoch_index]["loss"], - "auc": scores[best_epoch_index]["auc"]} - self.learn_rate_schedule = learning_rates[:(best_epoch_index+1)] + self.best_score = { + "loss": scores[best_epoch_index]["loss"], + "auc": scores[best_epoch_index]["auc"], + } + self.learn_rate_schedule = learning_rates[: (best_epoch_index + 1)] print(f"Loaded best model (based on AUC) from epoch {self.best_epoch}") print(f"ValLoss: {self.best_score['loss']}") print(f"valAUC: {self.best_score['auc']}") - if self.metric and self.metric["name"] != "auc" and self.metric["name"] != "loss": + if ( + self.metric + and self.metric["name"] != "auc" + and self.metric["name"] != "loss" + ): self.best_score[self.metric["name"]] = scores[best_epoch_index]["metric"] print(f"{self.metric['name']}: {self.best_score[self.metric['name']]}") return def print_progress(self, scores, training_loss, delta_time, current_epoch): - if self.metric and self.metric["name"] != "auc" and self.metric["name"] != "loss": - print(f"Epochs: {current_epoch} | Val {self.metric['name']}: {scores['metric']:.3f} " - f"| Val AUC: 
{scores['auc']:.3f} | Val Loss: {scores['loss']:.3f} " - f"| Train Loss: {training_loss:.3f} | Time: {delta_time:.3f} seconds " - f"| LR: {self.optimizer.param_groups[0]['lr']}") + if ( + self.metric + and self.metric["name"] != "auc" + and self.metric["name"] != "loss" + ): + print( + f"Epochs: {current_epoch} | Val {self.metric['name']}: {scores['metric']:.3f} " + f"| Val AUC: {scores['auc']:.3f} | Val Loss: {scores['loss']:.3f} " + f"| Train Loss: {training_loss:.3f} | Time: {delta_time:.3f} seconds " + f"| LR: {self.optimizer.param_groups[0]['lr']}" + ) else: - print(f"Epochs: {current_epoch} " - f"| Val AUC: {scores['auc']:.3f} " - f"| Val Loss: {scores['loss']:.3f} " - f"| Train Loss: {training_loss:.3f} " - f"| Time: {delta_time:.3f} seconds " - f"| LR: {self.optimizer.param_groups[0]['lr']}") + print( + f"Epochs: {current_epoch} " + f"| Val AUC: {scores['auc']:.3f} " + f"| Val Loss: {scores['loss']:.3f} " + f"| Train Loss: {training_loss:.3f} " + f"| Time: {delta_time:.3f} seconds " + f"| LR: {self.optimizer.param_groups[0]['lr']}" + ) return def fit_whole_training_set(self, dataset, learning_rates=None): - dataloader = DataLoader(dataset=dataset, - batch_size=None, - sampler=BatchSampler( - sampler=RandomSampler(dataset), - batch_size=self.batch_size, - drop_last=True - )) + dataloader = DataLoader( + dataset=dataset, + batch_size=None, + sampler=BatchSampler( + sampler=RandomSampler(dataset), + batch_size=self.batch_size, + drop_last=True, + ), + ) if isinstance(learning_rates, list): self.best_epoch = len(learning_rates) elif ~isinstance(learning_rates, list): @@ -223,7 +267,7 @@ def fit_whole_training_set(self, dataset, learning_rates=None): self.best_epoch = self.epochs for epoch in range(self.best_epoch): - self.optimizer.param_groups[0]['lr'] = learning_rates[epoch] + self.optimizer.param_groups[0]["lr"] = learning_rates[epoch] self.fit_epoch(dataloader) return @@ -233,19 +277,21 @@ def save(self, path, name): model_state_dict=self.model.state_dict(), model_parameters=self.model_parameters, estimator_settings=self.estimator_settings, - epoch=self.epochs) - torch.save(out, - f=save_path) + epoch=self.epochs, + ) + torch.save(out, f=save_path) return save_path def predict_proba(self, dataset): - dataloader = DataLoader(dataset=dataset, - batch_size=None, - sampler=BatchSampler( - sampler=SequentialSampler(dataset), - batch_size=self.batch_size, - drop_last=False - )) + dataloader = DataLoader( + dataset=dataset, + batch_size=None, + sampler=BatchSampler( + sampler=SequentialSampler(dataset), + batch_size=self.batch_size, + drop_last=False, + ), + ) with torch.no_grad(): predictions = list() self.model.eval() @@ -263,15 +309,11 @@ def predict(self, dataset, threshold=None): # use outcome rate threshold = dataset.target.sum().item() / len(dataset) predicted_class = predictions > threshold + return predicted_class class EarlyStopping: - - def __init__(self, - patience=3, - delta=0, - verbose=True, - mode='max'): + def __init__(self, patience=3, delta=0, verbose=True, mode="max"): self.patience = patience self.counter = 0 self.verbose = verbose @@ -282,9 +324,8 @@ def __init__(self, self.previous_score = 0 self.mode = mode - def __call__(self, - metric): - if self.mode == 'max': + def __call__(self, metric): + if self.mode == "max": score = metric else: score = -1 * metric @@ -295,8 +336,9 @@ def __call__(self, self.counter += 1 self.improved = False if self.verbose: - print(f"Early stopping counter: {self.counter}" - f" out of {self.patience}") + print( + f"Early stopping 
counter: {self.counter}" f" out of {self.patience}" + ) if self.counter >= self.patience: self.early_stop = True else: @@ -306,12 +348,12 @@ def __call__(self, self.previous_score = score -def batch_to_device(batch, device='cpu'): +def batch_to_device(batch, device="cpu"): if torch.is_tensor(batch): batch = batch.to(device=device) else: for ix, b in enumerate(batch): - if type(b) is str: + if isinstance(b, str): key = b b = batch[b] else: @@ -330,26 +372,26 @@ def batch_to_device(batch, device='cpu'): return batch -def compute_auc(input, target, n_threshold=1000): - threshold = torch.linspace(0, 1.0, n_threshold).to(device=input.device) - pred_label = input >= threshold[:, None, None] - input_target = pred_label * target +def compute_auc(y_true, y_pred): + """ + Computes the AUC score for binary classification predictions with a fast algorithm. + Args: + y_true (torch.Tensor): True binary labels. + y_pred (torch.Tensor): Predicted scores. + Returns: + float: Computed AUC score. + """ + # Ensure inputs are sorted by predicted score + _, sorted_indices = torch.sort(y_pred, descending=True) + y_true_sorted = y_true[sorted_indices] - cum_tp = F.pad(input_target.sum(dim=-1).rot90(1, [1, 0]), (1, 0), value=0.0) - cum_fp = F.pad( - (pred_label.sum(dim=-1) - input_target.sum(dim=-1)).rot90(1, [1, 0]), - (1, 0), - value=0.0, - ) + # Get the number of positive and negative examples + n_pos = y_true_sorted.sum() + n_neg = (1 - y_true_sorted).sum() - if len(cum_tp.shape) > 1: - factor = cum_tp[:, -1] * cum_fp[:, -1] - else: - factor = cum_tp[-1] * cum_fp[-1] - # Set AUROC to 0.5 when the target contains all ones or all zeros. - auroc = torch.where( - factor == 0, - 0.5, - torch.trapz(cum_tp, cum_fp).double() / factor, - ) - return auroc.item() + # for every negative label, count preceding positive labels in sorted labels + num_crossings = torch.cumsum(y_true_sorted, 0)[y_true_sorted == 0].sum() + + # Compute AUC + auc = num_crossings / (n_pos * n_neg) + return auc diff --git a/inst/python/LrFinder.py b/inst/python/LrFinder.py index 9d5bd0c..e7141cd 100644 --- a/inst/python/LrFinder.py +++ b/inst/python/LrFinder.py @@ -2,17 +2,13 @@ import torch from torch.optim.lr_scheduler import _LRScheduler -from torch.utils.data import DataLoader, BatchSampler, RandomSampler from tqdm import tqdm from Estimator import batch_to_device class ExponentialSchedulerPerBatch(_LRScheduler): - - def __init__(self, optimizer, - end_lr, - num_iter): + def __init__(self, optimizer, end_lr, num_iter): self.end_lr = end_lr self.num_iter = num_iter super(ExponentialSchedulerPerBatch, self).__init__(optimizer, last_epoch=-1) @@ -23,14 +19,9 @@ def get_lr(self): class LrFinder: - - def __init__(self, - model, - model_parameters, - estimator_settings, - lr_settings): + def __init__(self, model, model_parameters, estimator_settings, lr_settings): if lr_settings is None: - lr_settings = {} + lr_settings = {} min_lr = lr_settings.get("min_lr", 1e-7) max_lr = lr_settings.get("max_lr", 1) num_lr = lr_settings.get("num_lr", 100) @@ -50,13 +41,16 @@ def __init__(self, self.smooth = smooth self.divergence_threshold = divergence_threshold - self.optimizer = estimator_settings['optimizer'](params=self.model.parameters(), - lr=self.min_lr) + self.optimizer = estimator_settings["optimizer"]( + params=self.model.parameters(), lr=self.min_lr + ) - self.scheduler = ExponentialSchedulerPerBatch(self.optimizer, self.max_lr, self.num_lr) + self.scheduler = ExponentialSchedulerPerBatch( + self.optimizer, self.max_lr, self.num_lr + ) self.criterion = 
estimator_settings["criterion"]() - self.batch_size = int(estimator_settings['batch_size']) + self.batch_size = int(estimator_settings["batch_size"]) self.losses = None self.loss_index = None @@ -74,7 +68,9 @@ def get_lr(self, dataset): out = self.model(batch[0]) loss = self.criterion(out, batch[1]) if self.smooth is not None and i != 0: - losses[i] = self.smooth * loss.item() + (1 - self.smooth) * losses[i - 1] + losses[i] = ( + self.smooth * loss.item() + (1 - self.smooth) * losses[i - 1] + ) else: losses[i] = loss.item() lrs[i] = self.optimizer.param_groups[0]["lr"] @@ -90,13 +86,15 @@ def get_lr(self, dataset): best_loss = losses[i] if losses[i] > (self.divergence_threshold * best_loss): - print(f"Loss diverged - stopped early - iteration {i} out of {self.num_lr}") + print( + f"Loss diverged - stopped early - iteration {i} out of {self.num_lr}" + ) break # find LR where gradient is highest but before global minimum is reached # I added -5 to make sure it is not still in the minimum global_minimum = torch.argmin(losses) - gradient = torch.diff(losses[:(global_minimum - 5)+1]) + gradient = torch.diff(losses[: (global_minimum - 5) + 1]) smallest_gradient = torch.argmin(gradient) suggested_lr = lrs[smallest_gradient] diff --git a/inst/python/MLP.py b/inst/python/MLP.py index 3b8b03b..cd049a6 100644 --- a/inst/python/MLP.py +++ b/inst/python/MLP.py @@ -4,17 +4,18 @@ class MLP(nn.Module): - - def __init__(self, - cat_features: int, - num_features: int, - size_embedding: int, - size_hidden: int, - num_layers: int, - activation=nn.ReLU, - normalization=nn.BatchNorm1d, - dropout=None, - d_out: int = 1): + def __init__( + self, + cat_features: int, + num_features: int, + size_embedding: int, + size_hidden: int, + num_layers: int, + activation=nn.ReLU, + normalization=nn.BatchNorm1d, + dropout=None, + d_out: int = 1, + ): super(MLP, self).__init__() self.name = "MLP" cat_features = int(cat_features) @@ -23,21 +24,25 @@ def __init__(self, size_hidden = int(size_hidden) num_layers = int(num_layers) d_out = int(d_out) - - self.embedding = nn.EmbeddingBag(cat_features + 1, - size_embedding, - padding_idx=0) + + self.embedding = nn.EmbeddingBag( + cat_features + 1, size_embedding, padding_idx=0 + ) if num_features != 0 and num_features is not None: self.num_embedding = NumericalEmbedding(num_features, size_embedding) self.first_layer = nn.Linear(size_embedding, size_hidden) - self.layers = nn.ModuleList(MLPLayer(size_hidden=size_hidden, - normalization=normalization, - activation=activation, - dropout=dropout) - for _ in range(num_layers)) + self.layers = nn.ModuleList( + MLPLayer( + size_hidden=size_hidden, + normalization=normalization, + activation=activation, + dropout=dropout, + ) + for _ in range(num_layers) + ) self.last_norm = normalization(size_hidden) self.head = nn.Linear(size_hidden, d_out) @@ -62,12 +67,14 @@ def forward(self, input): class MLPLayer(nn.Module): - def __init__(self, - size_hidden=64, - normalization=nn.BatchNorm1d, - activation=nn.ReLU, - dropout=0.0, - bias=True): + def __init__( + self, + size_hidden=64, + normalization=nn.BatchNorm1d, + activation=nn.ReLU, + dropout=0.0, + bias=True, + ): super(MLPLayer, self).__init__() self.norm = normalization(size_hidden) self.activation = activation() diff --git a/inst/python/ResNet.py b/inst/python/ResNet.py index cef4b49..9df35d8 100644 --- a/inst/python/ResNet.py +++ b/inst/python/ResNet.py @@ -5,22 +5,23 @@ class ResNet(nn.Module): - - def __init__(self, - cat_features: int, - num_features: int = 0, - size_embedding: int = 256, 
- size_hidden: int = 256, - num_layers: int = 2, - hidden_factor: int = 1, - activation=nn.ReLU, - normalization=nn.BatchNorm1d, - hidden_dropout=0, - residual_dropout=0, - dim_out: int = 1, - concat_num=True): + def __init__( + self, + cat_features: int, + num_features: int = 0, + size_embedding: int = 256, + size_hidden: int = 256, + num_layers: int = 2, + hidden_factor: int = 1, + activation=nn.ReLU, + normalization=nn.BatchNorm1d, + hidden_dropout=0, + residual_dropout=0, + dim_out: int = 1, + concat_num=True, + ): super(ResNet, self).__init__() - self.name = 'ResNet' + self.name = "ResNet" cat_features = int(cat_features) num_features = int(num_features) size_embedding = int(size_embedding) @@ -28,12 +29,9 @@ def __init__(self, num_layers = int(num_layers) hidden_factor = int(hidden_factor) dim_out = int(dim_out) - - + self.embedding = nn.EmbeddingBag( - num_embeddings=cat_features + 1, - embedding_dim=size_embedding, - padding_idx=0 + num_embeddings=cat_features + 1, embedding_dim=size_embedding, padding_idx=0 ) if num_features != 0 and not concat_num: self.num_embedding = NumericalEmbedding(num_features, size_embedding) @@ -45,9 +43,17 @@ def __init__(self, res_hidden = size_hidden * hidden_factor - self.layers = nn.ModuleList(ResLayer(size_hidden, res_hidden, normalization, - activation, hidden_dropout, residual_dropout) - for _ in range(num_layers)) + self.layers = nn.ModuleList( + ResLayer( + size_hidden, + res_hidden, + normalization, + activation, + hidden_dropout, + residual_dropout, + ) + for _ in range(num_layers) + ) self.last_norm = normalization(size_hidden) @@ -58,7 +64,11 @@ def __init__(self, def forward(self, x): x_cat = x["cat"] x_cat = self.embedding(x_cat) - if "num" in x.keys() and x["num"] is not None and self.num_embedding is not None: + if ( + "num" in x.keys() + and x["num"] is not None + and self.num_embedding is not None + ): x_num = x["num"] # take the average af numerical and categorical embeddings x = (x_cat + self.num_embedding(x_num).mean(dim=1)) / 2 @@ -79,14 +89,15 @@ def forward(self, x): class ResLayer(nn.Module): - - def __init__(self, - size_hidden, - res_hidden, - normalization, - activation, - hidden_dropout=None, - residual_dropout=None): + def __init__( + self, + size_hidden, + res_hidden, + normalization, + activation, + hidden_dropout=None, + residual_dropout=None, + ): super(ResLayer, self).__init__() self.norm = normalization(size_hidden) @@ -114,10 +125,7 @@ def forward(self, input): class NumericalEmbedding(nn.Module): - def __init__(self, - num_embeddings, - embedding_dim, - bias=True): + def __init__(self, num_embeddings, embedding_dim, bias=True): super(NumericalEmbedding, self).__init__() self.weight = nn.Parameter(torch.empty(num_embeddings, embedding_dim)) if bias: @@ -134,6 +142,3 @@ def forward(self, input): if self.bias is not None: x = x + self.bias[None] return x - - - diff --git a/inst/python/Transformer.py b/inst/python/Transformer.py index 1c95b36..ddbe929 100644 --- a/inst/python/Transformer.py +++ b/inst/python/Transformer.py @@ -18,23 +18,24 @@ def forward(self, x): class Transformer(nn.Module): - - def __init__(self, - cat_features: int, - num_features: int, - num_blocks: int, - dim_token: int, - num_heads: int, - att_dropout, - ffn_dropout, - res_dropout, - dim_hidden: int, - dim_out: int = 1, - head_activation=nn.ReLU, - activation=ReGLU, - ffn_norm=nn.LayerNorm, - head_norm=nn.LayerNorm, - att_norm=nn.LayerNorm): + def __init__( + self, + cat_features: int, + num_features: int, + num_blocks: int, + dim_token: int, + 
num_heads: int, + att_dropout, + ffn_dropout, + res_dropout, + dim_hidden: int, + dim_out: int = 1, + head_activation=nn.ReLU, + activation=ReGLU, + ffn_norm=nn.LayerNorm, + head_norm=nn.LayerNorm, + att_norm=nn.LayerNorm, + ): super(Transformer, self).__init__() self.name = "Transformer" cat_features = int(cat_features) @@ -44,8 +45,10 @@ def __init__(self, num_heads = int(num_heads) dim_hidden = int(dim_hidden) dim_out = int(dim_out) - - self.categorical_embedding = nn.Embedding(cat_features + 1, dim_token, padding_idx=0) + + self.categorical_embedding = nn.Embedding( + cat_features + 1, dim_token, padding_idx=0 + ) if num_features != 0 and num_features is not None: self.numerical_embedding = NumericalEmbedding(num_features, dim_token) @@ -56,27 +59,35 @@ def __init__(self, self.layers = nn.ModuleList([]) for layer_idx in range(num_blocks): - layer = nn.ModuleDict({ - "attention": nn.MultiheadAttention(dim_token, num_heads, - dropout=att_dropout), - "ffn": FeedForwardBlock(dim_token, dim_hidden, - bias_first=True, - bias_second=True, - dropout=ffn_dropout, - activation=activation), - "attention_res_dropout": nn.Dropout(res_dropout), - "ffn_res_dropout": nn.Dropout(res_dropout), - "ffn_norm": ffn_norm(dim_token) - }) + layer = nn.ModuleDict( + { + "attention": nn.MultiheadAttention( + dim_token, num_heads, dropout=att_dropout + ), + "ffn": FeedForwardBlock( + dim_token, + dim_hidden, + bias_first=True, + bias_second=True, + dropout=ffn_dropout, + activation=activation, + ), + "attention_res_dropout": nn.Dropout(res_dropout), + "ffn_res_dropout": nn.Dropout(res_dropout), + "ffn_norm": ffn_norm(dim_token), + } + ) if layer_idx != 0: layer["attention_norm"] = att_norm(dim_token) self.layers.append(layer) - self.head = Head(dim_token, - bias=True, - activation=head_activation, - normalization=head_norm, - dim_out=dim_out) + self.head = Head( + dim_token, + bias=True, + activation=head_activation, + normalization=head_norm, + dim_out=dim_out, + ) def forward(self, x): mask = torch.where(x["cat"] == 0, True, False) @@ -84,31 +95,34 @@ def forward(self, x): if self.use_numerical: num = self.numerical_embedding(x["num"]) x = torch.cat([cat, num], dim=1) - mask = torch.cat([mask, torch.zeros([x.shape[0], - num.shape[1]], - device=mask.device, - dtype=mask.dtype)], - dim=1) + mask = torch.cat( + [ + mask, + torch.zeros( + [x.shape[0], num.shape[1]], device=mask.device, dtype=mask.dtype + ), + ], + dim=1, + ) else: x = cat x = self.class_token(x) - mask = torch.cat([mask, torch.zeros([x.shape[0], 1], - device=mask.device, - dtype=mask.dtype)], - dim=1) + mask = torch.cat( + [mask, torch.zeros([x.shape[0], 1], device=mask.device, dtype=mask.dtype)], + dim=1, + ) for i, layer in enumerate(self.layers): x_residual = self.start_residual(layer, "attention", x) - if i == len(self.layers)-1: + if i == len(self.layers) - 1: dims = x_residual.shape x_residual = layer["attention"]( x_residual[:, -1].view([dims[0], 1, dims[2]]).transpose(0, 1), x_residual.transpose(0, 1), x_residual.transpose(0, 1), - mask + mask, ) - attn_weights = x_residual[1] x_residual = x_residual[0] x = x[:, -1].view([dims[0], 1, dims[2]]) else: @@ -116,7 +130,7 @@ def forward(self, x): x_residual.transpose(0, 1), x_residual.transpose(0, 1), x_residual.transpose(0, 1), - mask + mask, )[0] x = self.end_residual(layer, "attention", x, x_residual.transpose(0, 1)) @@ -141,14 +155,15 @@ def end_residual(layer, stage, x, x_residual): class FeedForwardBlock(nn.Module): - - def __init__(self, - dim_token, - dim_hidden, - bias_first=True, - 
bias_second=True,
-                 dropout=0.0,
-                 activation=ReGLU):
+    def __init__(
+        self,
+        dim_token,
+        dim_hidden,
+        bias_first=True,
+        bias_second=True,
+        dropout=0.0,
+        activation=ReGLU,
+    ):
         super(FeedForwardBlock, self).__init__()
         self.linear0 = nn.Linear(dim_token, int(dim_hidden * 2), bias=bias_first)
         self.activation = activation()
@@ -164,13 +179,7 @@ def forward(self, x):
 
 
 class Head(nn.Module):
-
-    def __init__(self,
-                 dim_in,
-                 bias,
-                 activation,
-                 normalization,
-                 dim_out):
+    def __init__(self, dim_in, bias, activation, normalization, dim_out):
         super(Head, self).__init__()
         self.normalization = normalization(dim_in)
         self.activation = activation()
@@ -183,17 +192,16 @@ def forward(self, x):
         x = self.linear(x)
         return x
 
-class ClassToken(nn.Module):
 
-    def __init__(self,
-                 dim_token):
+class ClassToken(nn.Module):
+    def __init__(self, dim_token):
         super(ClassToken, self).__init__()
         self.weight = nn.Parameter(torch.empty(dim_token, 1))
         nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5))
 
     def expand(self, dims):
         new_dims = [1] * (len(dims) - 1)
-        return self.weight.view(new_dims + [-1]).expand(dims +[-1])
+        return self.weight.view(new_dims + [-1]).expand(dims + [-1])
 
     def forward(self, x):
         return torch.cat([x, self.expand([x.shape[0], 1])], dim=1)
diff --git a/man/DeepPatientLevelPrediction.Rd b/man/DeepPatientLevelPrediction.Rd
index c97f163..9eb3b36 100644
--- a/man/DeepPatientLevelPrediction.Rd
+++ b/man/DeepPatientLevelPrediction.Rd
@@ -5,5 +5,6 @@
 \alias{DeepPatientLevelPrediction}
 \title{DeepPatientLevelPrediction}
 \description{
-A package containing deep learning extensions for developing prediction models using data in the OMOP CDM
+A package containing deep learning extensions for developing
+prediction models using data in the OMOP CDM
 }
diff --git a/man/setDefaultResNet.Rd b/man/setDefaultResNet.Rd
index c26f354..e1d7f46 100644
--- a/man/setDefaultResNet.Rd
+++ b/man/setDefaultResNet.Rd
@@ -16,6 +16,6 @@ setDefaultResNet(
 Creates settings for a default ResNet model
 }
 \details{
-Model architecture from by https://arxiv.org/abs/2106.11959 .
+Model architecture based on https://arxiv.org/abs/2106.11959 .
 Hyperparameters chosen by experience on a few prediction problems.
 }
diff --git a/man/setEstimator.Rd b/man/setEstimator.Rd
index fcb33ab..b8424a3 100644
--- a/man/setEstimator.Rd
+++ b/man/setEstimator.Rd
@@ -28,7 +28,8 @@ setEstimator(
 
 \item{epochs}{how many epochs to train for}
 
-\item{device}{what device to train on, can be a string or a function to that evaluates
+\item{device}{what device to train on, can be a string or a function
+that evaluates
 to the device during runtime}
 
 \item{optimizer}{which optimizer to use}
@@ -37,11 +38,15 @@ to the device during runtime}
 
 \item{criterion}{loss function to use}
 
-\item{earlyStopping}{If earlyStopping should be used which stops the training of your metric is not improving}
+\item{earlyStopping}{If earlyStopping should be used, which stops the
+training if your metric is not improving}
 
-\item{metric}{either `auc` or `loss` or a custom metric to use. This is the metric used for scheduler and earlyStopping.
-Needs to be a list with function `fun`, mode either `min` or `max` and a `name`,
-`fun` needs to be a function that takes in prediction and labels and outputs a score.}
+\item{metric}{either `auc` or `loss` or a custom metric to use. This is the
+metric used for scheduler and earlyStopping.
+Needs to be a list with function `fun`, mode either `min` or `max` and a
+`name`,
+`fun` needs to be a function that takes in prediction and labels and
+outputs a score.}
 
 \item{seed}{seed to initialize weights of model with}
 }
diff --git a/man/setMultiLayerPerceptron.Rd b/man/setMultiLayerPerceptron.Rd
index f3676cd..a5f96d7 100644
--- a/man/setMultiLayerPerceptron.Rd
+++ b/man/setMultiLayerPerceptron.Rd
@@ -19,15 +19,18 @@ setMultiLayerPerceptron(
 \arguments{
 \item{numLayers}{Number of layers in network, default: 1:8}
 
-\item{sizeHidden}{Amount of neurons in each default layer, default: 2^(6:9) (64 to 512)}
+\item{sizeHidden}{Amount of neurons in each default layer,
+default: 2^(6:9) (64 to 512)}
 
-\item{dropout}{How much dropout to apply after first linear, default: seq(0, 0.3, 0.05)}
+\item{dropout}{How much dropout to apply after first linear,
+default: seq(0, 0.3, 0.05)}
 
-\item{sizeEmbedding}{Size of embedding layer, default: 2^(6:9) (64 to 512)}
+\item{sizeEmbedding}{Size of embedding layer, default: 2^(6:9) (64 to 512)}
 
 \item{estimatorSettings}{settings of Estimator created with `setEstimator`}
 
-\item{hyperParamSearch}{Which kind of hyperparameter search to use random sampling or exhaustive grid search. default: 'random'}
+\item{hyperParamSearch}{Which kind of hyperparameter search to use, random
+sampling or exhaustive grid search. default: 'random'}
 
 \item{randomSample}{How many random samples from hyperparameter space to use}
 
diff --git a/man/setResNet.Rd b/man/setResNet.Rd
index fbe3d77..4f4c245 100644
--- a/man/setResNet.Rd
+++ b/man/setResNet.Rd
@@ -21,21 +21,28 @@ setResNet(
 \arguments{
 \item{numLayers}{Number of layers in network, default: 1:16}
 
-\item{sizeHidden}{Amount of neurons in each default layer, default: 2^(6:10) (64 to 1024)}
+\item{sizeHidden}{Amount of neurons in each default layer, default:
+2^(6:10) (64 to 1024)}
 
-\item{hiddenFactor}{How much to grow the amount of neurons in each ResLayer, default: 1:4}
+\item{hiddenFactor}{How much to grow the amount of neurons in each
+ResLayer, default: 1:4}
 
-\item{residualDropout}{How much dropout to apply after last linear layer in ResLayer, default: seq(0, 0.3, 0.05)}
+\item{residualDropout}{How much dropout to apply after last linear layer
+in ResLayer, default: seq(0, 0.3, 0.05)}
 
-\item{hiddenDropout}{How much dropout to apply after first linear layer in ResLayer, default: seq(0, 0.3, 0.05)}
+\item{hiddenDropout}{How much dropout to apply after first linear layer
+in ResLayer, default: seq(0, 0.3, 0.05)}
 
-\item{sizeEmbedding}{Size of embedding layer, default: 2^(6:9) (64 to 512)}
+\item{sizeEmbedding}{Size of embedding layer, default: 2^(6:9)
+(64 to 512)}
 
 \item{estimatorSettings}{created with ```setEstimator```}
 
-\item{hyperParamSearch}{Which kind of hyperparameter search to use random sampling or exhaustive grid search. default: 'random'}
+\item{hyperParamSearch}{Which kind of hyperparameter search to use, random
+sampling or exhaustive grid search. 
default: 'random'}
 
-\item{randomSample}{How many random samples from hyperparameter space to use}
+\item{randomSample}{How many random samples from hyperparameter space
+to use}
 
 \item{randomSampleSeed}{Random seed to sample hyperparameter combinations}
 }
diff --git a/man/setTransformer.Rd b/man/setTransformer.Rd
index 7de56c4..9afcbd5 100644
--- a/man/setTransformer.Rd
+++ b/man/setTransformer.Rd
@@ -26,7 +26,8 @@ setTransformer(
 
 \item{dimToken}{dimension of each token (embedding size)}
 
-\item{dimOut}{dimension of output, usually 1 for binary problems}
+\item{dimOut}{dimension of output, usually 1 for binary
+problems}
 
 \item{numHeads}{number of attention heads}
 
@@ -38,15 +39,19 @@ setTransformer(
 
 \item{dimHidden}{dimension of the feedforward block}
 
-\item{dimHiddenRatio}{dimension of the feedforward block as a ratio of dimToken (embedding size)}
+\item{dimHiddenRatio}{dimension of the feedforward block as a ratio
+of dimToken (embedding size)}
 
 \item{estimatorSettings}{created with `setEstimator`}
 
-\item{hyperParamSearch}{what kind of hyperparameter search to do, default 'random'}
+\item{hyperParamSearch}{what kind of hyperparameter search to do,
+default 'random'}
 
-\item{randomSample}{How many samples to use in hyperparameter search if random}
+\item{randomSample}{How many samples to use in hyperparameter
+search if random}
 
-\item{randomSampleSeed}{Random seed to sample hyperparameter combinations}
+\item{randomSampleSeed}{Random seed to sample hyperparameter
+combinations}
 }
 \description{
 A transformer model
diff --git a/man/TrainingCache.Rd b/man/trainingCache.Rd
similarity index 79%
rename from man/TrainingCache.Rd
rename to man/trainingCache.Rd
index c82bb23..77d2efc 100644
--- a/man/TrainingCache.Rd
+++ b/man/trainingCache.Rd
@@ -1,7 +1,7 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/TrainingCache-class.R
-\name{TrainingCache}
-\alias{TrainingCache}
+\name{trainingCache}
+\alias{trainingCache}
 \title{TrainingCache}
 \value{
 Whether the provided and cached parameter grid is identical
@@ -18,15 +18,15 @@ Parameter caching for training persistence and continuity
 \section{Methods}{
 \subsection{Public methods}{
 \itemize{
-\item \href{#method-TrainingCache-new}{\code{TrainingCache$new()}}
-\item \href{#method-TrainingCache-isParamGridIdentical}{\code{TrainingCache$isParamGridIdentical()}}
-\item \href{#method-TrainingCache-saveGridSearchPredictions}{\code{TrainingCache$saveGridSearchPredictions()}}
-\item \href{#method-TrainingCache-saveModelParams}{\code{TrainingCache$saveModelParams()}}
-\item \href{#method-TrainingCache-getGridSearchPredictions}{\code{TrainingCache$getGridSearchPredictions()}}
-\item \href{#method-TrainingCache-isFull}{\code{TrainingCache$isFull()}}
-\item \href{#method-TrainingCache-getLastGridSearchIndex}{\code{TrainingCache$getLastGridSearchIndex()}}
-\item \href{#method-TrainingCache-dropCache}{\code{TrainingCache$dropCache()}}
-\item \href{#method-TrainingCache-clone}{\code{TrainingCache$clone()}}
+\item \href{#method-TrainingCache-new}{\code{trainingCache$new()}}
+\item \href{#method-TrainingCache-isParamGridIdentical}{\code{trainingCache$isParamGridIdentical()}}
+\item \href{#method-TrainingCache-saveGridSearchPredictions}{\code{trainingCache$saveGridSearchPredictions()}}
+\item \href{#method-TrainingCache-saveModelParams}{\code{trainingCache$saveModelParams()}}
+\item \href{#method-TrainingCache-getGridSearchPredictions}{\code{trainingCache$getGridSearchPredictions()}}
+\item 
\href{#method-TrainingCache-isFull}{\code{trainingCache$isFull()}}
+\item \href{#method-TrainingCache-getLastGridSearchIndex}{\code{trainingCache$getLastGridSearchIndex()}}
+\item \href{#method-TrainingCache-dropCache}{\code{trainingCache$dropCache()}}
+\item \href{#method-TrainingCache-clone}{\code{trainingCache$clone()}}
 }
 }
 \if{html}{\out{<hr>}}
@@ -35,7 +35,7 @@ Parameter caching for training persistence and continuity
 \subsection{Method \code{new()}}{
 Creates a new training cache
 \subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{TrainingCache$new(inDir)}\if{html}{\out{</div>}}
+\if{html}{\out{<div class="r">}}\preformatted{trainingCache$new(inDir)}\if{html}{\out{</div>
}}
 }
 
 \subsection{Arguments}{
@@ -53,7 +53,7 @@ Checks whether the parameter grid in the model settings is identical to
 the cached parameters.
 \subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{TrainingCache$isParamGridIdentical(inModelParams)}\if{html}{\out{</div>}}
+\if{html}{\out{<div class="r">}}\preformatted{trainingCache$isParamGridIdentical(inModelParams)}\if{html}{\out{</div>
}}
 }
 
 \subsection{Arguments}{
@@ -70,7 +70,7 @@ the cached parameters.
 \subsection{Method \code{saveGridSearchPredictions()}}{
 Saves the grid search results to the training cache
 \subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{TrainingCache$saveGridSearchPredictions(inGridSearchPredictions)}\if{html}{\out{</div>}}
+\if{html}{\out{<div class="r">}}\preformatted{trainingCache$saveGridSearchPredictions(inGridSearchPredictions)}\if{html}{\out{</div>
}}
 }
 
 \subsection{Arguments}{
@@ -87,7 +87,7 @@ Saves the grid search results to the training cache
 \subsection{Method \code{saveModelParams()}}{
 Saves the parameter grid to the training cache
 \subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{TrainingCache$saveModelParams(inModelParams)}\if{html}{\out{</div>}}
+\if{html}{\out{<div class="r">}}\preformatted{trainingCache$saveModelParams(inModelParams)}\if{html}{\out{</div>
}}
 }
 
 \subsection{Arguments}{
@@ -104,7 +104,7 @@ Saves the parameter grid to the training cache
 \subsection{Method \code{getGridSearchPredictions()}}{
 Gets the grid search results from the training cache
 \subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{TrainingCache$getGridSearchPredictions()}\if{html}{\out{</div>}}
+\if{html}{\out{<div class="r">}}\preformatted{trainingCache$getGridSearchPredictions()}\if{html}{\out{</div>
}}
 }
 
 }
 
@@ -114,7 +114,7 @@ Gets the grid search results from the training cache
 \subsection{Method \code{isFull()}}{
 Check if cache is full
 \subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{TrainingCache$isFull()}\if{html}{\out{</div>}}
+\if{html}{\out{<div class="r">}}\preformatted{trainingCache$isFull()}\if{html}{\out{</div>
}}
 }
 
 }
 
@@ -124,7 +124,7 @@ Check if cache is full
 \subsection{Method \code{getLastGridSearchIndex()}}{
 Gets the last index from the cached grid search
 \subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{TrainingCache$getLastGridSearchIndex()}\if{html}{\out{</div>}}
+\if{html}{\out{<div class="r">}}\preformatted{trainingCache$getLastGridSearchIndex()}\if{html}{\out{</div>
}}
 }
 
 }
 
@@ -134,7 +134,7 @@ Gets the last index from the cached grid search
 \subsection{Method \code{dropCache()}}{
 Remove the training cache from the analysis path
 \subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{TrainingCache$dropCache()}\if{html}{\out{</div>}}
+\if{html}{\out{<div class="r">}}\preformatted{trainingCache$dropCache()}\if{html}{\out{</div>
}}
 }
 
 }
 
@@ -144,7 +144,7 @@ Remove the training cache from the analysis path
 \subsection{Method \code{clone()}}{
 The objects of this class are cloneable with this method.
 \subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{TrainingCache$clone(deep = FALSE)}\if{html}{\out{</div>}}
+\if{html}{\out{<div class="r">}}\preformatted{trainingCache$clone(deep = FALSE)}\if{html}{\out{</div>
}} } \subsection{Arguments}{ diff --git a/tests/testthat/setup.R b/tests/testthat/setup.R index 7cd0fee..3e95d1a 100644 --- a/tests/testthat/setup.R +++ b/tests/testthat/setup.R @@ -7,13 +7,13 @@ connectionDetails <- Eunomia::getEunomiaConnectionDetails() Eunomia::createCohorts(connectionDetails) covSet <- FeatureExtraction::createCovariateSettings( - useDemographicsGender = T, - useDemographicsAge = T, - useDemographicsRace = T, - useDemographicsEthnicity = T, - useDemographicsAgeGroup = T, - useConditionGroupEraLongTerm = T, - useDrugEraStartLongTerm = T, + useDemographicsGender = TRUE, + useDemographicsAge = TRUE, + useDemographicsRace = TRUE, + useDemographicsEthnicity = TRUE, + useDemographicsAgeGroup = TRUE, + useConditionGroupEraLongTerm = TRUE, + useDrugEraStartLongTerm = TRUE, endDays = -1 ) @@ -30,10 +30,11 @@ databaseDetails <- PatientLevelPrediction::createDatabaseDetails( cdmDatabaseName = "eunomia" ) -restrictPlpDataSettings <- PatientLevelPrediction::createRestrictPlpDataSettings( - firstExposureOnly = T, - washoutPeriod = 365 -) +restrictPlpDataSettings <- + PatientLevelPrediction::createRestrictPlpDataSettings( + firstExposureOnly = TRUE, + washoutPeriod = 365 + ) plpData <- PatientLevelPrediction::getPlpData( databaseDetails = databaseDetails, @@ -41,9 +42,24 @@ plpData <- PatientLevelPrediction::getPlpData( covariateSettings = covSet ) +# add age squared so I have more than one numerical feature +plpData$covariateData$covariateRef <- plpData$covariateData$covariateRef %>% + dplyr::rows_append(data.frame( + covariateId = 2002, + covariateName = "Squared age", + analysisId = 2, + conceptId = 0), copy = TRUE) + +squaredAges <- plpData$covariateData$covariates %>% + dplyr::filter(covariateId == 1002) %>% + dplyr::mutate(covariateId = 2002, + covariateValue = .data$covariateValue**2) + +plpData$covariateData$covariates <- plpData$covariateData$covariates %>% + dplyr::rows_append(squaredAges) populationSet <- PatientLevelPrediction::createStudyPopulationSettings( - requireTimeAtRisk = F, + requireTimeAtRisk = FALSE, riskWindowStart = 1, riskWindowEnd = 365 ) @@ -66,33 +82,32 @@ mappedData <- PatientLevelPrediction::MapIds( ) path <- system.file("python", package = "DeepPatientLevelPrediction") -Dataset <- reticulate::import_from_path("Dataset", path = path) +datasetClass <- reticulate::import_from_path("Dataset", path = path) if (is.null(attributes(mappedData)$path)) { # sqlite object attributes(mappedData)$path <- attributes(mappedData)$dbname } -dataset <- Dataset$Data( +dataset <- datasetClass$Data( data = reticulate::r_to_py(normalizePath(attributes(mappedData)$path)), labels = reticulate::r_to_py(trainData$Train$labels$outcomeCount), ) -small_dataset <- torch$utils$data$Subset(dataset, (1:round(length(dataset)/3))) +smallDataset <- torch$utils$data$Subset(dataset, + (1:round(length(dataset) / 3))) modelSettings <- setResNet( numLayers = 1, sizeHidden = 16, hiddenFactor = 1, residualDropout = c(0, 0.2), hiddenDropout = 0, sizeEmbedding = 16, hyperParamSearch = "random", randomSample = 2, - setEstimator(epochs=1, + setEstimator(epochs = 1, learningRate = 3e-4) ) -fitEstimatorPath <- file.path(testLoc, 'fitEstimator') +fitEstimatorPath <- file.path(testLoc, "fitEstimator") if (!dir.exists(fitEstimatorPath)) { dir.create(fitEstimatorPath) } -fitEstimatorResults <- fitEstimator(trainData$Train, - modelSettings = modelSettings, - analysisId = 1, - analysisPath = fitEstimatorPath) - - +fitEstimatorResults <- fitEstimator(trainData$Train, + modelSettings = modelSettings, + 
analysisId = 1,
+                                   analysisPath = fitEstimatorPath)
\ No newline at end of file
diff --git a/tests/testthat/test-Dataset.R b/tests/testthat/test-Dataset.R
index 940f9d1..fd3ce92 100644
--- a/tests/testthat/test-Dataset.R
+++ b/tests/testthat/test-Dataset.R
@@ -1,7 +1,9 @@
 test_that("number of num and cat features sum correctly", {
   testthat::expect_equal(
-    length(dataset$get_numerical_features()) + length(dataset$get_cat_features()),
-    dplyr::n_distinct(mappedData$covariates %>% dplyr::collect() %>%
+    length(dataset$get_numerical_features()) +
+      length(dataset$get_cat_features()),
+    dplyr::n_distinct(mappedData$covariates %>%
+                        dplyr::collect() %>%
                         dplyr::pull(covariateId))
   )
 })
@@ -12,40 +14,112 @@ test_that("length of dataset correct", {
   expect_equal(length(dataset), dataset$num$shape[0])
   expect_equal(
     length(dataset),
-    dplyr::n_distinct(mappedData$covariates %>%
-                        dplyr::collect() %>% dplyr::pull(rowId))
+    dplyr::n_distinct(mappedData$covariates %>%
+                        dplyr::collect() %>%
+                        dplyr::pull(.data$rowId))
   )
 })
 
 test_that(".getbatch works", {
-  batch_size <- 16
+  batchSize <- 16
   # get one sample
   out <- dataset[10]
-
-  # output should be a list of two items, the batch in pos 1 and targets in pos 2,
+
+  # output should be a list of two items,
+  # the batch in pos 1 and targets in pos 2,
   # the batch is what goes to the model
   expect_equal(length(out), 2)
-
+
   # targets should be binary
   expect_true(out[[2]]$item() %in% c(0, 1))
-
+
   # shape of batch is correct
   expect_equal(length(out[[1]]), 2)
   expect_equal(out[[1]]$cat$shape[0], 1)
   expect_equal(out[[1]]$num$shape[0], 1)
-
+
   # shape of target
   expect_equal(out[[2]]$shape$numel(), 1)
-
+
   # get a whole batch
-  out <- dataset[10:(10 + batch_size)]
-
+  out <- dataset[10:(10 + batchSize)]
+
   expect_equal(length(out), 2)
   expect_true(all(out[[2]]$numpy() %in% c(0, 1)))
-
+
   expect_equal(length(out[[1]]), 2)
   expect_equal(out[[1]]$cat$shape[0], 16)
   expect_equal(out[[1]]$num$shape[0], 16)
-
+
   expect_equal(out[[2]]$shape[0], 16)
 })
+
+test_that("Column order is preserved when features are missing", {
+  # important for transfer learning and external validation
+
+  reducedCovData <- Andromeda::copyAndromeda(trainData$Train$covariateData)
+
+  # remove one numerical and one categorical
+  numFeature <- 1002 # continuous age
+  catFeature <- 4285898210 # a random common cat feature
+  reducedCovData$covariates <- trainData$Train$covariateData$covariates %>%
+    dplyr::filter(!(covariateId %in% c(numFeature, catFeature)))
+
+  mappedReducedData <- PatientLevelPrediction::MapIds(
+    reducedCovData,
+    mapping = mappedData$mapping
+  )
+
+  catColumn <- mappedData$mapping %>%
+    dplyr::filter(covariateId == catFeature) %>%
+    dplyr::pull("columnId")
+  numColumn <- mappedData$mapping %>%
+    dplyr::filter(covariateId == numFeature) %>%
+    dplyr::pull("columnId")
+
+  reducedDataset <- datasetClass$Data(
+    data =
+      reticulate::r_to_py(normalizePath(attributes(mappedReducedData)$dbname)),
+    labels = reticulate::r_to_py(trainData$Train$labels$outcomeCount),
+    numerical_features = dataset$numerical_features$to_list()
+  )
+
+  # should have same number of columns
+  expect_equal(dataset$num$shape[[1]], reducedDataset$num$shape[[1]])
+
+  # all zeros in column with removed feature, -1 because r to py
+  expect_true(reducedDataset$num[, numColumn - 1]$sum()$item() == 0)
+
+  # all other columns are same
+  indexReduced <- 
!torch$isin(torch$arange(reducedDataset$num$shape[[1]]), + numColumn - 1) + index <- !torch$isin(torch$arange(dataset$num$shape[[1]]), + numColumn - 1) + + expect_equal(reducedDataset$num[, indexReduced]$mean()$item(), + dataset$num[, index]$mean()$item()) + + # cat data should have same counts of all columnIds + # expect the one that was removed + # not same counts for removed feature + expect_false(isTRUE(all.equal((reducedDataset$cat == catColumn)$sum()$item(), + (dataset$cat == catColumn)$sum()$item()))) + + # get counts + counts <- as.array(torch$unique(dataset$cat, + return_counts = TRUE)[[2]]$numpy()) + counts <- counts[-(catColumn + 1)] # +1 because py_to_r + counts <- counts[-1] + + reducedCounts <- as.array(torch$unique(reducedDataset$cat, + return_counts = TRUE)[[2]]$numpy()) + reducedCounts <- reducedCounts[-(catColumn + 1)] # +1 because py_to_r + reducedCounts <- reducedCounts[-1] + + expect_false(isTRUE(all.equal(counts, reducedCounts))) + expect_equal(dataset$get_cat_features()$max(), + reducedDataset$get_cat_features()$max()) + +}) diff --git a/tests/testthat/test-Estimator.R b/tests/testthat/test-Estimator.R index b4dd0a4..ba82835 100644 --- a/tests/testthat/test-Estimator.R +++ b/tests/testthat/test-Estimator.R @@ -1,5 +1,5 @@ -catFeatures <- small_dataset$dataset$get_cat_features()$shape[[1]] -numFeatures <- small_dataset$dataset$get_numerical_features()$shape[[1]] +catFeatures <- smallDataset$dataset$get_cat_features()$max() +numFeatures <- smallDataset$dataset$get_numerical_features()$max() modelParameters <- list( cat_features = catFeatures, @@ -10,18 +10,20 @@ modelParameters <- list( hidden_factor = 2 ) -estimatorSettings <- setEstimator(learningRate = 3e-4, - weightDecay = 0.0, - batchSize = 128, - epochs = 5, - device = 'cpu', - seed=42, - optimizer=torch$optim$AdamW, - criterion=torch$nn$BCEWithLogitsLoss, - metric='loss', - scheduler= list(fun=torch$optim$lr_scheduler$ReduceLROnPlateau, - params=list(patience=1)), - earlyStopping=NULL) +estimatorSettings <- + setEstimator(learningRate = 3e-4, + weightDecay = 0.0, + batchSize = 128, + epochs = 5, + device = "cpu", + seed = 42, + optimizer = torch$optim$AdamW, + criterion = torch$nn$BCEWithLogitsLoss, + metric = "loss", + scheduler = + list(fun = torch$optim$lr_scheduler$ReduceLROnPlateau, + params = list(patience = 1)), + earlyStopping = NULL) modelType <- "ResNet" estimator <- createEstimator(modelType = modelType, @@ -32,11 +34,12 @@ test_that("Estimator initialization works", { # count parameters in both instances path <- system.file("python", package = "DeepPatientLevelPrediction") - ResNet <- reticulate::import_from_path(modelType, path=path)[[modelType]] + resNet <- reticulate::import_from_path(modelType, path = path)[[modelType]] testthat::expect_equal( - sum(reticulate::iterate(estimator$model$parameters(), function(x) x$numel())), - sum(reticulate::iterate(do.call(ResNet, modelParameters)$parameters(), + sum(reticulate::iterate(estimator$model$parameters(), + function(x) x$numel())), + sum(reticulate::iterate(do.call(resNet, modelParameters)$parameters(), function(x) x$numel())) ) @@ -47,8 +50,8 @@ test_that("Estimator initialization works", { }) test_that("Estimator detects wrong inputs", { - - testthat::expect_error(setEstimator(learningRate='notAuto')) + + testthat::expect_error(setEstimator(learningRate = "notAuto")) testthat::expect_error(setEstimator(weightDecay = -1)) testthat::expect_error(setEstimator(weightDecay = "text")) testthat::expect_error(setEstimator(batchSize = 0)) @@ -61,7 +64,7 @@ 
test_that("Estimator detects wrong inputs", { }) sink(nullfile()) -estimator$fit(small_dataset, small_dataset) +estimator$fit(smallDataset, smallDataset) sink() test_that("estimator fitting works", { @@ -70,72 +73,74 @@ test_that("estimator fitting works", { expect_true(!is.null(estimator$best_score$loss)) expect_true(!is.null(estimator$best_score$auc)) - old_weights <- estimator$model$head$weight$mean()$item() + oldWeights <- estimator$model$head$weight$mean()$item() sink(nullfile()) - estimator$fit_whole_training_set(small_dataset, estimator$learn_rate_schedule) + estimator$fit_whole_training_set(smallDataset, estimator$learn_rate_schedule) sink() - expect_equal(estimator$optimizer$param_groups[[1]]$lr, tail(estimator$learn_rate_schedule, 1)[[1]]) + expect_equal(estimator$optimizer$param_groups[[1]]$lr, + tail(estimator$learn_rate_schedule, 1)[[1]]) - new_weights <- estimator$model$head$weight$mean()$item() + newWeights <- estimator$model$head$weight$mean()$item() # model should be updated when refitting - expect_true(old_weights != new_weights) + expect_true(oldWeights != newWeights) estimator$save(testLoc, "estimator.pt") expect_true(file.exists(file.path(testLoc, "estimator.pt"))) - + sink(nullfile()) preds <- estimator$predict_proba(dataset) sink() - + expect_lt(max(preds), 1) expect_gt(min(preds), 0) - + sink(nullfile()) - classes <- estimator$predict(small_dataset, threshold = 0.5) + classes <- estimator$predict(smallDataset, threshold = 0.5) sink() expect_equal(all(unique(classes) %in% c(0, 1)), TRUE) - + sink(nullfile()) - classes <- estimator$predict(small_dataset$dataset) + classes <- estimator$predict(smallDataset$dataset) sink() expect_equal(all(unique(classes) %in% c(0, 1)), TRUE) - + estimatorSettings <- setEstimator(learningRate = 3e-4, weightDecay = 0.0, batchSize = 128, epochs = 5, - device = 'cpu', - metric= "loss") - - estimator <- createEstimator(modelType=modelType, - modelParameters=modelParameters, - estimatorSettings=estimatorSettings) - + device = "cpu", + metric = "loss") + + estimator <- createEstimator(modelType = modelType, + modelParameters = modelParameters, + estimatorSettings = estimatorSettings) + sink(nullfile()) - estimator$fit(small_dataset, small_dataset) + estimator$fit(smallDataset, smallDataset) sink() - + expect_equal(estimator$metric$mode, "min") expect_equal(estimator$metric$name, "loss") - + sink(nullfile()) - estimator$fit_whole_training_set(small_dataset, estimator$learn_rate_schedule) + estimator$fit_whole_training_set(smallDataset, estimator$learn_rate_schedule) sink() - + expect_equal(estimator$learn_rate_schedule[[estimator$best_epoch]], estimator$optimizer$param_groups[[1]]$lr) - + }) test_that("early stopping works", { - EarlyStopping <- reticulate::import_from_path("Estimator", path=path)$EarlyStopping - earlyStop <- EarlyStopping(patience = 3, delta = 0, verbose = FALSE) - - + earlyStopping <- + reticulate::import_from_path("Estimator", path = path)$EarlyStopping + earlyStop <- earlyStopping(patience = 3, delta = 0, verbose = FALSE) + + testthat::expect_true(is.null(earlyStop$best_score)) testthat::expect_false(earlyStop$early_stop) earlyStop(0.5) @@ -153,15 +158,19 @@ test_that("Estimator fit function works", { expect_equal(attr(fitEstimatorResults, "modelType"), "binary") expect_equal(attr(fitEstimatorResults, "saveType"), "file") fakeTrainData <- trainData - fakeTrainData$train$covariateData <- list(fakeCovData <- c("Fake")) - expect_error(fitEstimator(fakeTrainData$train, modelSettings, analysisId = 1, analysisPath = testLoc)) + 
fakeTrainData$train$covariateData <- list(fakeCovData = c("Fake")) + expect_error(fitEstimator(fakeTrainData$train, + modelSettings, analysisId = 1, + analysisPath = testLoc)) }) test_that("predictDeepEstimator works", { # input is an estimator and a dataset sink(nullfile()) - predictions <- predictDeepEstimator(estimator, small_dataset, cohort = trainData$Train$labels) + predictions <- predictDeepEstimator(estimator, + smallDataset, + cohort = trainData$Train$labels) sink() expect_lt(max(predictions$value), 1) @@ -181,22 +190,23 @@ test_that("predictDeepEstimator works", { }) test_that("batchToDevice works", { - batch_to_device <- reticulate::import_from_path("Estimator", path=path)$batch_to_device + batchToDevice <- reticulate::import_from_path("Estimator", + path = path)$batch_to_device # since we can't test moving to gpu there is a fake device called meta # which we can use to test of the device is updated estimator$device <- "meta" b <- 1:10 - batch <- batch_to_device(dataset[b], device=estimator$device) - + batch <- batchToDevice(dataset[b], device = estimator$device) + devices <- lapply( lapply(unlist(batch, recursive = TRUE), function(x) x$device), function(x) x == torch$device(type = "meta") ) # test that all are meta expect_true(all(devices == TRUE)) - - numDevice <- batch_to_device(dataset[b][[1]]$num, device=estimator$device) - expect_true(numDevice$device==torch$device(type="meta")) + + numDevice <- batchToDevice(dataset[b][[1]]$num, device = estimator$device) + expect_true(numDevice$device == torch$device(type = "meta")) }) test_that("Estimator without earlyStopping works", { @@ -205,19 +215,19 @@ test_that("Estimator without earlyStopping works", { weightDecay = 0.0, batchSize = 128, epochs = 1, - device = 'cpu', + device = "cpu", earlyStopping = NULL) - + estimator2 <- createEstimator(modelType = modelType, modelParameters = modelParameters, - estimatorSettings=estimatorSettings) + estimatorSettings = estimatorSettings) sink(nullfile()) - estimator2$fit(small_dataset, small_dataset) + estimator2$fit(smallDataset, smallDataset) sink() - + expect_null(estimator2$early_stopper) expect_true(!is.null(estimator2$best_epoch)) - + }) test_that("Early stopper can use loss and stops early", { @@ -225,78 +235,89 @@ test_that("Early stopper can use loss and stops early", { weightDecay = 0.0, batchSize = 128, epochs = 10, - device = 'cpu', - earlyStopping =list(useEarlyStopping=TRUE, - params = list(mode=c('min'), - patience=1)), - metric = 'loss', - seed=42) - + device = "cpu", + earlyStopping = + list(useEarlyStopping = TRUE, + params = list(mode = c("min"), + patience = 1)), + metric = "loss", + seed = 42) + estimator <- createEstimator(modelType = modelType, modelParameters = modelParameters, estimatorSettings = estimatorSettings) sink(nullfile()) - estimator$fit(small_dataset, small_dataset) + estimator$fit(smallDataset, smallDataset) sink() - + expect_true(estimator$best_epoch < estimator$epochs) - + }) -test_that('Custom metric in estimator works', { - - metric_fun <- function(predictions, labels) { - pr <- PRROC::pr.curve(scores.class0 = torch$sigmoid(predictions)$numpy(), +test_that("Custom metric in estimator works", { + + metricFun <- function(predictions, labels) { + pr <- PRROC::pr.curve(scores.class0 = torch$sigmoid(predictions)$numpy(), weights.class0 = labels$numpy()) auprc <- pr$auc.integral reticulate::r_to_py(auprc) } - + estimatorSettings <- setEstimator(learningRate = 3e-4, weightDecay = 0.0, batchSize = 128, device = "cpu", epochs = 1, - metric=list(fun=metric_fun, 
- name="auprc", - mode="max")) + metric = list(fun = metricFun, + name = "auprc", + mode = "max")) estimator <- createEstimator(modelType = modelType, modelParameters = modelParameters, estimatorSettings = estimatorSettings) expect_true(is.function(estimator$metric$fun)) expect_true(is.character(estimator$metric$name)) expect_true(estimator$metric$mode %in% c("min", "max")) - + sink(nullfile()) - estimator$fit(small_dataset, small_dataset) + estimator$fit(smallDataset, smallDataset) sink() - - expect_true(estimator$best_score[["auprc"]]>0) + + expect_true(estimator$best_score[["auprc"]] > 0) }) -test_that("setEstimator with paramsToTune is correctly added to hyperparameters", { - estimatorSettings <- setEstimator(learningRate=c(3e-4,1e-3), - batchSize=128, - epochs=1, - device="cpu", - metric=c("auc", "auprc"), - earlyStopping = list(useEarlyStopping=TRUE, - params=list(patience=c(4,10)))) +test_that("setEstimator with hyperparameters", { + estimatorSettings <- + setEstimator(learningRate = c(3e-4, 1e-3), + batchSize = 128, + epochs = 1, + device = "cpu", + metric = c("auc", "auprc"), + earlyStopping = list(useEarlyStopping = TRUE, + params = list(patience = c(4, 10)))) modelSettings <- setResNet(numLayers = 1, sizeHidden = 64, hiddenFactor = 1, residualDropout = 0.2, - hiddenDropout = 0.2,sizeEmbedding = 128, + hiddenDropout = 0.2, sizeEmbedding = 128, estimatorSettings = estimatorSettings, hyperParamSearch = "grid") - + expect_true(length(modelSettings$param) == 8) - expect_true(length(unique(lapply(modelSettings$param, function(x) x$estimator.metric)))==2) - expect_true(length(unique(lapply(modelSettings$param, function(x) x$estimator.learningRate)))==2) - expect_true(length(unique(lapply(modelSettings$param, function(x) x$estimator.earlyStopping.patience)))==2) - fitParams <- names(modelSettings$param[[1]])[grepl("^estimator", names(modelSettings$param[[1]]))] - - estimatorSettings2 <- fillEstimatorSettings(estimatorSettings, fitParams, paramSearch=modelSettings$param[[8]]) - + expect_true(length(unique(lapply(modelSettings$param, + function(x) x$estimator.metric))) == 2) + expect_true(length(unique(lapply(modelSettings$param, + function(x) x$estimator.learningRate))) == 2) + expect_true(length(unique(lapply(modelSettings$param, function(x) { + x$estimator.earlyStopping.patience + }))) == 2) + + fitParams <- + names(modelSettings$param[[1]])[grepl("^estimator", + names(modelSettings$param[[1]]))] + + estimatorSettings2 <- + fillEstimatorSettings(estimatorSettings, fitParams, + paramSearch = modelSettings$param[[8]]) + expect_equal(estimatorSettings2$learningRate, 1e-3) expect_equal(as.character(estimatorSettings2$metric), "auprc") expect_equal(estimatorSettings2$earlyStopping$params$patience, 10) @@ -304,56 +325,69 @@ test_that("setEstimator with paramsToTune is correctly added to hyperparameters" test_that("device as a function argument works", { getDevice <- function() { - dev <- Sys.getenv("testDeepPLPDevice") - if (dev == ""){ - dev = "cpu" - } else{ + dev <- Sys.getenv("testDeepPLPDevice") + if (dev == "") { + dev <- "cpu" + } else { dev } } - estimatorSettings <- setEstimator(device=getDevice, + estimatorSettings <- setEstimator(device = getDevice, learningRate = 3e-4) - - model <- setDefaultResNet(estimatorSettings = estimatorSettings) + + model <- setDefaultResNet(estimatorSettings = estimatorSettings) model$param[[1]]$catFeatures <- 10 estimator <- createEstimator(modelType = modelType, modelParameters = model$param[[1]], estimatorSettings = estimatorSettings) - + 
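+  # the device function is evaluated when the estimator is constructed,
+  # so the environment variable set below changes the resolved device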
expect_equal(estimator$device, "cpu") - + Sys.setenv("testDeepPLPDevice" = "meta") - - estimatorSettings <- setEstimator(device=getDevice, + + estimatorSettings <- setEstimator(device = getDevice, learningRate = 3e-4) - - model <- setDefaultResNet(estimatorSettings = estimatorSettings) + + model <- setDefaultResNet(estimatorSettings = estimatorSettings) model$param[[1]]$catFeatures <- 10 - + estimator <- createEstimator(modelType = modelType, modelParameters = model$param[[1]], estimatorSettings = estimatorSettings) - + expect_equal(estimator$device, "meta") - + Sys.unsetenv("testDeepPLPDevice") - - }) -test_that("estimatorSettings can be saved and loaded with correct python objects", { +}) + +test_that("estimatorSettings can be saved and loaded with python objects", { settings <- setEstimator() - saveRDS(settings,file=file.path(testLoc, 'settings.RDS')) - - loadedSettings <- readRDS(file = file.path(testLoc, 'settings.RDS')) - + saveRDS(settings, file = file.path(testLoc, "settings.RDS")) + + loadedSettings <- readRDS(file = file.path(testLoc, "settings.RDS")) optimizer <- loadedSettings$optimizer() scheduler <- loadedSettings$scheduler() criterion <- loadedSettings$criterion() - + testthat::expect_false(reticulate::py_is_null_xptr(optimizer)) testthat::expect_false(reticulate::py_is_null_xptr(scheduler$fun)) testthat::expect_false(reticulate::py_is_null_xptr(criterion)) }) + +test_that("evaluation works on predictDeepEstimator output", { + + prediction <- predictDeepEstimator(plpModel = fitEstimatorResults, + data = trainData$Test, + cohort = trainData$Test$labels) + prediction$evaluationType <- 'Validation' + + evaluation <- evaluatePlp(prediction, "evaluationType") + + expect_length(evaluation, 5) + expect_s3_class(evaluation, "plpEvaluation") + + }) \ No newline at end of file diff --git a/tests/testthat/test-LRFinder.R b/tests/testthat/test-LRFinder.R index 0a5d700..a42416b 100644 --- a/tests/testthat/test-LRFinder.R +++ b/tests/testthat/test-LRFinder.R @@ -1,77 +1,84 @@ -ResNet <- reticulate::import_from_path("ResNet", path)$ResNet -lrFinderClass <- reticulate::import_from_path("LrFinder", path=path)$LrFinder +resNet <- reticulate::import_from_path("ResNet", path)$ResNet test_that("LR scheduler that changes per batch works", { - - model <- ResNet(cat_features = 10L, num_features = 1L, + + model <- resNet(cat_features = 10L, num_features = 1L, size_embedding = 32L, size_hidden = 64L, num_layers = 1L, hidden_factor = 1L) - optimizer <- torch$optim$AdamW(model$parameters(), lr=1e-7) - - ExponentialSchedulerPerBatch <- reticulate::import_from_path("LrFinder", - path=path)$ExponentialSchedulerPerBatch - scheduler <- ExponentialSchedulerPerBatch(optimizer, - end_lr = 1e-2, - num_iter = 5) + optimizer <- torch$optim$AdamW(model$parameters(), lr = 1e-7) + + + exponentialSchedulerPerBatch <- + reticulate::import_from_path("LrFinder", + path = path)$ExponentialSchedulerPerBatch + scheduler <- exponentialSchedulerPerBatch(optimizer, + end_lr = 1e-2, + num_iter = 5) expect_equal(scheduler$last_epoch, 0) expect_equal(scheduler$optimizer$param_groups[[1]]$lr, 1e-7) - + for (i in 1:5) { optimizer$step() scheduler$step() } - + expect_equal(scheduler$last_epoch, 5) - expect_equal(scheduler$optimizer$param_groups[[1]]$lr, (1e-7 * (0.01 / 1e-7) ^ (5 / 4))) - + expect_equal(scheduler$optimizer$param_groups[[1]]$lr, + (1e-7 * (0.01 / 1e-7) ^ (5 / 4))) + }) test_that("LR finder works", { - lrFinder <- createLRFinder(modelType="ResNet", - modelParameters = 
list(cat_features=length(dataset$get_cat_features()),
-                                                    num_features=length(dataset$get_numerical_features()),
-                                                    size_embedding=32L,
-                                                    size_hidden=64L,
-                                                    num_layers=1L,
-                                                    hidden_factor=1L),
-                             estimatorSettings = setEstimator(batchSize = 32L,
-                                                              seed=42),
-                             lrSettings = list(minLr=3e-4,
-                                               maxLr=10.0,
-                                               numLr=20L,
-                                               divergenceThreshold=1.1))
-  
+  lrFinder <-
+    createLRFinder(modelType = "ResNet",
+                   modelParameters =
+                     list(cat_features =
+                            dataset$get_cat_features()$max(),
+                          num_features =
+                            dataset$get_numerical_features()$max(),
+                          size_embedding = 32L,
+                          size_hidden = 64L,
+                          num_layers = 1L,
+                          hidden_factor = 1L),
+                   estimatorSettings = setEstimator(batchSize = 32L,
+                                                    seed = 42),
+                   lrSettings = list(minLr = 3e-4,
+                                     maxLr = 10.0,
+                                     numLr = 20L,
+                                     divergenceThreshold = 1.1))
+
   lr <- lrFinder$get_lr(dataset)
-  
-  expect_true(lr<=10.0)
-  expect_true(lr>=3e-4)
+
+  expect_true(lr <= 10.0)
+  expect_true(lr >= 3e-4)
 })
 
 test_that("LR finder works with device specified by a function", {
-  
-  deviceFun <- function(){
-    dev = "cpu"
+
+  deviceFun <- function() {
+    dev <- "cpu"
+    dev
   }
-  lrFinder <- createLRFinder(model = "ResNet",
-                             modelParameters = list(cat_features=length(dataset$get_cat_features()),
-                                                    num_features=length(dataset$get_numerical_features()),
-                                                    size_embedding=8L,
-                                                    size_hidden=16L,
-                                                    num_layers=1L,
-                                                    hidden_factor=1L),
-                             estimatorSettings = setEstimator(batchSize=32L,
-                                                              seed = 42,
-                                                              device = deviceFun),
-                             lrSettings = list(minLr=3e-4,
-                                               maxLr=10.0,
-                                               numLr=20L,
-                                               divergenceThreshold=1.1)
-  )
+  lrFinder <- createLRFinder(
+    modelType = "ResNet",
+    modelParameters =
+      list(cat_features = dataset$get_cat_features()$max(),
+           num_features = dataset$get_numerical_features()$max(),
+           size_embedding = 8L,
+           size_hidden = 16L,
+           num_layers = 1L,
+           hidden_factor = 1L),
+    estimatorSettings = setEstimator(batchSize = 32L,
+                                     seed = 42,
+                                     device = deviceFun),
+    lrSettings = list(minLr = 3e-4,
+                      maxLr = 10.0,
+                      numLr = 20L,
+                      divergenceThreshold = 1.1)
+  )
   lr <- lrFinder$get_lr(dataset)
-  
-  expect_true(lr<=10.0)
-  expect_true(lr>=3e-4)
-  
-  
-})
\ No newline at end of file
+
+  expect_true(lr <= 10.0)
+  expect_true(lr >= 3e-4)
+})
diff --git a/tests/testthat/test-MLP.R b/tests/testthat/test-MLP.R
index 5cbd7a5..a470664 100644
--- a/tests/testthat/test-MLP.R
+++ b/tests/testthat/test-MLP.R
@@ -5,11 +5,11 @@ modelSettings <- setMultiLayerPerceptron(
   dropout = c(0.1),
   sizeEmbedding = 32,
   estimatorSettings = setEstimator(
-    learningRate=c(3e-4),
+    learningRate = c(3e-4),
     weightDecay = c(1e-6),
-    seed=42,
-    batchSize=128,
-    epochs=1
+    seed = 42,
+    batchSize = 128,
+    epochs = 1
   ),
   hyperParamSearch = "random",
   randomSample = 1
@@ -21,12 +21,13 @@ test_that("setMultiLayerPerceptron works", {
 
   testthat::expect_equal(modelSettings$fitFunction, "fitEstimator")
   testthat::expect_true(length(modelSettings$param) > 0)
-  
-  expect_error(setMultiLayerPerceptron(numLayers=1,
+
+  expect_error(setMultiLayerPerceptron(numLayers = 1,
                                        sizeHidden = 128,
-                                       dropout= 0.2,
+                                       dropout = 0.2,
                                        sizeEmbedding = 128,
-                                       estimatorSettings = setEstimator(learningRate=3e-4),
+                                       estimatorSettings =
+                                         setEstimator(learningRate = 3e-4),
                                        randomSample = 2))
 })
 
@@ -41,16 +42,17 @@ results <- tryCatch(
       analysisName = "Testing Deep Learning",
       populationSettings = populationSet,
       splitSettings = PatientLevelPrediction::createDefaultSplitSetting(),
-      sampleSettings = PatientLevelPrediction::createSampleSettings(), # none
-      featureEngineeringSettings = PatientLevelPrediction::createFeatureEngineeringSettings(), # none
+      sampleSettings = 
PatientLevelPrediction::createSampleSettings(), + featureEngineeringSettings = + PatientLevelPrediction::createFeatureEngineeringSettings(), preprocessSettings = PatientLevelPrediction::createPreprocessSettings(), executeSettings = PatientLevelPrediction::createExecuteSettings( - runSplitData = T, - runSampleData = F, - runfeatureEngineering = F, - runPreprocessData = T, - runModelDevelopment = T, - runCovariateSummary = F + runSplitData = TRUE, + runSampleData = FALSE, + runfeatureEngineering = FALSE, + runPreprocessData = TRUE, + runModelDevelopment = TRUE, + runCovariateSummary = FALSE ), saveDirectory = file.path(testLoc, "MLP") ) @@ -73,7 +75,9 @@ test_that("MLP with runPlp working checks", { # check prediction same size as pop testthat::expect_equal(nrow(results$prediction %>% - dplyr::filter(evaluationType %in% c("Train", "Test"))), nrow(population)) + dplyr::filter(evaluationType %in% c("Train", + "Test"))), + nrow(population)) # check prediction between 0 and 1 testthat::expect_gte(min(results$prediction$value), 0) @@ -82,15 +86,15 @@ test_that("MLP with runPlp working checks", { test_that("MLP nn-module works ", { - MLP <- reticulate::import_from_path("MLP", path=path)$MLP - model <- MLP( - cat_features = 5, - num_features = 1, + mlp <- reticulate::import_from_path("MLP", path = path)$MLP + model <- mlp( + cat_features = 5, + num_features = 1, size_embedding = 5, - size_hidden = 16, + size_hidden = 16, num_layers = 1, activation = torch$nn$ReLU, - normalization = torch$nn$BatchNorm1d, + normalization = torch$nn$BatchNorm1d, dropout = 0.3 ) @@ -100,8 +104,8 @@ test_that("MLP nn-module works ", { expect_equal(pars, 489) input <- list() - input$cat <- torch$randint(0L, 5L, c(10L, 5L), dtype=torch$long) - input$num <- torch$randn(10L, 1L, dtype=torch$float32) + input$cat <- torch$randint(0L, 5L, c(10L, 5L), dtype = torch$long) + input$num <- torch$randn(10L, 1L, dtype = torch$float32) output <- model(input) @@ -110,14 +114,14 @@ test_that("MLP nn-module works ", { expect_equal(output$shape[0], 10L) input$num <- NULL - model <- MLP( - cat_features = 5L, - num_features = 0, + model <- mlp( + cat_features = 5L, + num_features = 0, size_embedding = 5L, - size_hidden = 16L, + size_hidden = 16L, num_layers = 1L, activation = torch$nn$ReLU, - normalization = torch$nn$BatchNorm1d, + normalization = torch$nn$BatchNorm1d, dropout = 0.3, d_out = 1L ) @@ -129,49 +133,48 @@ test_that("MLP nn-module works ", { test_that("Errors are produced by settings function", { randomSample <- 2 - + expect_error(setMultiLayerPerceptron( - numLayers = 1, - sizeHidden = 128, - dropout = 0.0, - sizeEmbedding = 128, - hyperParamSearch = 'random', - estimatorSettings = setEstimator( - learningRate = 'auto', - weightDecay = c(1e-3), - batchSize = 1024, - epochs = 30, - device="cpu"))) - + numLayers = 1, + sizeHidden = 128, + dropout = 0.0, + sizeEmbedding = 128, + hyperParamSearch = "random", + estimatorSettings = + setEstimator( + learningRate = "auto", + weightDecay = c(1e-3), + batchSize = 1024, + epochs = 30, + device = "cpu"))) }) test_that("Can upload results to database", { - cohortDefinitions = data.frame( - cohortName = c('blank1'), - cohortId = c(1), - json = c('json') + cohortDefinitions <- data.frame( + cohortName = c("blank1"), + cohortId = c(1), + json = c("json") ) - + sink(nullfile()) - sqliteFile <- insertResultsToSqlite(resultLocation = file.path(testLoc, "MLP"), - cohortDefinitions = cohortDefinitions) + sqliteFile <- + insertResultsToSqlite(resultLocation = file.path(testLoc, "MLP"), + 
cohortDefinitions = cohortDefinitions) sink() - + testthat::expect_true(file.exists(sqliteFile)) - - cdmDatabaseSchema <- 'main' - ohdsiDatabaseSchema <- 'main' + + cdmDatabaseSchema <- "main" + ohdsiDatabaseSchema <- "main" connectionDetails <- DatabaseConnector::createConnectionDetails( - dbms = 'sqlite', + dbms = "sqlite", server = sqliteFile ) conn <- DatabaseConnector::connect(connectionDetails = connectionDetails) - targetDialect <- 'sqlite' - + targetDialect <- "sqlite" + # check the results table is populated - sql <- 'select count(*) as N from main.performances;' + sql <- "select count(*) as N from main.performances;" res <- DatabaseConnector::querySql(conn, sql) - testthat::expect_true(res$N[1]>0) - - -}) \ No newline at end of file + testthat::expect_true(res$N[1] > 0) +}) diff --git a/tests/testthat/test-ResNet.R b/tests/testthat/test-ResNet.R index 29dfb8e..76e3d9f 100644 --- a/tests/testthat/test-ResNet.R +++ b/tests/testthat/test-ResNet.R @@ -6,33 +6,34 @@ resSet <- setResNet( residualDropout = 0.1, hiddenDropout = 0.1, sizeEmbedding = 32, - estimatorSettings = setEstimator(learningRate="auto", + estimatorSettings = setEstimator(learningRate = "auto", weightDecay = c(1e-6), - seed=42, + seed = 42, batchSize = 128, - epochs=1), + epochs = 1), hyperParamSearch = "random", randomSample = 1, ) test_that("setResNet works", { testthat::expect_s3_class(object = resSet, class = "modelSettings") - + testthat::expect_equal(resSet$fitFunction, "fitEstimator") - + testthat::expect_true(length(resSet$param) > 0) - + expect_error(setResNet(numLayers = 2, sizeHidden = 32, hiddenFactor = 2, residualDropout = 0.1, hiddenDropout = 0.1, sizeEmbedding = 32, - estimatorSettings = setEstimator(learningRate=c(3e-4), - weightDecay = c(1e-6), - seed=42, - batchSize = 128, - epochs=1), + estimatorSettings = + setEstimator(learningRate = c(3e-4), + weightDecay = c(1e-6), + seed = 42, + batchSize = 128, + epochs = 1), hyperParamSearch = "random", randomSample = 2)) }) @@ -48,16 +49,17 @@ res2 <- tryCatch( analysisName = "Testing Deep Learning", populationSettings = populationSet, splitSettings = PatientLevelPrediction::createDefaultSplitSetting(), - sampleSettings = PatientLevelPrediction::createSampleSettings(), # none - featureEngineeringSettings = PatientLevelPrediction::createFeatureEngineeringSettings(), # none + sampleSettings = PatientLevelPrediction::createSampleSettings(), + featureEngineeringSettings = + PatientLevelPrediction::createFeatureEngineeringSettings(), preprocessSettings = PatientLevelPrediction::createPreprocessSettings(), executeSettings = PatientLevelPrediction::createExecuteSettings( - runSplitData = T, - runSampleData = F, - runfeatureEngineering = F, - runPreprocessData = T, - runModelDevelopment = T, - runCovariateSummary = F + runSplitData = TRUE, + runSampleData = FALSE, + runfeatureEngineering = FALSE, + runPreprocessData = TRUE, + runModelDevelopment = TRUE, + runCovariateSummary = FALSE ), saveDirectory = file.path(testLoc, "ResNet") ) @@ -71,17 +73,19 @@ sink() test_that("ResNet with runPlp working checks", { testthat::expect_false(is.null(res2)) - + # check structure testthat::expect_true("prediction" %in% names(res2)) testthat::expect_true("model" %in% names(res2)) testthat::expect_true("covariateSummary" %in% names(res2)) testthat::expect_true("performanceEvaluation" %in% names(res2)) - + # check prediction same size as pop testthat::expect_equal(nrow(res2$prediction %>% - dplyr::filter(evaluationType %in% c("Train", "Test"))), nrow(population)) - + 
dplyr::filter(evaluationType %in% c("Train", + "Test"))), + nrow(population)) + # check prediction between 0 and 1 testthat::expect_gte(min(res2$prediction$value), 0) testthat::expect_lte(max(res2$prediction$value), 1) @@ -89,45 +93,45 @@ test_that("ResNet with runPlp working checks", { test_that("ResNet nn-module works ", { - ResNet <- reticulate::import_from_path("ResNet", path=path)$ResNet - model <- ResNet( - cat_features = 5, - num_features = 1, + resNet <- reticulate::import_from_path("ResNet", path = path)$ResNet + model <- resNet( + cat_features = 5, + num_features = 1, size_embedding = 5, - size_hidden = 16, - num_layers = 1, + size_hidden = 16, + num_layers = 1, hidden_factor = 2, activation = torch$nn$ReLU, - normalization = torch$nn$BatchNorm1d, + normalization = torch$nn$BatchNorm1d, hidden_dropout = 0.3, residual_dropout = 0.3 ) - + pars <- sum(reticulate::iterate(model$parameters(), function(x) x$numel())) - + # expected number of parameters expect_equal(pars, 1295) - + input <- list() input$cat <- torch$randint(0L, 5L, c(10L, 5L), dtype = torch$long) input$num <- torch$randn(10L, 1L, dtype = torch$float32) - - + + output <- model(input) - + # output is correct shape expect_equal(output$shape[0], 10L) - + input$num <- NULL - model <- ResNet( - cat_features = 5, - num_features = 0, + model <- resNet( + cat_features = 5, + num_features = 0, size_embedding = 5, - size_hidden = 16, - num_layers = 1, + size_hidden = 16, + num_layers = 1, hidden_factor = 2, activation = torch$nn$ReLU, - normalization = torch$nn$BatchNorm1d, + normalization = torch$nn$BatchNorm1d, hidden_dropout = 0.3, residual_dropout = 0.3 ) @@ -139,60 +143,59 @@ test_that("ResNet nn-module works ", { test_that("Default Resnet works", { defaultResNet <- setDefaultResNet() params <- defaultResNet$param[[1]] - + expect_equal(params$numLayers, 6) expect_equal(params$sizeHidden, 512) expect_equal(params$hiddenFactor, 2) expect_equal(params$residualDropout, 0.1) expect_equal(params$hiddenDropout, 0.4) expect_equal(params$sizeEmbedding, 256) - -}) + +}) test_that("Errors are produced by settings function", { randomSample <- 2 - - expect_error(setResNet( - numLayers = 1, - sizeHidden = 128, - hiddenFactor = 1, - residualDropout = 0.0, - hiddenDropout = 0.0, - sizeEmbedding = 128, - estimatorSettings = setEstimator(weightDecay = 1e-6, - learningRate = 0.01, - seed = 42), - hyperParamSearch = 'random', - randomSample = randomSample)) + + expect_error(setResNet(numLayers = 1, + sizeHidden = 128, + hiddenFactor = 1, + residualDropout = 0.0, + hiddenDropout = 0.0, + sizeEmbedding = 128, + estimatorSettings = setEstimator(weightDecay = 1e-6, + learningRate = 0.01, + seed = 42), + hyperParamSearch = "random", + randomSample = randomSample)) }) -test_that("Can upload results to database", { - cohortDefinitions = data.frame( - cohortName = c('blank1'), - cohortId = c(1), - json = c('json') +test_that("Can upload results to database", { + cohortDefinitions <- data.frame( + cohortName = c("blank1"), + cohortId = c(1), + json = c("json") ) - + sink(nullfile()) - sqliteFile <- insertResultsToSqlite(resultLocation = file.path(testLoc, "ResNet"), - cohortDefinitions = cohortDefinitions) + sqliteFile <- + insertResultsToSqlite(resultLocation = file.path(testLoc, "ResNet"), + cohortDefinitions = cohortDefinitions) sink() - + testthat::expect_true(file.exists(sqliteFile)) - - cdmDatabaseSchema <- 'main' - ohdsiDatabaseSchema <- 'main' + + cdmDatabaseSchema <- "main" + ohdsiDatabaseSchema <- "main" connectionDetails <- 
DatabaseConnector::createConnectionDetails( - dbms = 'sqlite', + dbms = "sqlite", server = sqliteFile ) conn <- DatabaseConnector::connect(connectionDetails = connectionDetails) - targetDialect <- 'sqlite' - + targetDialect <- "sqlite" + # check the results table is populated - sql <- 'select count(*) as N from main.performances;' + sql <- "select count(*) as N from main.performances;" res <- DatabaseConnector::querySql(conn, sql) - testthat::expect_true(res$N[1]>0) + testthat::expect_true(res$N[1] > 0) }) - diff --git a/tests/testthat/test-TrainingCache.R b/tests/testthat/test-TrainingCache.R index debe95c..ec5b063 100644 --- a/tests/testthat/test-TrainingCache.R +++ b/tests/testthat/test-TrainingCache.R @@ -4,75 +4,83 @@ resNetSettings <- setResNet(numLayers = c(1, 2, 4), residualDropout = 0.5, hiddenDropout = 0.5, sizeEmbedding = 64, - estimatorSettings = setEstimator(learningRate=3e-4, - weightDecay=1e-3, - device='cpu', - batchSize=64, - epochs=1, - seed=NULL), + estimatorSettings = + setEstimator(learningRate = 3e-4, + weightDecay = 1e-3, + device = "cpu", + batchSize = 64, + epochs = 1, + seed = NULL), hyperParamSearch = "random", randomSample = 3, randomSampleSeed = NULL) -trainCache <- TrainingCache$new(testLoc) +trainCache <- trainingCache$new(testLoc) paramSearch <- resNetSettings$param test_that("Training cache exists on disk", { testthat::expect_true( - file.exists(file.path(testLoc, "paramPersistence.rds"))) + file.exists(file.path(testLoc, "paramPersistence.rds"))) }) test_that("Grid search can be cached", { gridSearchPredictons <- list() length(gridSearchPredictons) <- length(paramSearch) trainCache$saveGridSearchPredictions(gridSearchPredictons) - + index <- 1 - + gridSearchPredictons[[index]] <- list( prediction = list(NULL), param = paramSearch[[index]] ) trainCache$saveGridSearchPredictions(gridSearchPredictons) - testthat::expect_identical(trainCache$getGridSearchPredictions(), gridSearchPredictons) - testthat::expect_equal(trainCache$getLastGridSearchIndex(), index+1) + testthat::expect_identical(trainCache$getGridSearchPredictions(), + gridSearchPredictons) + testthat::expect_equal(trainCache$getLastGridSearchIndex(), index + 1) }) test_that("Param grid predictions can be cached", { testthat::expect_false(trainCache$isParamGridIdentical(paramSearch)) - + trainCache$saveModelParams(paramSearch) testthat::expect_true(trainCache$isParamGridIdentical(paramSearch)) }) test_that("Estimator can resume training from cache", { trainCache <- readRDS(file.path(fitEstimatorPath, "paramPersistence.rds")) - newPath <- file.path(testLoc, 'resume') + newPath <- file.path(testLoc, "resume") dir.create(newPath) - + # remove last row trainCache$gridSearchPredictions[[2]] <- NULL length(trainCache$gridSearchPredictions) <- 2 - + # save new cache - saveRDS(trainCache, file=file.path(newPath, "paramPersistence.rds")) - + saveRDS(trainCache, file = file.path(newPath, "paramPersistence.rds")) + sink(nullfile()) - fitEstimatorResults <- fitEstimator(trainData$Train, - modelSettings = modelSettings, - analysisId = 1, + fitEstimatorResults <- fitEstimator(trainData$Train, + modelSettings = modelSettings, + analysisId = 1, analysisPath = newPath) sink() - newCache <- readRDS(file.path(newPath, "paramPersistence.rds")) - testthat::expect_equal(nrow(newCache$gridSearchPredictions[[2]]$gridPerformance$hyperSummary), 4) + cS <- nrow(newCache$gridSearchPredictions[[2]]$gridPerformance$hyperSummary) + testthat::expect_equal(cS, 4) }) test_that("Prediction is cached for optimal parameters", { 
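+  # only the best performing hyperparameter combination should keep its full
+  # prediction data.frame in the cache; the others are stored as NULL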
  testCache <- readRDS(file.path(fitEstimatorPath, "paramPersistence.rds"))
-  indexOfMax <- which.max(unlist(lapply(testCache$gridSearchPredictions, function(x) x$gridPerformance$cvPerformance)))
-  indexOfMin <- which.min(unlist(lapply(testCache$gridSearchPredictions, function(x) x$gridPerformance$cvPerformance)))
-  testthat::expect_equal(class(testCache$gridSearchPredictions[[indexOfMax]]$prediction), class(data.frame()))
-  testthat::expect_null(testCache$gridSearchPredictions[[indexOfMin]]$prediction[[1]])
+  indexOfMax <-
+    which.max(unlist(lapply(testCache$gridSearchPredictions,
+                            function(x) x$gridPerformance$cvPerformance)))
+  indexOfMin <-
+    which.min(unlist(lapply(testCache$gridSearchPredictions,
+                            function(x) x$gridPerformance$cvPerformance)))
+  bestPrediction <- testCache$gridSearchPredictions[[indexOfMax]]$prediction
+  testthat::expect_equal(class(bestPrediction), class(data.frame()))
+  worstPrediction <-
+    testCache$gridSearchPredictions[[indexOfMin]]$prediction[[1]]
+  testthat::expect_null(worstPrediction)
 })
diff --git a/tests/testthat/test-Transformer.R b/tests/testthat/test-Transformer.R
index 043cbe7..62fdf12 100644
--- a/tests/testthat/test-Transformer.R
+++ b/tests/testthat/test-Transformer.R
@@ -1,15 +1,15 @@
 settings <- setTransformer(
-  numBlocks = 1, 
-  dimToken = 8, 
+  numBlocks = 1,
+  dimToken = 8,
   dimOut = 1,
-  numHeads = 2, 
-  attDropout = 0.0, 
+  numHeads = 2,
+  attDropout = 0.0,
   ffnDropout = 0.2,
-  resDropout = 0.0, 
-  dimHidden = 32, 
+  resDropout = 0.0,
+  dimHidden = 32,
   estimatorSettings = setEstimator(learningRate = 3e-4,
-                                   batchSize=64,
-                                   epochs=1),
+                                   batchSize = 64,
+                                   epochs = 1),
   randomSample = 1
 )
 
@@ -32,7 +32,11 @@ test_that("Transformer settings work", {
 })
 
 test_that("fitEstimator with Transformer works", {
-  results <- fitEstimator(trainData$Train, settings, analysisId = 1, analysisPath = testLoc)
+  results <-
+    fitEstimator(trainData$Train,
+                 settings,
+                 analysisId = 1,
+                 analysisPath = testLoc)
 
   expect_equal(class(results), "plpModel")
   expect_equal(attr(results, "modelType"), "binary")
@@ -44,16 +48,17 @@ test_that("fitEstimator with Transformer works", {
 })
 
 test_that("transformer nn-module works", {
-  Transformer <- reticulate::import_from_path("Transformer", path=path)$Transformer
-  model <- Transformer(
-    cat_features = 5, 
-    num_features = 1, 
+  transformer <-
+    reticulate::import_from_path("Transformer", path = path)$Transformer
+  model <- transformer(
+    cat_features = 5,
+    num_features = 1,
     num_blocks = 2,
-    dim_token = 16, 
-    num_heads = 2, 
-    att_dropout = 0, 
+    dim_token = 16,
+    num_heads = 2,
+    att_dropout = 0,
     ffn_dropout = 0,
-    res_dropout = 0, 
+    res_dropout = 0,
     dim_hidden = 32
   )
 
@@ -73,15 +78,15 @@ test_that("transformer nn-module works", {
 
   input$num <- NULL
 
-  model <- Transformer(
-    cat_features = 5, 
-    num_features = 0, 
+  model <- transformer(
+    cat_features = 5,
+    num_features = 0,
     num_blocks = 2,
-    dim_token = 16, 
-    num_heads = 2, 
-    att_dropout = 0, 
+    dim_token = 16,
+    num_heads = 2,
+    att_dropout = 0,
     ffn_dropout = 0,
-    res_dropout = 0, 
+    res_dropout = 0,
     dim_hidden = 32
   )
   output <- model(input)
@@ -89,26 +94,26 @@
   input$num <- reticulate::py_none()
   output <- model(input)
   expect_equal(output$shape[0], 10L)
 })
 
 test_that("Default Transformer works", {
   defaultTransformer <- setDefaultTransformer()
   params <- defaultTransformer$param[[1]]
-  
+
   expect_equal(params$numBlocks, 3)
   expect_equal(params$dimToken, 192)
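+  # the remaining defaults appear to match the FT-Transformer configuration
+  # described by Gorishniy et al. (2021)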
expect_equal(params$numHeads, 8) expect_equal(params$resDropout, 0.0) expect_equal(params$attDropout, 0.2) - - settings <- attr(defaultTransformer, 'settings') - - expect_equal(settings$name, 'defaultTransformer') -}) + + settings <- attr(defaultTransformer, "settings") + + expect_equal(settings$name, "defaultTransformer") +}) test_that("Errors are produced by settings function", { randomSample <- 2 - + expect_error(setTransformer(randomSample = randomSample)) }) @@ -126,7 +134,37 @@ test_that("dimHidden ratio works as expected", { testthat::expect_error(setTransformer(dimHidden = NULL, dimHiddenRatio = NULL)) testthat::expect_error(setTransformer(dimHidden = 256, - dimHiddenRatio = 4/3)) + dimHiddenRatio = 4 / 3)) + +}) + +test_that("numerical embedding works as expected", { + embeddings <- 32L # size of embeddings + features <- 2L # number of numerical features + patients <- 9L + + numTensor <- torch$randn(c(patients, features)) + + numericalEmbeddingClass <- + reticulate::import_from_path("ResNet", path = path)$NumericalEmbedding + numericalEmbedding <- numericalEmbeddingClass(num_embeddings = features, + embedding_dim = embeddings, + bias = TRUE) + out <- numericalEmbedding(numTensor) + + # should be patients x features x embedding size + expect_equal(out$shape[[0]], patients) + expect_equal(out$shape[[1]], features) + expect_equal(out$shape[[2]], embeddings) + + numericalEmbedding <- numericalEmbeddingClass(num_embeddings = features, + embedding_dim = embeddings, + bias = FALSE) + + out <- numericalEmbedding(numTensor) + expect_equal(out$shape[[0]], patients) + expect_equal(out$shape[[1]], features) + expect_equal(out$shape[[2]], embeddings) })