diff --git a/.Rbuildignore b/.Rbuildignore index ac33fdc..307ed11 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -8,3 +8,4 @@ ^extras$ ^deploy.sh$ ^compare_versions$ +^.mypy_cache$ \ No newline at end of file diff --git a/.gitignore b/.gitignore index 94cb359..feec6ee 100644 --- a/.gitignore +++ b/.gitignore @@ -8,4 +8,6 @@ docs .idea/ renv.lock extras/ -.Renviron \ No newline at end of file +.Renviron +inst/python/__pycache__ +.mypy_cache diff --git a/DESCRIPTION b/DESCRIPTION index d4c1a53..91bb9a3 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: DeepPatientLevelPrediction Type: Package Title: Deep Learning For Patient Level Prediction Using Data In The OMOP Common Data Model -Version: 2.0.1 +Version: 2.0.2 Date: 18-04-2023 Authors@R: c( person("Egill", "Fridgeirsson", email = "e.fridgeirsson@erasmusmc.nl", role = c("aut", "cre")), @@ -35,7 +35,8 @@ Suggests: testthat, PRROC, ResultModelManager (>= 0.2.0), - DatabaseConnector (>= 6.0.0) + DatabaseConnector (>= 6.0.0), + Andromeda Remotes: ohdsi/PatientLevelPrediction, ohdsi/FeatureExtraction, @@ -51,7 +52,6 @@ Config/reticulate: list(package = "polars"), list(package = "tqdm"), list(package = "connectorx"), - list(package = "scikit-learn"), list(package = "pyarrow") ) ) diff --git a/NAMESPACE b/NAMESPACE index 4806f0d..5f8c13e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,6 +1,5 @@ # Generated by roxygen2: do not edit by hand -export(TrainingCache) export(fitEstimator) export(gridCvDeep) export(predictDeepEstimator) @@ -10,6 +9,7 @@ export(setEstimator) export(setMultiLayerPerceptron) export(setResNet) export(setTransformer) +export(trainingCache) importFrom(dplyr,"%>%") importFrom(reticulate,py_to_r) importFrom(reticulate,r_to_py) diff --git a/NEWS.md b/NEWS.md index 70ccf12..961ac75 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,13 @@ +DeepPatientLevelPrediction 2.0.2 +====================== + - Ensure output from predict_proba is numeric instead of a 1d array + - Refactoring: Move cross-validation to a separate function + - Refactoring: Move paramsToTune to a separate function + - Linting: Enforce HADES style + - Calculate AUC ourselves with torch, getting rid of the scikit-learn dependency + - Added Andromeda to dev dependencies + + DeepPatientLevelPrediction 2.0.1 ====================== - Connection parameter fixed to be in line with newest polars diff --git a/R/Dataset.R b/R/Dataset.R index f4b5170..4ba4e81 100644 --- a/R/Dataset.R +++ b/R/Dataset.R @@ -15,21 +15,22 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License.
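The 2.0.2 changelog entry above swaps sklearn.metrics.roc_auc_score for a torch-based compute_auc; that function is called in Estimator.py further down in this diff, but its definition sits outside the hunks shown here. As a rough guide to what such a function computes, a minimal rank-statistic AUC sketch in plain R (computeAucSketch is a hypothetical name, not part of the package):

```r
# AUC via the Mann-Whitney U statistic: with average ranks on ties,
# AUC = (sum of ranks of positives - nPos * (nPos + 1) / 2) / (nPos * nNeg)
computeAucSketch <- function(labels, predictions) {
  ranks <- rank(predictions) # average ranks handle tied predictions
  nPos <- sum(labels == 1)
  nNeg <- sum(labels == 0)
  (sum(ranks[labels == 1]) - nPos * (nPos + 1) / 2) / (nPos * nNeg)
}

computeAucSketch(c(0, 0, 1, 1), c(0.1, 0.4, 0.35, 0.8)) # 0.75
```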
-createDataset <- function(data, labels, plpModel=NULL) { +createDataset <- function(data, labels, plpModel = NULL) { path <- system.file("python", package = "DeepPatientLevelPrediction") - Dataset <- reticulate::import_from_path("Dataset", path = path)$Data + dataset <- reticulate::import_from_path("Dataset", path = path)$Data if (is.null(attributes(data)$path)) { # sqlite object attributes(data)$path <- attributes(data)$dbname } if (is.null(plpModel)) { - data <- Dataset(r_to_py(normalizePath(attributes(data)$path)), - r_to_py(labels$outcomeCount)) + data <- dataset(r_to_py(normalizePath(attributes(data)$path)), + r_to_py(labels$outcomeCount)) + } else { + numericalFeatures <- + r_to_py(as.array(which(plpModel$covariateImportance$isNumeric))) + data <- dataset(r_to_py(normalizePath(attributes(data)$path)), + numerical_features = numericalFeatures) } - else { - data <- Dataset(r_to_py(normalizePath(attributes(data)$path)), - numerical_features = r_to_py(as.array(which(plpModel$covariateImportance$isNumeric))) ) - } - - return(data) -} \ No newline at end of file + + return(data) +} diff --git a/R/DeepPatientLevelPrediction.R b/R/DeepPatientLevelPrediction.R index e77812e..137b53a 100644 --- a/R/DeepPatientLevelPrediction.R +++ b/R/DeepPatientLevelPrediction.R @@ -18,7 +18,8 @@ #' DeepPatientLevelPrediction #' -#' @description A package containing deep learning extensions for developing prediction models using data in the OMOP CDM +#' @description A package containing deep learning extensions for developing +#' prediction models using data in the OMOP CDM #' #' @docType package #' @name DeepPatientLevelPrediction @@ -28,9 +29,7 @@ NULL .onLoad <- function(libname, pkgname) { - # use superassignment to update global reference + # use superassignment to update global reference reticulate::configure_environment(pkgname) torch <<- reticulate::import("torch", delay_load = TRUE) } - - diff --git a/R/Estimator.R b/R/Estimator.R index 7705279..4ef2047 100644 --- a/R/Estimator.R +++ b/R/Estimator.R @@ -26,37 +26,41 @@ #' @param weightDecay what weight_decay to use #' @param batchSize batchSize to use #' @param epochs how many epochs to train for -#' @param device what device to train on, can be a string or a function to that evaluates +#' @param device what device to train on, can be a string or a function that +#' evaluates #' to the device during runtime #' @param optimizer which optimizer to use #' @param scheduler which learning rate scheduler to use #' @param criterion loss function to use -#' @param earlyStopping If earlyStopping should be used which stops the training of your metric is not improving -#' @param metric either `auc` or `loss` or a custom metric to use. This is the metric used for scheduler and earlyStopping. -#' Needs to be a list with function `fun`, mode either `min` or `max` and a `name`, -#' `fun` needs to be a function that takes in prediction and labels and outputs a score. +#' @param earlyStopping If earlyStopping should be used, which stops the +#' training if your metric is not improving +#' @param metric either `auc` or `loss` or a custom metric to use. This is the +#' metric used for scheduler and earlyStopping. +#' Needs to be a list with function `fun`, mode either `min` or `max` and a +#' `name`, +#' `fun` needs to be a function that takes in prediction and labels and +#' outputs a score.
#' @param seed seed to initialize weights of model with #' @export -setEstimator <- function(learningRate='auto', - weightDecay = 0.0, - batchSize = 512, - epochs = 30, - device='cpu', - optimizer = torch$optim$AdamW, - scheduler = list(fun=torch$optim$lr_scheduler$ReduceLROnPlateau, - params=list(patience=1)), - criterion = torch$nn$BCEWithLogitsLoss, - earlyStopping = list(useEarlyStopping=TRUE, - params = list(patience=4)), - metric = "auc", - seed = NULL +setEstimator <- function(learningRate = "auto", + weightDecay = 0.0, + batchSize = 512, + epochs = 30, + device = "cpu", + optimizer = torch$optim$AdamW, + scheduler = list(fun = torch$optim$lr_scheduler$ReduceLROnPlateau, + params = list(patience = 1)), + criterion = torch$nn$BCEWithLogitsLoss, + earlyStopping = list(useEarlyStopping = TRUE, + params = list(patience = 4)), + metric = "auc", + seed = NULL ) { - + checkIsClass(learningRate, c("numeric", "character")) - if (inherits(learningRate, "character")) { - if (learningRate != "auto"){ - stop(paste0('Learning rate should be either a numeric or "auto", you provided: ', learningRate)) - } + if (inherits(learningRate, "character") && learningRate != "auto") { + stop(paste0('Learning rate should be either a numeric or "auto", + you provided: ', learningRate)) } checkIsClass(weightDecay, "numeric") checkHigherEqual(weightDecay, 0.0) @@ -67,58 +71,50 @@ setEstimator <- function(learningRate='auto', checkIsClass(earlyStopping, c("list", "NULL")) checkIsClass(metric, c("character", "list")) checkIsClass(seed, c("numeric", "integer", "NULL")) - - - if (length(learningRate)==1 && learningRate=='auto') {findLR <- TRUE} else {findLR <- FALSE} + + + if (length(learningRate) == 1 && learningRate == "auto") { + findLR <- TRUE + } else { + findLR <- FALSE + } if (is.null(seed)) { seed <- as.integer(sample(1e5, 1)) } - estimatorSettings <- list(learningRate=learningRate, - weightDecay=weightDecay, - batchSize=batchSize, - epochs=epochs, - device=device, - earlyStopping=earlyStopping, - findLR=findLR, - metric=metric, - seed=seed[1]) - - optimizer <- rlang::enquo(optimizer) + estimatorSettings <- list(learningRate = learningRate, + weightDecay = weightDecay, + batchSize = batchSize, + epochs = epochs, + device = device, + earlyStopping = earlyStopping, + findLR = findLR, + metric = metric, + seed = seed[1]) + + optimizer <- rlang::enquo(optimizer) estimatorSettings$optimizer <- function() rlang::eval_tidy(optimizer) - class(estimatorSettings$optimizer) <- c("delayed", class(estimatorSettings$optimizer)) - + class(estimatorSettings$optimizer) <- c("delayed", + class(estimatorSettings$optimizer)) + criterion <- rlang::enquo(criterion) estimatorSettings$criterion <- function() rlang::eval_tidy(criterion) - class(estimatorSettings$criterion) <- c("delayed", class(estimatorSettings$criterion)) - + class(estimatorSettings$criterion) <- c("delayed", + class(estimatorSettings$criterion)) + scheduler <- rlang::enquo(scheduler) estimatorSettings$scheduler <- function() rlang::eval_tidy(scheduler) - class(estimatorSettings$scheduler) <-c("delayed", class(estimatorSettings$scheduler)) + class(estimatorSettings$scheduler) <- c("delayed", + class(estimatorSettings$scheduler)) if (is.function(device)) { - class(estimatorSettings$device) <- c("delayed", class(estimatorSettings$device)) + class(estimatorSettings$device) <- c("delayed", + class(estimatorSettings$device)) } - - paramsToTune <- list() - for (name in names(estimatorSettings)) { - param <- estimatorSettings[[name]] - if (length(param) > 1 && 
is.atomic(param)) { - paramsToTune[[paste0('estimator.',name)]] <- param - } - if ("params" %in% names(param)) { - for (name2 in names(param[["params"]])) { - param2 <- param[["params"]][[name2]] - if (length(param2) > 1) { - paramsToTune[[paste0('estimator.',name,'.',name2)]] <- param2 - } - } - } - } - estimatorSettings$paramsToTune <- paramsToTune + estimatorSettings$paramsToTune <- extractParamsToTune(estimatorSettings) return(estimatorSettings) } - + #' fitEstimator #' #' @description @@ -137,12 +133,12 @@ fitEstimator <- function(trainData, analysisPath, ...) { start <- Sys.time() - + # check covariate data if (!FeatureExtraction::isCovariateData(trainData$covariateData)) { stop("Needs correct covariateData") } - + if (!is.null(trainData$folds)) { trainData$labels <- merge(trainData$labels, trainData$fold, by = "rowId") } @@ -150,9 +146,9 @@ fitEstimator <- function(trainData, covariateData = trainData$covariateData, cohort = trainData$labels ) - + covariateRef <- mappedCovariateData$covariateRef - + outLoc <- PatientLevelPrediction::createTempModelLoc() cvResult <- do.call( what = gridCvDeep, @@ -164,10 +160,12 @@ fitEstimator <- function(trainData, analysisPath = analysisPath ) ) - - hyperSummary <- do.call(rbind, lapply(cvResult$paramGridSearch, function(x) x$hyperSummary)) + + hyperSummary <- do.call(rbind, lapply(cvResult$paramGridSearch, + function(x) x$hyperSummary)) prediction <- cvResult$prediction - incs <- rep(1, covariateRef %>% dplyr::tally() %>% + incs <- rep(1, covariateRef %>% + dplyr::tally() %>% dplyr::collect() %>% as.integer()) covariateRef <- covariateRef %>% @@ -178,25 +176,30 @@ fitEstimator <- function(trainData, covariateValue = 0, isNumeric = .data$columnId %in% cvResult$numericalIndex ) - + comp <- start - Sys.time() result <- list( - model = cvResult$estimator, # file.path(outLoc), - + model = cvResult$estimator, + preprocessing = list( - featureEngineering = attr(trainData$covariateData, "metaData")$featureEngineering, - tidyCovariates = attr(trainData$covariateData, "metaData")$tidyCovariateDataSettings, - requireDenseMatrix = F + featureEngineering = attr(trainData$covariateData, + "metaData")$featureEngineering, + tidyCovariates = attr(trainData$covariateData, + "metaData")$tidyCovariateDataSettings, + requireDenseMatrix = FALSE ), prediction = prediction, modelDesign = PatientLevelPrediction::createModelDesign( targetId = attr(trainData, "metaData")$targetId, outcomeId = attr(trainData, "metaData")$outcomeId, - restrictPlpDataSettings = attr(trainData, "metaData")$restrictPlpDataSettings, + restrictPlpDataSettings = attr(trainData, + "metaData")$restrictPlpDataSettings, covariateSettings = attr(trainData, "metaData")$covariateSettings, populationSettings = attr(trainData, "metaData")$populationSettings, - featureEngineeringSettings = attr(trainData$covariateData, "metaData")$featureEngineeringSettings, - preprocessSettings = attr(trainData$covariateData, "metaData")$preprocessSettings, + featureEngineeringSettings = attr(trainData$covariateData, + "metaData")$featureEngineeringSettings, + preprocessSettings = attr(trainData$covariateData, + "metaData")$preprocessSettings, modelSettings = modelSettings, splitSettings = attr(trainData, "metaData")$splitSettings, sampleSettings = attr(trainData, "metaData")$sampleSettings @@ -214,12 +217,12 @@ fitEstimator <- function(trainData, ), covariateImportance = covariateRef ) - + class(result) <- "plpModel" attr(result, "predictionFunction") <- "predictDeepEstimator" attr(result, "modelType") <- "binary" 
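The class and attributes set here are what let PatientLevelPrediction route scoring back into this package: PLP stores the prediction function's name on the fitted model and calls it by name on new data. A minimal sketch of that dispatch, where plpData and population stand in for PLP objects not defined in this diff (hypothetical names):

```r
# plpModel is the object assembled above by fitEstimator()
predFun <- attr(plpModel, "predictionFunction") # "predictDeepEstimator"
prediction <- do.call(predFun, list(plpModel = plpModel,
                                    data = plpData,
                                    cohort = population))
```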
attr(result, "saveType") <- modelSettings$saveType - + return(result) } @@ -242,32 +245,31 @@ predictDeepEstimator <- function(plpModel, } if ("plpData" %in% class(data)) { mappedData <- PatientLevelPrediction::MapIds(data$covariateData, - cohort = cohort, - mapping = plpModel$covariateImportance %>% - dplyr::select( - "columnId", - "covariateId" - ) + cohort = cohort, + mapping = plpModel$covariateImportance %>% + dplyr::select("columnId", "covariateId") ) - data <- createDataset(mappedData, plpModel=plpModel) + data <- createDataset(mappedData, plpModel = plpModel) } - + # get predictions prediction <- cohort if (is.character(plpModel$model)) { modelSettings <- plpModel$modelDesign$modelSettings - model <- torch$load(file.path(plpModel$model, "DeepEstimatorModel.pt"), map_location = "cpu") - estimator <- createEstimator(modelType=modelSettings$modelType, - modelParameters=model$model_parameters, - estimatorSettings=model$estimator_settings) + model <- torch$load(file.path(plpModel$model, + "DeepEstimatorModel.pt"), + map_location = "cpu") + estimator <- createEstimator(modelType = modelSettings$modelType, + modelParameters = model$model_parameters, + estimatorSettings = model$estimator_settings) estimator$model$load_state_dict(model$model_state_dict) prediction$value <- estimator$predict_proba(data) } else { prediction$value <- plpModel$model$predict_proba(data) } - + prediction$value <- as.numeric(prediction$value) attr(prediction, "metaData")$modelType <- attr(plpModel, "modelType") - + return(prediction) } @@ -289,15 +291,16 @@ gridCvDeep <- function(mappedData, modelSettings, modelLocation, analysisPath) { - ParallelLogger::logInfo(paste0("Running hyperparameter search for ", modelSettings$modelType, " model")) - + ParallelLogger::logInfo(paste0("Running hyperparameter search for ", + modelSettings$modelType, " model")) + ########################################################################### - + paramSearch <- modelSettings$param - + # TODO below chunk should be in a setupCache function - trainCache <- TrainingCache$new(analysisPath) - if (trainCache$isParamGridIdentical(paramSearch)) { + trainCache <- trainingCache$new(analysisPath) + if (trainCache$isParamGridIdentical(paramSearch)) { gridSearchPredictons <- trainCache$getGridSearchPredictions() } else { gridSearchPredictons <- list() @@ -305,123 +308,114 @@ gridCvDeep <- function(mappedData, trainCache$saveGridSearchPredictions(gridSearchPredictons) trainCache$saveModelParams(paramSearch) } - - dataset <- createDataset(data=mappedData, labels=labels) - - fitParams <- names(paramSearch[[1]])[grepl("^estimator", names(paramSearch[[1]]))] + + dataset <- createDataset(data = mappedData, labels = labels) + + fitParams <- names(paramSearch[[1]])[grepl("^estimator", + names(paramSearch[[1]]))] findLR <- modelSettings$estimatorSettings$findLR if (!trainCache$isFull()) { for (gridId in trainCache$getLastGridSearchIndex():length(paramSearch)) { - ParallelLogger::logInfo(paste0("Running hyperparameter combination no ", gridId)) - ParallelLogger::logInfo(paste0("HyperParameters: ")) - ParallelLogger::logInfo(paste(names(paramSearch[[gridId]]), paramSearch[[gridId]], collapse = " | ")) - currentModelParams <- paramSearch[[gridId]][modelSettings$modelParamNames] - - currentEstimatorSettings <- fillEstimatorSettings(modelSettings$estimatorSettings, fitParams, - paramSearch[[gridId]]) - - # initiate prediction - prediction <- NULL - - fold <- labels$index - ParallelLogger::logInfo(paste0("Max fold: ", max(fold))) - 
currentModelParams$catFeatures <- dataset$get_cat_features()$shape[[1]] - currentModelParams$numFeatures <- dataset$get_numerical_features()$shape[[1]] - if (findLR) { - LrFinder <- createLRFinder(modelType = modelSettings$modelType, - modelParameters = currentModelParams, - estimatorSettings = currentEstimatorSettings - ) - lr <- LrFinder$get_lr(dataset) - ParallelLogger::logInfo(paste0("Auto learning rate selected as: ", lr)) - currentEstimatorSettings$learningRate <- lr - } - - learnRates <- list() - for (i in 1:max(fold)) { - ParallelLogger::logInfo(paste0("Fold ", i)) - trainDataset <- torch$utils$data$Subset(dataset, indices = as.integer(which(fold != i) - 1)) # -1 for python 0-based indexing - testDataset <- torch$utils$data$Subset(dataset, indices = as.integer(which(fold == i) -1)) # -1 for python 0-based indexing - - estimator <- createEstimator(modelType=modelSettings$modelType, - modelParameters=currentModelParams, - estimatorSettings=currentEstimatorSettings) - estimator$fit(trainDataset, testDataset) - - ParallelLogger::logInfo("Calculating predictions on left out fold set...") - - prediction <- rbind( - prediction, - predictDeepEstimator( - plpModel = estimator, - data = testDataset, - cohort = labels[fold == i, ] + ParallelLogger::logInfo(paste0("Running hyperparameter combination no ", + gridId)) + ParallelLogger::logInfo(paste0("HyperParameters: ")) + ParallelLogger::logInfo(paste(names(paramSearch[[gridId]]), + paramSearch[[gridId]], collapse = " | ")) + currentModelParams <- paramSearch[[gridId]][modelSettings$modelParamNames] + + currentEstimatorSettings <- + fillEstimatorSettings(modelSettings$estimatorSettings, + fitParams, + paramSearch[[gridId]]) + currentEstimatorSettings$modelType <- modelSettings$modelType + currentModelParams$catFeatures <- dataset$get_cat_features()$max() + currentModelParams$numFeatures <- + dataset$get_numerical_features()$max() + if (findLR) { + lrFinder <- createLRFinder(modelType = modelSettings$modelType, + modelParameters = currentModelParams, + estimatorSettings = currentEstimatorSettings ) + lr <- lrFinder$get_lr(dataset) + ParallelLogger::logInfo(paste0("Auto learning rate selected as: ", lr)) + currentEstimatorSettings$learningRate <- lr + } + + crossValidationResults <- + doCrossvalidation(dataset, + labels = labels, + modelSettings = currentModelParams, + estimatorSettings = currentEstimatorSettings) + learnRates <- crossValidationResults$learnRates + prediction <- crossValidationResults$prediction + + gridPerformance <- + PatientLevelPrediction::computeGridPerformance(prediction, + paramSearch[[gridId]]) + maxIndex <- which.max(unlist(sapply(learnRates, `[`, 2))) + gridSearchPredictons[[gridId]] <- list( + prediction = prediction, + param = paramSearch[[gridId]], + gridPerformance = gridPerformance ) - learnRates[[i]] <- list( - LRs = estimator$learn_rate_schedule, - bestEpoch = estimator$best_epoch - ) - } - maxIndex <- which.max(unlist(sapply(learnRates, `[`, 2))) - gridSearchPredictons[[gridId]] <- list( - prediction = prediction, - param = paramSearch[[gridId]], - gridPerformance = PatientLevelPrediction::computeGridPerformance(prediction, paramSearch[[gridId]]) - ) - gridSearchPredictons[[gridId]]$gridPerformance$hyperSummary$learnRates <- rep(list(unlist(learnRates[[maxIndex]]$LRs)), - nrow(gridSearchPredictons[[gridId]]$gridPerformance$hyperSummary)) - gridSearchPredictons[[gridId]]$param$learnSchedule <- learnRates[[maxIndex]] - - - # remove all predictions that are not the max performance - indexOfMax <- 
which.max(unlist(lapply(gridSearchPredictons, function(x) x$gridPerformance$cvPerformance))) - for (i in seq_along(gridSearchPredictons)) { - if (!is.null(gridSearchPredictons[[i]])) { - if (i != indexOfMax) { + gridSearchPredictons[[gridId]]$gridPerformance$hyperSummary$learnRates <- + rep(list(unlist(learnRates[[maxIndex]]$LRs)), + nrow(gridSearchPredictons[[gridId]]$gridPerformance$hyperSummary)) + gridSearchPredictons[[gridId]]$param$learnSchedule <- + learnRates[[maxIndex]] + # remove all predictions that are not the max performance + indexOfMax <- + which.max(unlist(lapply(gridSearchPredictons, + function(x) x$gridPerformance$cvPerformance))) + for (i in seq_along(gridSearchPredictons)) { + if (!is.null(gridSearchPredictons[[i]]) && i != indexOfMax) { gridSearchPredictons[[i]]$prediction <- list(NULL) } } + ParallelLogger::logInfo(paste0("Caching all grid search results and + prediction for best combination ", + indexOfMax)) + trainCache$saveGridSearchPredictions(gridSearchPredictons) } - ParallelLogger::logInfo(paste0("Caching all grid search results and prediction for best combination ", indexOfMax)) - trainCache$saveGridSearchPredictions(gridSearchPredictons) - } } paramGridSearch <- lapply(gridSearchPredictons, function(x) x$gridPerformance) # get best params - indexOfMax <- which.max(unlist(lapply(gridSearchPredictons, function(x) x$gridPerformance$cvPerformance))) + indexOfMax <- + which.max(unlist(lapply(gridSearchPredictons, + function(x) x$gridPerformance$cvPerformance))) finalParam <- gridSearchPredictons[[indexOfMax]]$param paramGridSearch <- lapply(gridSearchPredictons, function(x) x$gridPerformance) - + # get best CV prediction cvPrediction <- gridSearchPredictons[[indexOfMax]]$prediction cvPrediction$evaluationType <- "CV" - + ParallelLogger::logInfo("Training final model using optimal parameters") # get the params modelParams <- finalParam[modelSettings$modelParamNames] - - + + # create the dir if (!dir.exists(file.path(modelLocation))) { - dir.create(file.path(modelLocation), recursive = T) + dir.create(file.path(modelLocation), recursive = TRUE) } - - modelParams$catFeatures <- dataset$get_cat_features()$shape[[1]] - modelParams$numFeatures <- dataset$get_numerical_features()$shape[[1]] - - - estimatorSettings <- fillEstimatorSettings(modelSettings$estimatorSettings, fitParams, + + modelParams$catFeatures <- dataset$get_cat_features()$max() + modelParams$numFeatures <- dataset$get_numerical_features()$max() + + + estimatorSettings <- fillEstimatorSettings(modelSettings$estimatorSettings, + fitParams, finalParam) estimatorSettings$learningRate <- finalParam$learnSchedule$LRs[[1]] estimator <- createEstimator(modelType = modelSettings$modelType, modelParameters = modelParams, estimatorSettings = estimatorSettings) - + numericalIndex <- dataset$get_numerical_features() estimator$fit_whole_training_set(dataset, finalParam$learnSchedule$LRs) - + ParallelLogger::logInfo("Calculating predictions on all train data...") prediction <- predictDeepEstimator( plpModel = estimator, @@ -429,7 +423,7 @@ gridCvDeep <- function(mappedData, cohort = labels ) prediction$evaluationType <- "Train" - + prediction <- rbind( prediction, cvPrediction @@ -438,9 +432,10 @@ gridCvDeep <- function(mappedData, prediction <- prediction %>% dplyr::select(-"index") - prediction$cohortStartDate <- as.Date(prediction$cohortStartDate, origin = "1970-01-01") - - + prediction$cohortStartDate <- as.Date(prediction$cohortStartDate, + origin = "1970-01-01") + + # save torch code here 
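For readers following the save/load round trip: estimator$save() below writes a single artifact whose contents are defined in Estimator.py later in this diff (a dict holding model_state_dict, model_parameters, estimator_settings and epoch), and predictDeepEstimator() above reloads that same file with torch$load(..., map_location = "cpu"). A small sketch of inspecting such an artifact from R, assuming the reticulate torch handle imported in .onLoad (modelLocation is whatever directory was passed to save):

```r
# map_location = "cpu" mirrors predictDeepEstimator(), so a GPU-trained
# model can be opened on a CPU-only machine.
saved <- torch$load(file.path(modelLocation, "DeepEstimatorModel.pt"),
                    map_location = "cpu")
names(saved) # expected keys: model_state_dict, model_parameters,
             # estimator_settings, epoch
```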
estimator$save(modelLocation, "DeepEstimatorModel.pt") return( @@ -454,23 +449,25 @@ gridCvDeep <- function(mappedData, ) } -# utility function to add instances of parameters to estimatorSettings during grid search +# utility function to add instances of parameters to estimatorSettings +# during grid search fillEstimatorSettings <- function(estimatorSettings, fitParams, paramSearch) { for (fp in fitParams) { components <- strsplit(fp, "[.]")[[1]] - - if (length(components)==2) { + + if (length(components) == 2) { estimatorSettings[[components[[2]]]] <- paramSearch[[fp]] } else { - estimatorSettings[[components[[2]]]]$params[[components[[3]]]] <- paramSearch[[fp]] + estimatorSettings[[components[[2]]]]$params[[components[[3]]]] <- + paramSearch[[fp]] } } return(estimatorSettings) } -# utility function to evaluate any expressions or call functions passed as settings +# utility function to evaluate any expressions or call functions passed as +# settings evalEstimatorSettings <- function(estimatorSettings) { - for (set in names(estimatorSettings)) { if (inherits(estimatorSettings[[set]], "delayed")) { estimatorSettings[[set]] <- estimatorSettings[[set]]() @@ -484,16 +481,79 @@ createEstimator <- function(modelType, estimatorSettings) { path <- system.file("python", package = "DeepPatientLevelPrediction") - Model <- reticulate::import_from_path(modelType, path=path)[[modelType]] - - Estimator <- reticulate::import_from_path("Estimator", path=path)$Estimator - + model <- reticulate::import_from_path(modelType, path = path)[[modelType]] + estimator <- reticulate::import_from_path("Estimator", path = path)$Estimator + modelParameters <- camelCaseToSnakeCaseNames(modelParameters) estimatorSettings <- camelCaseToSnakeCaseNames(estimatorSettings) estimatorSettings <- evalEstimatorSettings(estimatorSettings) - - estimator <- Estimator(model = Model, + + estimator <- estimator(model = model, model_parameters = modelParameters, estimator_settings = estimatorSettings) return(estimator) -} \ No newline at end of file +} + +doCrossvalidation <- function(dataset, + labels, + modelSettings, + estimatorSettings) { + fold <- labels$index + ParallelLogger::logInfo(paste0("Max fold: ", max(fold))) + learnRates <- list() + prediction <- NULL + for (i in 1:max(fold)) { + ParallelLogger::logInfo(paste0("Fold ", i)) + + # -1 for python 0-based indexing + trainDataset <- torch$utils$data$Subset(dataset, + indices = + as.integer(which(fold != i) - 1)) + + # -1 for python 0-based indexing + testDataset <- torch$utils$data$Subset(dataset, + indices = + as.integer(which(fold == i) - 1)) + estimator <- createEstimator(modelType = estimatorSettings$modelType, + modelParameters = modelSettings, + estimatorSettings = estimatorSettings) + estimator$fit(trainDataset, testDataset) + + ParallelLogger::logInfo("Calculating predictions on left out fold set...") + + prediction <- rbind( + prediction, + predictDeepEstimator( + plpModel = estimator, + data = testDataset, + cohort = labels[fold == i, ] + ) + ) + learnRates[[i]] <- list( + LRs = estimator$learn_rate_schedule, + bestEpoch = estimator$best_epoch + ) + } + return(results = list(prediction = prediction, + learnRates = learnRates)) + +} + +extractParamsToTune <- function(estimatorSettings) { + paramsToTune <- list() + for (name in names(estimatorSettings)) { + param <- estimatorSettings[[name]] + if (length(param) > 1 && is.atomic(param)) { + paramsToTune[[paste0("estimator.", name)]] <- param + } + if ("params" %in% names(param)) { + for (name2 in names(param[["params"]])) 
{ + param2 <- param[["params"]][[name2]] + if (length(param2) > 1) { + paramsToTune[[paste0("estimator.", name, ".", name2)]] <- param2 + } + } + } + } + return(paramsToTune) +} diff --git a/R/HelperFunctions.R b/R/HelperFunctions.R index 350baf9..a1aa43c 100644 --- a/R/HelperFunctions.R +++ b/R/HelperFunctions.R @@ -41,40 +41,40 @@ camelCaseToSnakeCaseNames <- function(object) { } #' helper function to check class of input -#' +#' @param parameter the input parameter to check #' @param classes which classes it should belong to (one or more) -checkIsClass<- function(parameter,classes) { - name = deparse(substitute(parameter)) +checkIsClass <- function(parameter, classes) { + name <- deparse(substitute(parameter)) if (!inherits(x = parameter, what = classes)) { - ParallelLogger::logError(paste0(name, ' should be of class:', classes, ' ')) - stop(paste0(name, ' is wrong class')) + ParallelLogger::logError(paste0(name, " should be of class:", classes, " ")) + stop(paste0(name, " is wrong class")) } return(TRUE) } #' helper function to check that input is higher than a certain value -#' +#' @param parameter the input parameter to check, can be a vector #' @param value which value it should be higher than -checkHigher <- function(parameter,value) { - name = deparse(substitute(parameter)) - if (!is.numeric(parameter) | all(parameter<=value)) { - ParallelLogger::logError(paste0(name, ' needs to be > ',value)) - stop(paste0(name, ' needs to be > ', value)) +checkHigher <- function(parameter, value) { + name <- deparse(substitute(parameter)) + if (!is.numeric(parameter) || all(parameter <= value)) { + ParallelLogger::logError(paste0(name, " needs to be > ", value)) + stop(paste0(name, " needs to be > ", value)) } return(TRUE) } #' helper function to check that input is higher or equal than a certain value -#' +#' @param parameter the input parameter to check, can be a vector #' @param value which value it should be higher or equal than -checkHigherEqual <- function(parameter,value) { - name = deparse(substitute(parameter)) - if (!is.numeric(parameter) | all(parameter<value)) { - ParallelLogger::logError(paste0(name, ' needs to be >= ',value)) - stop(paste0(name, ' needs to be >= ', value)) +checkHigherEqual <- function(parameter, value) { + name <- deparse(substitute(parameter)) + if (!is.numeric(parameter) || all(parameter < value)) { + ParallelLogger::logError(paste0(name, " needs to be >= ", value)) + stop(paste0(name, " needs to be >= ", value)) } return(TRUE) -} \ No newline at end of file +} diff --git a/R/LRFinder.R b/R/LRFinder.R index dfe8ee3..4ab006c 100644 --- a/R/LRFinder.R +++ b/R/LRFinder.R @@ -18,22 +18,26 @@ createLRFinder <- function(modelType, modelParameters, estimatorSettings, - lrSettings=NULL) { + lrSettings = NULL) { path <- system.file("python", package = "DeepPatientLevelPrediction") - LRFinderClass <- reticulate::import_from_path("LrFinder", path = path)$LrFinder + lrFinderClass <- + reticulate::import_from_path("LrFinder", path = path)$LrFinder - model <- reticulate::import_from_path(modelType, path=path)[[modelType]] + + model <- reticulate::import_from_path(modelType, path = path)[[modelType]] modelParameters <- camelCaseToSnakeCaseNames(modelParameters) estimatorSettings <- camelCaseToSnakeCaseNames(estimatorSettings) + estimatorSettings <- evalEstimatorSettings(estimatorSettings) + + # estimator <- createEstimator(modelType = estimatorSettings$modelType, + # modelParameters = modelParameters, + # estimatorSettings = estimatorSettings) if (!is.null(lrSettings)) { lrSettings <- camelCaseToSnakeCaseNames(lrSettings) } -
estimatorSettings <- evalEstimatorSettings(estimatorSettings) - - - lrFinder <- LRFinderClass(model=model, + lrFinder <- lrFinderClass(model = model, model_parameters = modelParameters, estimator_settings = estimatorSettings, lr_settings = lrSettings) diff --git a/R/MLP.R b/R/MLP.R index 6ad35e6..771e244 100644 --- a/R/MLP.R +++ b/R/MLP.R @@ -26,12 +26,15 @@ #' #' #' @param numLayers Number of layers in network, default: 1:8 -#' @param sizeHidden Amount of neurons in each default layer, default: 2^(6:9) (64 to 512) -#' @param dropout How much dropout to apply after first linear, default: seq(0, 0.3, 0.05) -#' @param sizeEmbedding Size of embedding layer, default: 2^(6:9) (64 to 512) +#' @param sizeHidden Amount of neurons in each default layer, +#' default: 2^(6:9) (64 to 512) +#' @param dropout How much dropout to apply after first linear, +#' default: seq(0, 0.3, 0.05) +#' @param sizeEmbedding Size of embedding layer, default: 2^(6:9) (64 to 512) #' @param estimatorSettings settings of Estimator created with `setEstimator` -#' @param hyperParamSearch Which kind of hyperparameter search to use random sampling or exhaustive grid search. default: 'random' -#' @param randomSample How many random samples from hyperparameter space to use +#' @param hyperParamSearch Which kind of hyperparameter search to use, random +#' sampling or exhaustive grid search. default: 'random' +#' @param randomSample How many random samples from hyperparameter space to use #' @param randomSampleSeed Random seed to sample hyperparameter combinations #' #' @export @@ -39,35 +42,34 @@ setMultiLayerPerceptron <- function(numLayers = c(1:8), sizeHidden = c(2^(6:9)), dropout = c(seq(0, 0.3, 0.05)), sizeEmbedding = c(2^(6:9)), - estimatorSettings = setEstimator( - learningRate = 'auto', - weightDecay = c(1e-6, 1e-3), - batchSize = 1024, - epochs = 30, - device="cpu"), + estimatorSettings = + setEstimator(learningRate = "auto", + weightDecay = c(1e-6, 1e-3), + batchSize = 1024, + epochs = 30, + device = "cpu"), hyperParamSearch = "random", randomSample = 100, randomSampleSeed = NULL) { - checkIsClass(numLayers, c("integer", "numeric")) checkHigherEqual(numLayers, 1) checkIsClass(sizeHidden, c("integer", "numeric")) checkHigherEqual(sizeHidden, 1) - checkIsClass(dropout, c("numeric")) checkHigherEqual(dropout, 0) - checkIsClass(sizeEmbedding, c("numeric", "integer")) checkHigherEqual(sizeEmbedding, 1) - checkIsClass(hyperParamSearch, "character") - checkIsClass(randomSample, c("numeric", "integer")) checkHigherEqual(randomSample, 1) - checkIsClass(randomSampleSeed, c("numeric", "integer", "NULL")) - paramGrid <- list( numLayers = numLayers, sizeHidden = sizeHidden, @@ -78,16 +80,20 @@ setMultiLayerPerceptron <- function(numLayers = c(1:8), paramGrid <- c(paramGrid, estimatorSettings$paramsToTune) param <- PatientLevelPrediction::listCartesian(paramGrid) - if (hyperParamSearch == "random" && randomSample>length(param)) { - stop(paste("\n Chosen amount of randomSamples is higher than the amount of possible hyperparameter combinations.", - "\n randomSample:", randomSample,"\n Possible hyperparameter combinations:", length(param), + if (hyperParamSearch == "random" && randomSample > length(param)) { + stop(paste("\n Chosen amount of randomSamples is higher than the + amount of possible hyperparameter combinations.", + "\n randomSample:", randomSample, "\n Possible hyperparameter + combinations:", length(param), "\n Please lower the amount of randomSamples")) } if (hyperParamSearch == "random") { -
suppressWarnings(withr::with_seed(randomSampleSeed, {param <- param[sample(length(param), randomSample)]})) + suppressWarnings(withr::with_seed(randomSampleSeed, + {param <- param[sample(length(param), + randomSample)]})) } - attr(param, 'settings')$modelType <- "MLP" + attr(param, "settings")$modelType <- "MLP" results <- list( fitFunction = "fitEstimator", diff --git a/R/ResNet.R b/R/ResNet.R index d20aa1d..32e7f3a 100644 --- a/R/ResNet.R +++ b/R/ResNet.R @@ -22,18 +22,19 @@ #' Creates settings for a default ResNet model #' #' @details -#' Model architecture from by https://arxiv.org/abs/2106.11959 . +#' Model architecture from https://arxiv.org/abs/2106.11959 . #' Hyperparameters chosen by experience on a few prediction problems. #' #' @param estimatorSettings created with ```setEstimator``` #' @export -setDefaultResNet <- function(estimatorSettings=setEstimator(learningRate='auto', - weightDecay=1e-6, - device='cpu', - batchSize=1024, - epochs=50, - seed=NULL)) { +setDefaultResNet <- function(estimatorSettings = + setEstimator(learningRate = "auto", + weightDecay = 1e-6, + device = "cpu", + batchSize = 1024, + epochs = 50, + seed = NULL)) { resnetSettings <- setResNet(numLayers = 6, sizeHidden = 512, hiddenFactor = 2, @@ -41,9 +42,9 @@ setDefaultResNet <- function(estimatorSettings=setEstimator(learningRate='auto', hiddenDropout = 0.4, sizeEmbedding = 256, estimatorSettings = estimatorSettings, - hyperParamSearch = 'random', + hyperParamSearch = "random", randomSample = 1) - attr(resnetSettings, 'settings')$name <- 'defaultResnet' + attr(resnetSettings, "settings")$name <- "defaultResnet" return(resnetSettings) } @@ -58,14 +59,21 @@ setDefaultResNet <- function(estimatorSettings=setEstimator(learningRate='auto', #' #' @param numLayers Number of layers in network, default: 1:16 -#' @param sizeHidden Amount of neurons in each default layer, default: 2^(6:10) (64 to 1024) -#' @param hiddenFactor How much to grow the amount of neurons in each ResLayer, default: 1:4 -#' @param residualDropout How much dropout to apply after last linear layer in ResLayer, default: seq(0, 0.3, 0.05) -#' @param hiddenDropout How much dropout to apply after first linear layer in ResLayer, default: seq(0, 0.3, 0.05) -#' @param sizeEmbedding Size of embedding layer, default: 2^(6:9) (64 to 512) +#' @param sizeHidden Amount of neurons in each default layer, default: +#' 2^(6:10) (64 to 1024) +#' @param hiddenFactor How much to grow the amount of neurons in each +#' ResLayer, default: 1:4 +#' @param residualDropout How much dropout to apply after last linear layer +#' in ResLayer, default: seq(0, 0.3, 0.05) +#' @param hiddenDropout How much dropout to apply after first linear layer +#' in ResLayer, default: seq(0, 0.3, 0.05) +#' @param sizeEmbedding Size of embedding layer, default: 2^(6:9) +#' (64 to 512) #' @param estimatorSettings created with ```setEstimator``` -#' @param hyperParamSearch Which kind of hyperparameter search to use random sampling or exhaustive grid search. default: 'random' -#' @param randomSample How many random samples from hyperparameter space to use +#' @param hyperParamSearch Which kind of hyperparameter search to use, random +#' sampling or exhaustive grid search.
default: 'random' +#' @param randomSample How many random samples from hyperparameter space +#' to use #' @param randomSampleSeed Random seed to sample hyperparameter combinations #' @export setResNet <- function(numLayers = c(1:8), @@ -74,61 +82,62 @@ setResNet <- function(numLayers = c(1:8), residualDropout = c(seq(0, 0.5, 0.05)), hiddenDropout = c(seq(0, 0.5, 0.05)), sizeEmbedding = c(2^(6:9)), - estimatorSettings = setEstimator(learningRate='auto', - weightDecay=c(1e-6, 1e-3), - device='cpu', - batchSize=1024, - epochs=30, - seed=NULL), + estimatorSettings = + setEstimator(learningRate = "auto", + weightDecay = c(1e-6, 1e-3), + device = "cpu", + batchSize = 1024, + epochs = 30, + seed = NULL), hyperParamSearch = "random", randomSample = 100, - randomSampleSeed = NULL) -{ + randomSampleSeed = NULL) { checkIsClass(numLayers, c("integer", "numeric")) checkHigherEqual(numLayers, 1) - + checkIsClass(sizeHidden, c("integer", "numeric")) checkHigherEqual(sizeHidden, 1) - + checkIsClass(residualDropout, "numeric") checkHigherEqual(residualDropout, 0) - + checkIsClass(hiddenDropout, "numeric") checkHigherEqual(hiddenDropout, 0) - + checkIsClass(sizeEmbedding, c("integer", "numeric")) checkHigherEqual(sizeEmbedding, 1) - + checkIsClass(hyperParamSearch, "character") - + checkIsClass(randomSample, c("numeric", "integer")) checkHigherEqual(randomSample, 1) - + checkIsClass(randomSampleSeed, c("numeric", "integer", "NULL")) - - paramGrid <- list( - numLayers = numLayers, - sizeHidden = sizeHidden, - hiddenFactor = hiddenFactor, - residualDropout = residualDropout, - hiddenDropout = hiddenDropout, - sizeEmbedding = sizeEmbedding) - + + paramGrid <- list(numLayers = numLayers, + sizeHidden = sizeHidden, + hiddenFactor = hiddenFactor, + residualDropout = residualDropout, + hiddenDropout = hiddenDropout, + sizeEmbedding = sizeEmbedding) paramGrid <- c(paramGrid, estimatorSettings$paramsToTune) - + param <- PatientLevelPrediction::listCartesian(paramGrid) - if (hyperParamSearch == "random" && randomSample>length(param)) { - stop(paste("\n Chosen amount of randomSamples is higher than the amount of possible hyperparameter combinations.", - "\n randomSample:", randomSample,"\n Possible hyperparameter combinations:", length(param), - "\n Please lower the amount of randomSamples")) + if (hyperParamSearch == "random" && randomSample > length(param)) { + stop(paste("\n Chosen amount of randomSamples is higher than the amount of + possible hyperparameter combinations.", "\n randomSample:", + randomSample, "\n Possible hyperparameter combinations:", + length(param), "\n Please lower the amount of randomSamples")) } - + if (hyperParamSearch == "random") { - suppressWarnings(withr::with_seed(randomSampleSeed, {param <- param[sample(length(param), randomSample)]})) + suppressWarnings(withr::with_seed(randomSampleSeed, + {param <- param[sample(length(param), + randomSample)]})) } - attr(param, 'settings')$modelType <- "ResNet" + attr(param, "settings")$modelType <- "ResNet" results <- list( fitFunction = "fitEstimator", param = param, diff --git a/R/TrainingCache-class.R b/R/TrainingCache-class.R index be626d5..fe7dfc7 100644 --- a/R/TrainingCache-class.R +++ b/R/TrainingCache-class.R @@ -2,9 +2,8 @@ #' @description #' Parameter caching for training persistence and continuity #' @export -TrainingCache <- R6::R6Class( +trainingCache <- R6::R6Class( "TrainingCache", - private = list( .paramPersistence = list( gridSearchPredictions = NULL, @@ -12,30 +11,27 @@ TrainingCache <- R6::R6Class( ), .paramContinuity = list(), 
.saveDir = NULL, - writeToFile = function() { saveRDS(private$.paramPersistence, file.path(private$.saveDir)) }, - readFromFile = function() { - private$.paramPersistence <- readRDS(file.path(private$.saveDir)) + private$.paramPersistence <- readRDS(file.path(private$.saveDir)) } ), - public = list( #' @description #' Creates a new training cache #' @param inDir Path to the analysis directory initialize = function(inDir) { private$.saveDir <- file.path(inDir, "paramPersistence.rds") - + if (file.exists(private$.saveDir)) { private$readFromFile() } else { private$writeToFile() } }, - + #' @description #' Checks whether the parameter grid in the model settings is identical to #' the cached parameters. @@ -44,7 +40,7 @@ TrainingCache <- R6::R6Class( isParamGridIdentical = function(inModelParams) { return(identical(inModelParams, private$.paramPersistence$modelParams)) }, - + #' @description #' Saves the grid search results to the training cache #' @param inGridSearchPredictions Grid search predictions @@ -53,7 +49,7 @@ TrainingCache <- R6::R6Class( inGridSearchPredictions private$writeToFile() }, - + #' @description #' Saves the parameter grid to the training cache #' @param inModelParams Parameter grid from the model settings @@ -61,20 +57,21 @@ TrainingCache <- R6::R6Class( private$.paramPersistence$modelParams <- inModelParams private$writeToFile() }, - + #' @description #' Gets the grid search results from the training cache #' @returns Grid search results from the training cache getGridSearchPredictions = function() { return(private$.paramPersistence$gridSearchPredictions) }, - + #' @description #' Check if cache is full #' @returns Boolean isFull = function() { - return(all(unlist(lapply(private$.paramPersistence$gridSearchPredictions, function(x) !is.null(x$gridPerformance))))) - }, + return(all(unlist(lapply(private$.paramPersistence$gridSearchPredictions, + function(x) !is.null(x$gridPerformance))))) + }, #' @description #' Gets the last index from the cached grid search @@ -88,11 +85,11 @@ TrainingCache <- R6::R6Class( return(1) } else { return(which(sapply(private$.paramPersistence$gridSearchPredictions, - is.null))[1]) + is.null))[1]) } } }, - + #' @description #' Remove the training cache from the analysis path dropCache = function() { diff --git a/R/Transformer.R b/R/Transformer.R index 8b0d0c4..cd4a968 100644 --- a/R/Transformer.R +++ b/R/Transformer.R @@ -24,13 +24,13 @@ #' @param estimatorSettings created with `setEstimator` #' #' @export -setDefaultTransformer <- function(estimatorSettings=setEstimator( - learningRate = 'auto', - weightDecay = 1e-4, - batchSize=512, - epochs=10, - seed=NULL, - device='cpu') +setDefaultTransformer <- function(estimatorSettings = + setEstimator(learningRate = "auto", + weightDecay = 1e-4, + batchSize = 512, + epochs = 10, + seed = NULL, + device = "cpu") ) { transformerSettings <- setTransformer(numBlocks = 3, dimToken = 192, @@ -40,10 +40,10 @@ setDefaultTransformer <- function(estimatorSettings=setEstimator( ffnDropout = 0.1, resDropout = 0.0, dimHidden = 256, - estimatorSettings=estimatorSettings, - hyperParamSearch = 'random', + estimatorSettings = estimatorSettings, + hyperParamSearch = "random", randomSample = 1) - attr(transformerSettings, 'settings')$name <- 'defaultTransformer' + attr(transformerSettings, "settings")$name <- "defaultTransformer" return(transformerSettings) } @@ -54,75 +54,81 @@ setDefaultTransformer <- function(estimatorSettings=setEstimator( #' #' @param numBlocks number of transformer blocks #' @param dimToken dimension of each token (embedding size) -#' @param dimOut dimension of output, usually 1 for binary problems +#' @param dimOut dimension of output, usually 1 for binary +#' problems #' @param numHeads number of attention heads #' @param attDropout dropout to use on attentions #' @param ffnDropout dropout to use in feedforward block #' @param resDropout dropout to use in residual connections #' @param dimHidden dimension of the feedforward block -#' @param dimHiddenRatio dimension of the feedforward block as a ratio of dimToken (embedding size) +#' @param dimHiddenRatio dimension of the feedforward block as a ratio +#' of dimToken (embedding size) #' @param estimatorSettings created with `setEstimator` -#' @param hyperParamSearch what kind of hyperparameter search to do, default 'random' -#' @param randomSample How many samples to use in hyperparameter search if random -#' @param randomSampleSeed Random seed to sample hyperparameter combinations +#' @param hyperParamSearch what kind of hyperparameter search to do, +#' default 'random' +#' @param randomSample How many samples to use in hyperparameter +#' search if random +#' @param randomSampleSeed Random seed to sample hyperparameter +#' combinations #' #' @export -setTransformer <- function(numBlocks = 3, - dimToken = 96, +setTransformer <- function(numBlocks = 3, + dimToken = 96, dimOut = 1, - numHeads = 8, - attDropout = 0.25, + numHeads = 8, + attDropout = 0.25, ffnDropout = 0.25, - resDropout = 0, - dimHidden = 512, + resDropout = 0, + dimHidden = 512, dimHiddenRatio = NULL, - estimatorSettings=setEstimator(weightDecay = 1e-6, - batchSize=1024, - epochs=10, - seed=NULL), + estimatorSettings = setEstimator(weightDecay = 1e-6, + batchSize = 1024, + epochs = 10, + seed = NULL), hyperParamSearch = "random", - randomSample = 1, + randomSample = 1, randomSampleSeed = NULL) { - + checkIsClass(numBlocks, c("integer", "numeric")) checkHigherEqual(numBlocks, 1) - + checkIsClass(dimToken, c("integer", "numeric")) checkHigherEqual(dimToken, 1) - + checkIsClass(dimOut, c("integer", "numeric")) checkHigherEqual(dimOut, 1) - + checkIsClass(numHeads, c("integer", "numeric")) checkHigherEqual(numHeads, 1) checkIsClass(attDropout, c("numeric")) checkHigherEqual(attDropout, 0) - + checkIsClass(ffnDropout, c("numeric")) checkHigherEqual(ffnDropout, 0) - + checkIsClass(resDropout, c("numeric")) checkHigherEqual(resDropout, 0) - + checkIsClass(dimHidden, c("integer", "numeric", "NULL")) if (!is.null(dimHidden)) { checkHigherEqual(dimHidden, 1) } - + checkIsClass(dimHiddenRatio, c("numeric", "NULL")) if (!is.null(dimHiddenRatio)) { checkHigher(dimHiddenRatio, 0) } - + checkIsClass(hyperParamSearch, "character") - + checkIsClass(randomSample, c("numeric", "integer")) checkHigherEqual(randomSample, 1) - + checkIsClass(randomSampleSeed, c("numeric", "integer", "NULL")) - - if (any(with(expand.grid(dimToken = dimToken, numHeads = numHeads), dimToken %% numHeads != 0))) { + + if (any(with(expand.grid(dimToken = dimToken, numHeads = numHeads), + dimToken %% numHeads != 0))) { stop(paste( "dimToken needs to be divisible by numHeads.
dimToken =", dimToken, "is not divisible by numHeads =", numHeads @@ -157,21 +163,24 @@ setTransformer <- function(numBlocks = 3, if (!is.null(dimHiddenRatio)) { param <- lapply(param, function(x) { - x$dimHidden <- round(x$dimToken*x$dimHidden, digits = 0) + x$dimHidden <- round(x$dimToken * x$dimHidden, digits = 0) return(x) }) } - - if (hyperParamSearch == "random" && randomSample>length(param)) { - stop(paste("\n Chosen amount of randomSamples is higher than the amount of possible hyperparameter combinations.", - "\n randomSample:", randomSample,"\n Possible hyperparameter combinations:", length(param), - "\n Please lower the amount of randomSample")) + + if (hyperParamSearch == "random" && randomSample > length(param)) { + stop(paste("\n Chosen amount of randomSamples is higher than the amount of + possible hyperparameter combinations.", "\n randomSample:", + randomSample, "\n Possible hyperparameter combinations:", + length(param), "\n Please lower the amount of randomSample")) } if (hyperParamSearch == "random") { - suppressWarnings(withr::with_seed(randomSampleSeed, {param <- param[sample(length(param), randomSample)]})) + suppressWarnings(withr::with_seed(randomSampleSeed, + {param <- param[sample(length(param), + randomSample)]})) } - attr(param, 'settings')$modelType <- "Transformer" + attr(param, "settings")$modelType <- "Transformer" results <- list( fitFunction = "fitEstimator", param = param, @@ -186,4 +195,4 @@ setTransformer <- function(numBlocks = 3, class(results) <- "modelSettings" return(results) -} \ No newline at end of file +} diff --git a/inst/python/Dataset.py b/inst/python/Dataset.py index 98d9ed3..b488520 100644 --- a/inst/python/Dataset.py +++ b/inst/python/Dataset.py @@ -8,29 +8,32 @@ class Data(Dataset): - def __init__(self, - data, - labels=None, - numerical_features=None): + def __init__(self, data, labels=None, numerical_features=None): """ data: path to a covariates dataframe either arrow dataset or sqlite object labels: a list of either 0 or 1, 1 if the patient got the outcome numerical_features: list of indices where the numerical features are """ start = time.time() - if pathlib.Path(data).suffix == '.sqlite': + if pathlib.Path(data).suffix == ".sqlite": data = urllib.parse.quote(data) - data = pl.read_database("SELECT * from covariates", - connection=f"sqlite://{data}").lazy() + data = pl.read_database( + "SELECT * from covariates", connection=f"sqlite://{data}" + ).lazy() else: - data = pl.scan_ipc(pathlib.Path(data).joinpath('covariates/*.arrow')) - observations = data.select(pl.col('rowId').max()).collect()[0, 0] + data = pl.scan_ipc(pathlib.Path(data).joinpath("covariates/*.arrow")) + observations = data.select(pl.col("rowId").max()).collect()[0, 0] # detect features are numeric if numerical_features is None: - self.numerical_features = data.groupby(by='columnId') \ - .n_unique().filter(pl.col('covariateValue') > 1).select('columnId').collect()['columnId'] + self.numerical_features = ( + data.groupby(by="columnId") + .n_unique() + .filter(pl.col("covariateValue") > 1) + .select("columnId") + .collect()["columnId"] + ) else: - self.numerical_features = pl.Series('num', numerical_features) + self.numerical_features = pl.Series("num", numerical_features) if labels: self.target = torch.as_tensor(labels) @@ -42,42 +45,61 @@ def __init__(self, # create newColumnId from 1 (or zero?) 
until # catColumns # select rowId and newColumnId # rename newColumnId to columnId and sort by it - data_cat = data.filter(~pl.col('columnId') - .is_in(self.numerical_features)) \ - .sort(by='columnId').with_row_count('newColumnId'). \ - with_columns(pl.col('newColumnId').first().over('columnId') - .rank(method="dense")) \ - .select(pl.col('rowId'), pl.col('newColumnId').alias('columnId')).sort('rowId') \ - .with_columns(pl.col('rowId') - 1).collect() + data_cat = ( + data.filter(~pl.col("columnId").is_in(self.numerical_features)) + .select(pl.col("rowId"), pl.col("columnId")) + .sort(["rowId", "columnId"]) + .with_columns(pl.col("rowId") - 1) + .collect() + ) cat_tensor = torch.as_tensor(data_cat.to_numpy()) - tensor_list = torch.split(cat_tensor[:, 1], torch.unique_consecutive(cat_tensor[:, 0], return_counts=True)[1]. - tolist()) + tensor_list = torch.split( + cat_tensor[:, 1], + torch.unique_consecutive(cat_tensor[:, 0], return_counts=True)[1].tolist(), + ) # because of subjects without cat features, I need to create a list with all zeroes and then insert # my tensorList. That way I can still index the dataset correctly. total_list = [torch.as_tensor((0,))] * observations - idx = data_cat['rowId'].unique().to_list() + idx = data_cat["rowId"].unique().to_list() for i, i2 in enumerate(idx): total_list[i2] = tensor_list[i] self.cat = torch.nn.utils.rnn.pad_sequence(total_list, batch_first=True) - self.cat_features = data_cat['columnId'].unique() + self.cat_features = data_cat["columnId"].unique() # numerical data, # N x C, dense matrix with values for N patients/visits for C numerical features if pl.count(self.numerical_features) == 0: self.num = None else: - numerical_data = data.filter(pl.col('columnId').is_in(self.numerical_features)).sort(by='columnId'). \ - with_row_count('newColumnId').with_columns(pl.col('newColumnId').first().over('columnId'). 
- rank(method="dense") - 1, pl.col('rowId') - 1) \ - .select(pl.col('rowId'), pl.col('newColumnId').alias('columnId'), pl.col('covariateValue')).collect() - indices = torch.as_tensor(numerical_data.select(['rowId', 'columnId']).to_numpy(), dtype=torch.long) - values = torch.as_tensor(numerical_data.select('covariateValue').to_numpy(), dtype=torch.float) - self.num = torch.sparse_coo_tensor(indices=indices.T, - values=values.squeeze(), - size=(observations, pl.count(self.numerical_features))).to_dense() + map_numerical = dict(zip(self.numerical_features.sort().to_list(), + list(range(len(self.numerical_features))))) + + numerical_data = ( + data.filter(pl.col("columnId").is_in(self.numerical_features)) + .with_columns(pl.col("columnId").replace(map_numerical), + pl.col("rowId") - 1) + .select( + pl.col("rowId"), + pl.col("columnId"), + pl.col("covariateValue"), + ) + .collect() + ) + indices = torch.as_tensor( + numerical_data.select(["rowId", "columnId"]).to_numpy(), + dtype=torch.long, + ) + values = torch.as_tensor( + numerical_data.select("covariateValue").to_numpy(), dtype=torch.float + ) + self.num = torch.sparse_coo_tensor( + indices=indices.T, + values=values.squeeze(), + size=(observations, pl.count(self.numerical_features)), + ).to_dense() delta = time.time() - start - print(f'Processed data in {delta:.2f} seconds') + print(f"Processed data in {delta:.2f} seconds") def get_numerical_features(self): return self.numerical_features @@ -90,13 +112,15 @@ def __len__(self): def __getitem__(self, item): if self.num is not None: - batch = {"cat": self.cat[item, :], - "num": self.num[item, :]} + batch = {"cat": self.cat[item, :], "num": self.num[item, :]} else: - batch = {"cat": self.cat[item, :].squeeze(), - "num": None} + batch = {"cat": self.cat[item, :].squeeze(), "num": None} if batch["cat"].dim() == 1 and not isinstance(item, list): batch["cat"] = batch["cat"].unsqueeze(0) - if batch["num"] is not None and batch["num"].dim() == 1 and not isinstance(item, list): + if ( + batch["num"] is not None + and batch["num"].dim() == 1 + and not isinstance(item, list) + ): batch["num"] = batch["num"].unsqueeze(0) return [batch, self.target[item].squeeze()] diff --git a/inst/python/Estimator.py b/inst/python/Estimator.py index f8a182f..983bde6 100644 --- a/inst/python/Estimator.py +++ b/inst/python/Estimator.py @@ -5,7 +5,6 @@ from torch.utils.data import DataLoader, BatchSampler, RandomSampler, SequentialSampler import torch.nn.functional as F from tqdm import tqdm -from sklearn.metrics import roc_auc_score class Estimator: @@ -13,10 +12,7 @@ class Estimator: A class that wraps around pytorch models. 
""" - def __init__(self, - model, - model_parameters, - estimator_settings): + def __init__(self, model, model_parameters, estimator_settings): self.seed = estimator_settings["seed"] if callable(estimator_settings["device"]): self.device = estimator_settings["device"]() @@ -37,31 +33,51 @@ def __init__(self, self.previous_epochs = int(estimator_settings.get("previous_epochs", 0)) self.model.to(device=self.device) - self.optimizer = estimator_settings["optimizer"](params=self.model.parameters(), - lr=self.learning_rate, - weight_decay=self.weight_decay) + self.optimizer = estimator_settings["optimizer"]( + params=self.model.parameters(), + lr=self.learning_rate, + weight_decay=self.weight_decay, + ) self.criterion = estimator_settings["criterion"]() - if "metric" in estimator_settings.keys() and estimator_settings["metric"] is not None: + if ( + "metric" in estimator_settings.keys() + and estimator_settings["metric"] is not None + ): self.metric = estimator_settings["metric"] - if type(self.metric) == str: + if isinstance(self.metric, str): if self.metric == "auc": - self.metric = {"name": "auc", - "mode": "max"} + self.metric = {"name": "auc", "mode": "max"} elif self.metric == "loss": - self.metric = {"name": "loss", - "mode": "min"} - if "scheduler" in estimator_settings.keys() and estimator_settings["scheduler"] is not None: + self.metric = {"name": "loss", "mode": "min"} + if ( + "scheduler" in estimator_settings.keys() + and estimator_settings["scheduler"] is not None + ): estimator_settings["scheduler"]["params"]["mode"] = self.metric["mode"] - if "early_stopping" in estimator_settings.keys() and estimator_settings["early_stopping"] is not None: - estimator_settings["early_stopping"]["params"]["mode"] = self.metric["mode"] - - if "scheduler" in estimator_settings.keys() and estimator_settings["scheduler"] is not None: - self.scheduler = estimator_settings["scheduler"]["fun"](self.optimizer, - **estimator_settings["scheduler"]["params"]) - - if "early_stopping" in estimator_settings.keys() and estimator_settings["early_stopping"] is not None: - self.early_stopper = EarlyStopping(**estimator_settings["early_stopping"]["params"]) + if ( + "early_stopping" in estimator_settings.keys() + and estimator_settings["early_stopping"] is not None + ): + estimator_settings["early_stopping"]["params"]["mode"] = self.metric[ + "mode" + ] + + if ( + "scheduler" in estimator_settings.keys() + and estimator_settings["scheduler"] is not None + ): + self.scheduler = estimator_settings["scheduler"]["fun"]( + self.optimizer, **estimator_settings["scheduler"]["params"] + ) + + if ( + "early_stopping" in estimator_settings.keys() + and estimator_settings["early_stopping"] is not None + ): + self.early_stopper = EarlyStopping( + **estimator_settings["early_stopping"]["params"] + ) else: self.early_stopper = None @@ -70,21 +86,24 @@ def __init__(self, self.learn_rate_schedule = None def fit(self, dataset, test_dataset): - - train_dataloader = DataLoader(dataset=dataset, - batch_size=None, - sampler=BatchSampler( - sampler=RandomSampler(dataset), - batch_size=self.batch_size, - drop_last=True - )) - test_dataloader = DataLoader(dataset=test_dataset, - batch_size=None, - sampler=BatchSampler( - sampler=SequentialSampler(test_dataset), - batch_size=self.batch_size, - drop_last=False - )) + train_dataloader = DataLoader( + dataset=dataset, + batch_size=None, + sampler=BatchSampler( + sampler=RandomSampler(dataset), + batch_size=self.batch_size, + drop_last=True, + ), + ) + test_dataloader = DataLoader( + 
dataset=test_dataset, + batch_size=None, + sampler=BatchSampler( + sampler=SequentialSampler(test_dataset), + batch_size=self.batch_size, + drop_last=False, + ), + ) trained_epochs = dict() times = list() @@ -106,19 +125,25 @@ def fit(self, dataset, test_dataset): times.append(round(delta_time, 3)) if self.early_stopper: - self.early_stopper(scores['metric']) + self.early_stopper(scores["metric"]) if self.early_stopper.improved: model_state_dict[epoch] = self.model.state_dict() trained_epochs[epoch] = current_epoch if self.early_stopper.early_stop: print("Early stopping, validation metric stopped improving") - print(f'Average time per epoch was: {torch.mean(torch.as_tensor(times)).item():.2f} seconds') - self.finish_fit(all_scores, model_state_dict, trained_epochs, learning_rates) + print( + f"Average time per epoch was: {torch.mean(torch.as_tensor(times)).item():.2f} seconds" + ) + self.finish_fit( + all_scores, model_state_dict, trained_epochs, learning_rates + ) return else: model_state_dict[epoch] = self.model.state_dict() trained_epochs[epoch] = current_epoch - print(f'Average time per epoch was: {torch.mean(torch.as_tensor(times)).item()} seconds') + print( + f"Average time per epoch was: {torch.mean(torch.as_tensor(times)).item()} seconds" + ) self.finish_fit(all_scores, model_state_dict, trained_epochs, learning_rates) return @@ -155,8 +180,7 @@ def score(self, dataloader): mean_loss = loss.mean().item() predictions = torch.concat(predictions) targets = torch.concat(targets) - auc = roc_auc_score(targets.cpu(), predictions.cpu()) - # auc = compute_auc(predictions, targets) + auc = compute_auc(targets.cpu(), predictions.cpu()) scores = dict() if self.metric: if self.metric["name"] == "auc": @@ -172,48 +196,68 @@ def score(self, dataloader): def finish_fit(self, scores, model_state_dict, epoch, learning_rates): if self.metric["mode"] == "max": - best_epoch_index = torch.argmax(torch.as_tensor([x["metric"] for x in scores])).item() + best_epoch_index = torch.argmax( + torch.as_tensor([x["metric"] for x in scores]) + ).item() elif self.metric["mode"] == "min": - best_epoch_index = torch.argmin(torch.as_tensor([x["metric"] for x in scores])).item() + best_epoch_index = torch.argmin( + torch.as_tensor([x["metric"] for x in scores]) + ).item() best_model_state_dict = model_state_dict[best_epoch_index] self.model.load_state_dict(best_model_state_dict) self.best_epoch = epoch[best_epoch_index] - self.best_score = {"loss": scores[best_epoch_index]["loss"], - "auc": scores[best_epoch_index]["auc"]} - self.learn_rate_schedule = learning_rates[:(best_epoch_index+1)] + self.best_score = { + "loss": scores[best_epoch_index]["loss"], + "auc": scores[best_epoch_index]["auc"], + } + self.learn_rate_schedule = learning_rates[: (best_epoch_index + 1)] print(f"Loaded best model (based on AUC) from epoch {self.best_epoch}") print(f"ValLoss: {self.best_score['loss']}") print(f"valAUC: {self.best_score['auc']}") - if self.metric and self.metric["name"] != "auc" and self.metric["name"] != "loss": + if ( + self.metric + and self.metric["name"] != "auc" + and self.metric["name"] != "loss" + ): self.best_score[self.metric["name"]] = scores[best_epoch_index]["metric"] print(f"{self.metric['name']}: {self.best_score[self.metric['name']]}") return def print_progress(self, scores, training_loss, delta_time, current_epoch): - if self.metric and self.metric["name"] != "auc" and self.metric["name"] != "loss": - print(f"Epochs: {current_epoch} | Val {self.metric['name']}: {scores['metric']:.3f} " - f"| Val AUC: 
{scores['auc']:.3f} | Val Loss: {scores['loss']:.3f} " - f"| Train Loss: {training_loss:.3f} | Time: {delta_time:.3f} seconds " - f"| LR: {self.optimizer.param_groups[0]['lr']}") + if ( + self.metric + and self.metric["name"] != "auc" + and self.metric["name"] != "loss" + ): + print( + f"Epochs: {current_epoch} | Val {self.metric['name']}: {scores['metric']:.3f} " + f"| Val AUC: {scores['auc']:.3f} | Val Loss: {scores['loss']:.3f} " + f"| Train Loss: {training_loss:.3f} | Time: {delta_time:.3f} seconds " + f"| LR: {self.optimizer.param_groups[0]['lr']}" + ) else: - print(f"Epochs: {current_epoch} " - f"| Val AUC: {scores['auc']:.3f} " - f"| Val Loss: {scores['loss']:.3f} " - f"| Train Loss: {training_loss:.3f} " - f"| Time: {delta_time:.3f} seconds " - f"| LR: {self.optimizer.param_groups[0]['lr']}") + print( + f"Epochs: {current_epoch} " + f"| Val AUC: {scores['auc']:.3f} " + f"| Val Loss: {scores['loss']:.3f} " + f"| Train Loss: {training_loss:.3f} " + f"| Time: {delta_time:.3f} seconds " + f"| LR: {self.optimizer.param_groups[0]['lr']}" + ) return def fit_whole_training_set(self, dataset, learning_rates=None): - dataloader = DataLoader(dataset=dataset, - batch_size=None, - sampler=BatchSampler( - sampler=RandomSampler(dataset), - batch_size=self.batch_size, - drop_last=True - )) + dataloader = DataLoader( + dataset=dataset, + batch_size=None, + sampler=BatchSampler( + sampler=RandomSampler(dataset), + batch_size=self.batch_size, + drop_last=True, + ), + ) if isinstance(learning_rates, list): self.best_epoch = len(learning_rates) elif ~isinstance(learning_rates, list): @@ -223,7 +267,7 @@ def fit_whole_training_set(self, dataset, learning_rates=None): self.best_epoch = self.epochs for epoch in range(self.best_epoch): - self.optimizer.param_groups[0]['lr'] = learning_rates[epoch] + self.optimizer.param_groups[0]["lr"] = learning_rates[epoch] self.fit_epoch(dataloader) return @@ -233,19 +277,21 @@ def save(self, path, name): model_state_dict=self.model.state_dict(), model_parameters=self.model_parameters, estimator_settings=self.estimator_settings, - epoch=self.epochs) - torch.save(out, - f=save_path) + epoch=self.epochs, + ) + torch.save(out, f=save_path) return save_path def predict_proba(self, dataset): - dataloader = DataLoader(dataset=dataset, - batch_size=None, - sampler=BatchSampler( - sampler=SequentialSampler(dataset), - batch_size=self.batch_size, - drop_last=False - )) + dataloader = DataLoader( + dataset=dataset, + batch_size=None, + sampler=BatchSampler( + sampler=SequentialSampler(dataset), + batch_size=self.batch_size, + drop_last=False, + ), + ) with torch.no_grad(): predictions = list() self.model.eval() @@ -263,15 +309,11 @@ def predict(self, dataset, threshold=None): # use outcome rate threshold = dataset.target.sum().item() / len(dataset) predicted_class = predictions > threshold + return predicted_class class EarlyStopping: - - def __init__(self, - patience=3, - delta=0, - verbose=True, - mode='max'): + def __init__(self, patience=3, delta=0, verbose=True, mode="max"): self.patience = patience self.counter = 0 self.verbose = verbose @@ -282,9 +324,8 @@ def __init__(self, self.previous_score = 0 self.mode = mode - def __call__(self, - metric): - if self.mode == 'max': + def __call__(self, metric): + if self.mode == "max": score = metric else: score = -1 * metric @@ -295,8 +336,9 @@ def __call__(self, self.counter += 1 self.improved = False if self.verbose: - print(f"Early stopping counter: {self.counter}" - f" out of {self.patience}") + print( + f"Early stopping 
counter: {self.counter}" f" out of {self.patience}" + ) if self.counter >= self.patience: self.early_stop = True else: @@ -306,12 +348,12 @@ def __call__(self, self.previous_score = score -def batch_to_device(batch, device='cpu'): +def batch_to_device(batch, device="cpu"): if torch.is_tensor(batch): batch = batch.to(device=device) else: for ix, b in enumerate(batch): - if type(b) is str: + if isinstance(b, str): key = b b = batch[b] else: @@ -330,26 +372,26 @@ def batch_to_device(batch, device='cpu'): return batch -def compute_auc(input, target, n_threshold=1000): - threshold = torch.linspace(0, 1.0, n_threshold).to(device=input.device) - pred_label = input >= threshold[:, None, None] - input_target = pred_label * target +def compute_auc(y_true, y_pred): + """ + Computes the AUC score for binary classification predictions with a fast algorithm. + Args: + y_true (torch.Tensor): True binary labels. + y_pred (torch.Tensor): Predicted scores. + Returns: + float: Computed AUC score. + """ + # Ensure inputs are sorted by predicted score + _, sorted_indices = torch.sort(y_pred, descending=True) + y_true_sorted = y_true[sorted_indices] - cum_tp = F.pad(input_target.sum(dim=-1).rot90(1, [1, 0]), (1, 0), value=0.0) - cum_fp = F.pad( - (pred_label.sum(dim=-1) - input_target.sum(dim=-1)).rot90(1, [1, 0]), - (1, 0), - value=0.0, - ) + # Get the number of positive and negative examples + n_pos = y_true_sorted.sum() + n_neg = (1 - y_true_sorted).sum() - if len(cum_tp.shape) > 1: - factor = cum_tp[:, -1] * cum_fp[:, -1] - else: - factor = cum_tp[-1] * cum_fp[-1] - # Set AUROC to 0.5 when the target contains all ones or all zeros. - auroc = torch.where( - factor == 0, - 0.5, - torch.trapz(cum_tp, cum_fp).double() / factor, - ) - return auroc.item() + # for every negative label, count preceding positive labels in sorted labels + num_crossings = torch.cumsum(y_true_sorted, 0)[y_true_sorted == 0].sum() + + # Compute AUC + auc = num_crossings / (n_pos * n_neg) + return auc diff --git a/inst/python/LrFinder.py b/inst/python/LrFinder.py index 9d5bd0c..e7141cd 100644 --- a/inst/python/LrFinder.py +++ b/inst/python/LrFinder.py @@ -2,17 +2,13 @@ import torch from torch.optim.lr_scheduler import _LRScheduler -from torch.utils.data import DataLoader, BatchSampler, RandomSampler from tqdm import tqdm from Estimator import batch_to_device class ExponentialSchedulerPerBatch(_LRScheduler): - - def __init__(self, optimizer, - end_lr, - num_iter): + def __init__(self, optimizer, end_lr, num_iter): self.end_lr = end_lr self.num_iter = num_iter super(ExponentialSchedulerPerBatch, self).__init__(optimizer, last_epoch=-1) @@ -23,14 +19,9 @@ def get_lr(self): class LrFinder: - - def __init__(self, - model, - model_parameters, - estimator_settings, - lr_settings): + def __init__(self, model, model_parameters, estimator_settings, lr_settings): if lr_settings is None: - lr_settings = {} + lr_settings = {} min_lr = lr_settings.get("min_lr", 1e-7) max_lr = lr_settings.get("max_lr", 1) num_lr = lr_settings.get("num_lr", 100) @@ -50,13 +41,16 @@ def __init__(self, self.smooth = smooth self.divergence_threshold = divergence_threshold - self.optimizer = estimator_settings['optimizer'](params=self.model.parameters(), - lr=self.min_lr) + self.optimizer = estimator_settings["optimizer"]( + params=self.model.parameters(), lr=self.min_lr + ) - self.scheduler = ExponentialSchedulerPerBatch(self.optimizer, self.max_lr, self.num_lr) + self.scheduler = ExponentialSchedulerPerBatch( + self.optimizer, self.max_lr, self.num_lr + ) self.criterion = 
estimator_settings["criterion"]() - self.batch_size = int(estimator_settings['batch_size']) + self.batch_size = int(estimator_settings["batch_size"]) self.losses = None self.loss_index = None @@ -74,7 +68,9 @@ def get_lr(self, dataset): out = self.model(batch[0]) loss = self.criterion(out, batch[1]) if self.smooth is not None and i != 0: - losses[i] = self.smooth * loss.item() + (1 - self.smooth) * losses[i - 1] + losses[i] = ( + self.smooth * loss.item() + (1 - self.smooth) * losses[i - 1] + ) else: losses[i] = loss.item() lrs[i] = self.optimizer.param_groups[0]["lr"] @@ -90,13 +86,15 @@ def get_lr(self, dataset): best_loss = losses[i] if losses[i] > (self.divergence_threshold * best_loss): - print(f"Loss diverged - stopped early - iteration {i} out of {self.num_lr}") + print( + f"Loss diverged - stopped early - iteration {i} out of {self.num_lr}" + ) break # find LR where gradient is highest but before global minimum is reached # I added -5 to make sure it is not still in the minimum global_minimum = torch.argmin(losses) - gradient = torch.diff(losses[:(global_minimum - 5)+1]) + gradient = torch.diff(losses[: (global_minimum - 5) + 1]) smallest_gradient = torch.argmin(gradient) suggested_lr = lrs[smallest_gradient] diff --git a/inst/python/MLP.py b/inst/python/MLP.py index 3b8b03b..cd049a6 100644 --- a/inst/python/MLP.py +++ b/inst/python/MLP.py @@ -4,17 +4,18 @@ class MLP(nn.Module): - - def __init__(self, - cat_features: int, - num_features: int, - size_embedding: int, - size_hidden: int, - num_layers: int, - activation=nn.ReLU, - normalization=nn.BatchNorm1d, - dropout=None, - d_out: int = 1): + def __init__( + self, + cat_features: int, + num_features: int, + size_embedding: int, + size_hidden: int, + num_layers: int, + activation=nn.ReLU, + normalization=nn.BatchNorm1d, + dropout=None, + d_out: int = 1, + ): super(MLP, self).__init__() self.name = "MLP" cat_features = int(cat_features) @@ -23,21 +24,25 @@ def __init__(self, size_hidden = int(size_hidden) num_layers = int(num_layers) d_out = int(d_out) - - self.embedding = nn.EmbeddingBag(cat_features + 1, - size_embedding, - padding_idx=0) + + self.embedding = nn.EmbeddingBag( + cat_features + 1, size_embedding, padding_idx=0 + ) if num_features != 0 and num_features is not None: self.num_embedding = NumericalEmbedding(num_features, size_embedding) self.first_layer = nn.Linear(size_embedding, size_hidden) - self.layers = nn.ModuleList(MLPLayer(size_hidden=size_hidden, - normalization=normalization, - activation=activation, - dropout=dropout) - for _ in range(num_layers)) + self.layers = nn.ModuleList( + MLPLayer( + size_hidden=size_hidden, + normalization=normalization, + activation=activation, + dropout=dropout, + ) + for _ in range(num_layers) + ) self.last_norm = normalization(size_hidden) self.head = nn.Linear(size_hidden, d_out) @@ -62,12 +67,14 @@ def forward(self, input): class MLPLayer(nn.Module): - def __init__(self, - size_hidden=64, - normalization=nn.BatchNorm1d, - activation=nn.ReLU, - dropout=0.0, - bias=True): + def __init__( + self, + size_hidden=64, + normalization=nn.BatchNorm1d, + activation=nn.ReLU, + dropout=0.0, + bias=True, + ): super(MLPLayer, self).__init__() self.norm = normalization(size_hidden) self.activation = activation() diff --git a/inst/python/ResNet.py b/inst/python/ResNet.py index cef4b49..9df35d8 100644 --- a/inst/python/ResNet.py +++ b/inst/python/ResNet.py @@ -5,22 +5,23 @@ class ResNet(nn.Module): - - def __init__(self, - cat_features: int, - num_features: int = 0, - size_embedding: int = 256, 
- size_hidden: int = 256, - num_layers: int = 2, - hidden_factor: int = 1, - activation=nn.ReLU, - normalization=nn.BatchNorm1d, - hidden_dropout=0, - residual_dropout=0, - dim_out: int = 1, - concat_num=True): + def __init__( + self, + cat_features: int, + num_features: int = 0, + size_embedding: int = 256, + size_hidden: int = 256, + num_layers: int = 2, + hidden_factor: int = 1, + activation=nn.ReLU, + normalization=nn.BatchNorm1d, + hidden_dropout=0, + residual_dropout=0, + dim_out: int = 1, + concat_num=True, + ): super(ResNet, self).__init__() - self.name = 'ResNet' + self.name = "ResNet" cat_features = int(cat_features) num_features = int(num_features) size_embedding = int(size_embedding) @@ -28,12 +29,9 @@ def __init__(self, num_layers = int(num_layers) hidden_factor = int(hidden_factor) dim_out = int(dim_out) - - + self.embedding = nn.EmbeddingBag( - num_embeddings=cat_features + 1, - embedding_dim=size_embedding, - padding_idx=0 + num_embeddings=cat_features + 1, embedding_dim=size_embedding, padding_idx=0 ) if num_features != 0 and not concat_num: self.num_embedding = NumericalEmbedding(num_features, size_embedding) @@ -45,9 +43,17 @@ def __init__(self, res_hidden = size_hidden * hidden_factor - self.layers = nn.ModuleList(ResLayer(size_hidden, res_hidden, normalization, - activation, hidden_dropout, residual_dropout) - for _ in range(num_layers)) + self.layers = nn.ModuleList( + ResLayer( + size_hidden, + res_hidden, + normalization, + activation, + hidden_dropout, + residual_dropout, + ) + for _ in range(num_layers) + ) self.last_norm = normalization(size_hidden) @@ -58,7 +64,11 @@ def __init__(self, def forward(self, x): x_cat = x["cat"] x_cat = self.embedding(x_cat) - if "num" in x.keys() and x["num"] is not None and self.num_embedding is not None: + if ( + "num" in x.keys() + and x["num"] is not None + and self.num_embedding is not None + ): x_num = x["num"] # take the average af numerical and categorical embeddings x = (x_cat + self.num_embedding(x_num).mean(dim=1)) / 2 @@ -79,14 +89,15 @@ def forward(self, x): class ResLayer(nn.Module): - - def __init__(self, - size_hidden, - res_hidden, - normalization, - activation, - hidden_dropout=None, - residual_dropout=None): + def __init__( + self, + size_hidden, + res_hidden, + normalization, + activation, + hidden_dropout=None, + residual_dropout=None, + ): super(ResLayer, self).__init__() self.norm = normalization(size_hidden) @@ -114,10 +125,7 @@ def forward(self, input): class NumericalEmbedding(nn.Module): - def __init__(self, - num_embeddings, - embedding_dim, - bias=True): + def __init__(self, num_embeddings, embedding_dim, bias=True): super(NumericalEmbedding, self).__init__() self.weight = nn.Parameter(torch.empty(num_embeddings, embedding_dim)) if bias: @@ -134,6 +142,3 @@ def forward(self, input): if self.bias is not None: x = x + self.bias[None] return x - - - diff --git a/inst/python/Transformer.py b/inst/python/Transformer.py index 1c95b36..ddbe929 100644 --- a/inst/python/Transformer.py +++ b/inst/python/Transformer.py @@ -18,23 +18,24 @@ def forward(self, x): class Transformer(nn.Module): - - def __init__(self, - cat_features: int, - num_features: int, - num_blocks: int, - dim_token: int, - num_heads: int, - att_dropout, - ffn_dropout, - res_dropout, - dim_hidden: int, - dim_out: int = 1, - head_activation=nn.ReLU, - activation=ReGLU, - ffn_norm=nn.LayerNorm, - head_norm=nn.LayerNorm, - att_norm=nn.LayerNorm): + def __init__( + self, + cat_features: int, + num_features: int, + num_blocks: int, + dim_token: int, + 
num_heads: int, + att_dropout, + ffn_dropout, + res_dropout, + dim_hidden: int, + dim_out: int = 1, + head_activation=nn.ReLU, + activation=ReGLU, + ffn_norm=nn.LayerNorm, + head_norm=nn.LayerNorm, + att_norm=nn.LayerNorm, + ): super(Transformer, self).__init__() self.name = "Transformer" cat_features = int(cat_features) @@ -44,8 +45,10 @@ def __init__(self, num_heads = int(num_heads) dim_hidden = int(dim_hidden) dim_out = int(dim_out) - - self.categorical_embedding = nn.Embedding(cat_features + 1, dim_token, padding_idx=0) + + self.categorical_embedding = nn.Embedding( + cat_features + 1, dim_token, padding_idx=0 + ) if num_features != 0 and num_features is not None: self.numerical_embedding = NumericalEmbedding(num_features, dim_token) @@ -56,27 +59,35 @@ def __init__(self, self.layers = nn.ModuleList([]) for layer_idx in range(num_blocks): - layer = nn.ModuleDict({ - "attention": nn.MultiheadAttention(dim_token, num_heads, - dropout=att_dropout), - "ffn": FeedForwardBlock(dim_token, dim_hidden, - bias_first=True, - bias_second=True, - dropout=ffn_dropout, - activation=activation), - "attention_res_dropout": nn.Dropout(res_dropout), - "ffn_res_dropout": nn.Dropout(res_dropout), - "ffn_norm": ffn_norm(dim_token) - }) + layer = nn.ModuleDict( + { + "attention": nn.MultiheadAttention( + dim_token, num_heads, dropout=att_dropout + ), + "ffn": FeedForwardBlock( + dim_token, + dim_hidden, + bias_first=True, + bias_second=True, + dropout=ffn_dropout, + activation=activation, + ), + "attention_res_dropout": nn.Dropout(res_dropout), + "ffn_res_dropout": nn.Dropout(res_dropout), + "ffn_norm": ffn_norm(dim_token), + } + ) if layer_idx != 0: layer["attention_norm"] = att_norm(dim_token) self.layers.append(layer) - self.head = Head(dim_token, - bias=True, - activation=head_activation, - normalization=head_norm, - dim_out=dim_out) + self.head = Head( + dim_token, + bias=True, + activation=head_activation, + normalization=head_norm, + dim_out=dim_out, + ) def forward(self, x): mask = torch.where(x["cat"] == 0, True, False) @@ -84,31 +95,34 @@ def forward(self, x): if self.use_numerical: num = self.numerical_embedding(x["num"]) x = torch.cat([cat, num], dim=1) - mask = torch.cat([mask, torch.zeros([x.shape[0], - num.shape[1]], - device=mask.device, - dtype=mask.dtype)], - dim=1) + mask = torch.cat( + [ + mask, + torch.zeros( + [x.shape[0], num.shape[1]], device=mask.device, dtype=mask.dtype + ), + ], + dim=1, + ) else: x = cat x = self.class_token(x) - mask = torch.cat([mask, torch.zeros([x.shape[0], 1], - device=mask.device, - dtype=mask.dtype)], - dim=1) + mask = torch.cat( + [mask, torch.zeros([x.shape[0], 1], device=mask.device, dtype=mask.dtype)], + dim=1, + ) for i, layer in enumerate(self.layers): x_residual = self.start_residual(layer, "attention", x) - if i == len(self.layers)-1: + if i == len(self.layers) - 1: dims = x_residual.shape x_residual = layer["attention"]( x_residual[:, -1].view([dims[0], 1, dims[2]]).transpose(0, 1), x_residual.transpose(0, 1), x_residual.transpose(0, 1), - mask + mask, ) - attn_weights = x_residual[1] x_residual = x_residual[0] x = x[:, -1].view([dims[0], 1, dims[2]]) else: @@ -116,7 +130,7 @@ def forward(self, x): x_residual.transpose(0, 1), x_residual.transpose(0, 1), x_residual.transpose(0, 1), - mask + mask, )[0] x = self.end_residual(layer, "attention", x, x_residual.transpose(0, 1)) @@ -141,14 +155,15 @@ def end_residual(layer, stage, x, x_residual): class FeedForwardBlock(nn.Module): - - def __init__(self, - dim_token, - dim_hidden, - bias_first=True, - 
bias_second=True,
-                 dropout=0.0,
-                 activation=ReGLU):
+    def __init__(
+        self,
+        dim_token,
+        dim_hidden,
+        bias_first=True,
+        bias_second=True,
+        dropout=0.0,
+        activation=ReGLU,
+    ):
         super(FeedForwardBlock, self).__init__()
         self.linear0 = nn.Linear(dim_token, int(dim_hidden * 2), bias=bias_first)
         self.activation = activation()
@@ -164,13 +179,7 @@ def forward(self, x):
 
 
 class Head(nn.Module):
-
-    def __init__(self,
-                 dim_in,
-                 bias,
-                 activation,
-                 normalization,
-                 dim_out):
+    def __init__(self, dim_in, bias, activation, normalization, dim_out):
         super(Head, self).__init__()
         self.normalization = normalization(dim_in)
         self.activation = activation()
@@ -183,17 +192,16 @@ def forward(self, x):
         x = self.linear(x)
         return x
 
-class ClassToken(nn.Module):
 
-    def __init__(self,
-                 dim_token):
+class ClassToken(nn.Module):
+    def __init__(self, dim_token):
         super(ClassToken, self).__init__()
         self.weight = nn.Parameter(torch.empty(dim_token, 1))
         nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5))
 
     def expand(self, dims):
         new_dims = [1] * (len(dims) - 1)
-        return self.weight.view(new_dims + [-1]).expand(dims +[-1])
+        return self.weight.view(new_dims + [-1]).expand(dims + [-1])
 
     def forward(self, x):
         return torch.cat([x, self.expand([x.shape[0], 1])], dim=1)
diff --git a/man/DeepPatientLevelPrediction.Rd b/man/DeepPatientLevelPrediction.Rd
index c97f163..9eb3b36 100644
--- a/man/DeepPatientLevelPrediction.Rd
+++ b/man/DeepPatientLevelPrediction.Rd
@@ -5,5 +5,6 @@
 \alias{DeepPatientLevelPrediction}
 \title{DeepPatientLevelPrediction}
 \description{
-A package containing deep learning extensions for developing prediction models using data in the OMOP CDM
+A package containing deep learning extensions for developing
+prediction models using data in the OMOP CDM
 }
diff --git a/man/setDefaultResNet.Rd b/man/setDefaultResNet.Rd
index c26f354..e1d7f46 100644
--- a/man/setDefaultResNet.Rd
+++ b/man/setDefaultResNet.Rd
@@ -16,6 +16,6 @@ setDefaultResNet(
 Creates settings for a default ResNet model
 }
 \details{
-Model architecture from by https://arxiv.org/abs/2106.11959 .
+Model architecture based on https://arxiv.org/abs/2106.11959 .
 Hyperparameters chosen by experience on a few prediction problems.
 }
diff --git a/man/setEstimator.Rd b/man/setEstimator.Rd
index fcb33ab..b8424a3 100644
--- a/man/setEstimator.Rd
+++ b/man/setEstimator.Rd
@@ -28,7 +28,8 @@ setEstimator(
 
 \item{epochs}{how many epochs to train for}
 
-\item{device}{what device to train on, can be a string or a function to that evaluates
+\item{device}{what device to train on, can be a string or a function
+that evaluates
 to the device during runtime}
 
 \item{optimizer}{which optimizer to use}
@@ -37,11 +38,15 @@ to the device during runtime}
 
 \item{criterion}{loss function to use}
 
-\item{earlyStopping}{If earlyStopping should be used which stops the training of your metric is not improving}
+\item{earlyStopping}{If earlyStopping should be used, which stops the
+training if your metric is not improving}
 
-\item{metric}{either `auc` or `loss` or a custom metric to use. This is the metric used for scheduler and earlyStopping.
-Needs to be a list with function `fun`, mode either `min` or `max` and a `name`,
-`fun` needs to be a function that takes in prediction and labels and outputs a score.}
+\item{metric}{either `auc` or `loss` or a custom metric to use. This is the
+metric used for scheduler and earlyStopping.
+Needs to be a list with function `fun`, mode either `min` or `max` and a
+`name`,
+`fun` needs to be a function that takes in prediction and labels and
+outputs a score.}
 
 \item{seed}{seed to initialize weights of model with}
 }
diff --git a/man/setMultiLayerPerceptron.Rd b/man/setMultiLayerPerceptron.Rd
index f3676cd..a5f96d7 100644
--- a/man/setMultiLayerPerceptron.Rd
+++ b/man/setMultiLayerPerceptron.Rd
@@ -19,15 +19,18 @@ setMultiLayerPerceptron(
 \arguments{
 \item{numLayers}{Number of layers in network, default: 1:8}
 
-\item{sizeHidden}{Amount of neurons in each default layer, default: 2^(6:9) (64 to 512)}
+\item{sizeHidden}{Amount of neurons in each default layer,
+default: 2^(6:9) (64 to 512)}
 
-\item{dropout}{How much dropout to apply after first linear, default: seq(0, 0.3, 0.05)}
+\item{dropout}{How much dropout to apply after first linear,
+default: seq(0, 0.3, 0.05)}
 
-\item{sizeEmbedding}{Size of embedding layer, default: 2^(6:9) (64 to 512)}
+\item{sizeEmbedding}{Size of embedding layer, default: 2^(6:9) (64 to 512)}
 
 \item{estimatorSettings}{settings of Estimator created with `setEstimator`}
 
-\item{hyperParamSearch}{Which kind of hyperparameter search to use random sampling or exhaustive grid search. default: 'random'}
+\item{hyperParamSearch}{Which kind of hyperparameter search to use, random
+sampling or exhaustive grid search. default: 'random'}
 
 \item{randomSample}{How many random samples from hyperparameter space to use}
 
diff --git a/man/setResNet.Rd b/man/setResNet.Rd
index fbe3d77..4f4c245 100644
--- a/man/setResNet.Rd
+++ b/man/setResNet.Rd
@@ -21,21 +21,28 @@ setResNet(
 \arguments{
 \item{numLayers}{Number of layers in network, default: 1:16}
 
-\item{sizeHidden}{Amount of neurons in each default layer, default: 2^(6:10) (64 to 1024)}
+\item{sizeHidden}{Amount of neurons in each default layer, default:
+2^(6:10) (64 to 1024)}
 
-\item{hiddenFactor}{How much to grow the amount of neurons in each ResLayer, default: 1:4}
+\item{hiddenFactor}{How much to grow the amount of neurons in each
+ResLayer, default: 1:4}
 
-\item{residualDropout}{How much dropout to apply after last linear layer in ResLayer, default: seq(0, 0.3, 0.05)}
+\item{residualDropout}{How much dropout to apply after last linear layer
+in ResLayer, default: seq(0, 0.3, 0.05)}
 
-\item{hiddenDropout}{How much dropout to apply after first linear layer in ResLayer, default: seq(0, 0.3, 0.05)}
+\item{hiddenDropout}{How much dropout to apply after first linear layer
+in ResLayer, default: seq(0, 0.3, 0.05)}
 
-\item{sizeEmbedding}{Size of embedding layer, default: 2^(6:9) (64 to 512)}
+\item{sizeEmbedding}{Size of embedding layer, default: 2^(6:9)
+(64 to 512)}
 
 \item{estimatorSettings}{created with ```setEstimator```}
 
-\item{hyperParamSearch}{Which kind of hyperparameter search to use random sampling or exhaustive grid search. default: 'random'}
+\item{hyperParamSearch}{Which kind of hyperparameter search to use, random
+sampling or exhaustive grid search. 
default: 'random'}
 
-\item{randomSample}{How many random samples from hyperparameter space to use}
+\item{randomSample}{How many random samples from hyperparameter space
+to use}
 
 \item{randomSampleSeed}{Random seed to sample hyperparameter combinations}
 }
diff --git a/man/setTransformer.Rd b/man/setTransformer.Rd
index 7de56c4..9afcbd5 100644
--- a/man/setTransformer.Rd
+++ b/man/setTransformer.Rd
@@ -26,7 +26,8 @@ setTransformer(
 
 \item{dimToken}{dimension of each token (embedding size)}
 
-\item{dimOut}{dimension of output, usually 1 for binary problems}
+\item{dimOut}{dimension of output, usually 1 for binary
+problems}
 
 \item{numHeads}{number of attention heads}
 
@@ -38,15 +39,19 @@ setTransformer(
 
 \item{dimHidden}{dimension of the feedforward block}
 
-\item{dimHiddenRatio}{dimension of the feedforward block as a ratio of dimToken (embedding size)}
+\item{dimHiddenRatio}{dimension of the feedforward block as a ratio
+of dimToken (embedding size)}
 
 \item{estimatorSettings}{created with `setEstimator`}
 
-\item{hyperParamSearch}{what kind of hyperparameter search to do, default 'random'}
+\item{hyperParamSearch}{what kind of hyperparameter search to do,
+default 'random'}
 
-\item{randomSample}{How many samples to use in hyperparameter search if random}
+\item{randomSample}{How many samples to use in hyperparameter
+search if random}
 
-\item{randomSampleSeed}{Random seed to sample hyperparameter combinations}
+\item{randomSampleSeed}{Random seed to sample hyperparameter
+combinations}
 }
 \description{
 A transformer model
diff --git a/man/TrainingCache.Rd b/man/trainingCache.Rd
similarity index 79%
rename from man/TrainingCache.Rd
rename to man/trainingCache.Rd
index c82bb23..77d2efc 100644
--- a/man/TrainingCache.Rd
+++ b/man/trainingCache.Rd
@@ -1,7 +1,7 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/TrainingCache-class.R
-\name{TrainingCache}
-\alias{TrainingCache}
+\name{trainingCache}
+\alias{trainingCache}
 \title{TrainingCache}
 \value{
 Whether the provided and cached parameter grid is identical
@@ -18,15 +18,15 @@ Parameter caching for training persistence and continuity
 \section{Methods}{
 \subsection{Public methods}{
 \itemize{
-\item \href{#method-TrainingCache-new}{\code{TrainingCache$new()}}
-\item \href{#method-TrainingCache-isParamGridIdentical}{\code{TrainingCache$isParamGridIdentical()}}
-\item \href{#method-TrainingCache-saveGridSearchPredictions}{\code{TrainingCache$saveGridSearchPredictions()}}
-\item \href{#method-TrainingCache-saveModelParams}{\code{TrainingCache$saveModelParams()}}
-\item \href{#method-TrainingCache-getGridSearchPredictions}{\code{TrainingCache$getGridSearchPredictions()}}
-\item \href{#method-TrainingCache-isFull}{\code{TrainingCache$isFull()}}
-\item \href{#method-TrainingCache-getLastGridSearchIndex}{\code{TrainingCache$getLastGridSearchIndex()}}
-\item \href{#method-TrainingCache-dropCache}{\code{TrainingCache$dropCache()}}
-\item \href{#method-TrainingCache-clone}{\code{TrainingCache$clone()}}
+\item \href{#method-TrainingCache-new}{\code{trainingCache$new()}}
+\item \href{#method-TrainingCache-isParamGridIdentical}{\code{trainingCache$isParamGridIdentical()}}
+\item \href{#method-TrainingCache-saveGridSearchPredictions}{\code{trainingCache$saveGridSearchPredictions()}}
+\item \href{#method-TrainingCache-saveModelParams}{\code{trainingCache$saveModelParams()}}
+\item \href{#method-TrainingCache-getGridSearchPredictions}{\code{trainingCache$getGridSearchPredictions()}}
+\item 
\href{#method-TrainingCache-isFull}{\code{trainingCache$isFull()}}
+\item \href{#method-TrainingCache-getLastGridSearchIndex}{\code{trainingCache$getLastGridSearchIndex()}}
+\item \href{#method-TrainingCache-dropCache}{\code{trainingCache$dropCache()}}
+\item \href{#method-TrainingCache-clone}{\code{trainingCache$clone()}}
 }
 }
 \if{html}{\out{<hr>}}
@@ -35,7 +35,7 @@ Parameter caching for training persistence and continuity
 \subsection{Method \code{new()}}{
 Creates a new training cache
 \subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{TrainingCache$new(inDir)}\if{html}{\out{</div>}}
+\if{html}{\out{<div class="r">}}\preformatted{trainingCache$new(inDir)}\if{html}{\out{</div>
}}
 }
 
 \subsection{Arguments}{
@@ -53,7 +53,7 @@ Checks whether the parameter grid in the model settings is identical to
 the cached parameters.
 \subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{TrainingCache$isParamGridIdentical(inModelParams)}\if{html}{\out{</div>}}
+\if{html}{\out{<div class="r">}}\preformatted{trainingCache$isParamGridIdentical(inModelParams)}\if{html}{\out{</div>
}}
 }
 
 \subsection{Arguments}{
@@ -70,7 +70,7 @@ the cached parameters.
 \subsection{Method \code{saveGridSearchPredictions()}}{
 Saves the grid search results to the training cache
 \subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{TrainingCache$saveGridSearchPredictions(inGridSearchPredictions)}\if{html}{\out{</div>}}
+\if{html}{\out{<div class="r">}}\preformatted{trainingCache$saveGridSearchPredictions(inGridSearchPredictions)}\if{html}{\out{</div>
}}
 }
 
 \subsection{Arguments}{
@@ -87,7 +87,7 @@ Saves the grid search results to the training cache
 \subsection{Method \code{saveModelParams()}}{
 Saves the parameter grid to the training cache
 \subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{TrainingCache$saveModelParams(inModelParams)}\if{html}{\out{</div>}}
+\if{html}{\out{<div class="r">}}\preformatted{trainingCache$saveModelParams(inModelParams)}\if{html}{\out{</div>
}}
 }
 
 \subsection{Arguments}{
@@ -104,7 +104,7 @@ Saves the parameter grid to the training cache
 \subsection{Method \code{getGridSearchPredictions()}}{
 Gets the grid search results from the training cache
 \subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{TrainingCache$getGridSearchPredictions()}\if{html}{\out{</div>}}
+\if{html}{\out{<div class="r">}}\preformatted{trainingCache$getGridSearchPredictions()}\if{html}{\out{</div>
}}
 }
 
 }
 
@@ -114,7 +114,7 @@ Gets the grid search results from the training cache
 \subsection{Method \code{isFull()}}{
 Check if cache is full
 \subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{TrainingCache$isFull()}\if{html}{\out{</div>}}
+\if{html}{\out{<div class="r">}}\preformatted{trainingCache$isFull()}\if{html}{\out{</div>
}}
 }
 
 }
 
@@ -124,7 +124,7 @@ Check if cache is full
 \subsection{Method \code{getLastGridSearchIndex()}}{
 Gets the last index from the cached grid search
 \subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{TrainingCache$getLastGridSearchIndex()}\if{html}{\out{</div>}}
+\if{html}{\out{<div class="r">}}\preformatted{trainingCache$getLastGridSearchIndex()}\if{html}{\out{</div>
}}
 }
 
 }
 
@@ -134,7 +134,7 @@ Gets the last index from the cached grid search
 \subsection{Method \code{dropCache()}}{
 Remove the training cache from the analysis path
 \subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{TrainingCache$dropCache()}\if{html}{\out{</div>}}
+\if{html}{\out{<div class="r">}}\preformatted{trainingCache$dropCache()}\if{html}{\out{</div>
}}
 }
 
 }
 
@@ -144,7 +144,7 @@ Remove the training cache from the analysis path
 \subsection{Method \code{clone()}}{
 The objects of this class are cloneable with this method.
 \subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{TrainingCache$clone(deep = FALSE)}\if{html}{\out{</div>}}
+\if{html}{\out{<div class="r">}}\preformatted{trainingCache$clone(deep = FALSE)}\if{html}{\out{</div>
}} } \subsection{Arguments}{ diff --git a/tests/testthat/setup.R b/tests/testthat/setup.R index 7cd0fee..3e95d1a 100644 --- a/tests/testthat/setup.R +++ b/tests/testthat/setup.R @@ -7,13 +7,13 @@ connectionDetails <- Eunomia::getEunomiaConnectionDetails() Eunomia::createCohorts(connectionDetails) covSet <- FeatureExtraction::createCovariateSettings( - useDemographicsGender = T, - useDemographicsAge = T, - useDemographicsRace = T, - useDemographicsEthnicity = T, - useDemographicsAgeGroup = T, - useConditionGroupEraLongTerm = T, - useDrugEraStartLongTerm = T, + useDemographicsGender = TRUE, + useDemographicsAge = TRUE, + useDemographicsRace = TRUE, + useDemographicsEthnicity = TRUE, + useDemographicsAgeGroup = TRUE, + useConditionGroupEraLongTerm = TRUE, + useDrugEraStartLongTerm = TRUE, endDays = -1 ) @@ -30,10 +30,11 @@ databaseDetails <- PatientLevelPrediction::createDatabaseDetails( cdmDatabaseName = "eunomia" ) -restrictPlpDataSettings <- PatientLevelPrediction::createRestrictPlpDataSettings( - firstExposureOnly = T, - washoutPeriod = 365 -) +restrictPlpDataSettings <- + PatientLevelPrediction::createRestrictPlpDataSettings( + firstExposureOnly = TRUE, + washoutPeriod = 365 + ) plpData <- PatientLevelPrediction::getPlpData( databaseDetails = databaseDetails, @@ -41,9 +42,24 @@ plpData <- PatientLevelPrediction::getPlpData( covariateSettings = covSet ) +# add age squared so I have more than one numerical feature +plpData$covariateData$covariateRef <- plpData$covariateData$covariateRef %>% + dplyr::rows_append(data.frame( + covariateId = 2002, + covariateName = "Squared age", + analysisId = 2, + conceptId = 0), copy = TRUE) + +squaredAges <- plpData$covariateData$covariates %>% + dplyr::filter(covariateId == 1002) %>% + dplyr::mutate(covariateId = 2002, + covariateValue = .data$covariateValue**2) + +plpData$covariateData$covariates <- plpData$covariateData$covariates %>% + dplyr::rows_append(squaredAges) populationSet <- PatientLevelPrediction::createStudyPopulationSettings( - requireTimeAtRisk = F, + requireTimeAtRisk = FALSE, riskWindowStart = 1, riskWindowEnd = 365 ) @@ -66,33 +82,32 @@ mappedData <- PatientLevelPrediction::MapIds( ) path <- system.file("python", package = "DeepPatientLevelPrediction") -Dataset <- reticulate::import_from_path("Dataset", path = path) +datasetClass <- reticulate::import_from_path("Dataset", path = path) if (is.null(attributes(mappedData)$path)) { # sqlite object attributes(mappedData)$path <- attributes(mappedData)$dbname } -dataset <- Dataset$Data( +dataset <- datasetClass$Data( data = reticulate::r_to_py(normalizePath(attributes(mappedData)$path)), labels = reticulate::r_to_py(trainData$Train$labels$outcomeCount), ) -small_dataset <- torch$utils$data$Subset(dataset, (1:round(length(dataset)/3))) +smallDataset <- torch$utils$data$Subset(dataset, + (1:round(length(dataset) / 3))) modelSettings <- setResNet( numLayers = 1, sizeHidden = 16, hiddenFactor = 1, residualDropout = c(0, 0.2), hiddenDropout = 0, sizeEmbedding = 16, hyperParamSearch = "random", randomSample = 2, - setEstimator(epochs=1, + setEstimator(epochs = 1, learningRate = 3e-4) ) -fitEstimatorPath <- file.path(testLoc, 'fitEstimator') +fitEstimatorPath <- file.path(testLoc, "fitEstimator") if (!dir.exists(fitEstimatorPath)) { dir.create(fitEstimatorPath) } -fitEstimatorResults <- fitEstimator(trainData$Train, - modelSettings = modelSettings, - analysisId = 1, - analysisPath = fitEstimatorPath) - - +fitEstimatorResults <- fitEstimator(trainData$Train, + modelSettings = modelSettings, + 
analysisId = 1,
+                                   analysisPath = fitEstimatorPath)
\ No newline at end of file
diff --git a/tests/testthat/test-Dataset.R b/tests/testthat/test-Dataset.R
index 940f9d1..fd3ce92 100644
--- a/tests/testthat/test-Dataset.R
+++ b/tests/testthat/test-Dataset.R
@@ -1,7 +1,9 @@
 test_that("number of num and cat features sum correctly", {
   testthat::expect_equal(
-    length(dataset$get_numerical_features()) + length(dataset$get_cat_features()),
-    dplyr::n_distinct(mappedData$covariates %>% dplyr::collect() %>%
+    length(dataset$get_numerical_features()) +
+      length(dataset$get_cat_features()),
+    dplyr::n_distinct(mappedData$covariates %>%
+                        dplyr::collect() %>%
                         dplyr::pull(covariateId))
   )
 })
@@ -12,40 +14,112 @@ test_that("length of dataset correct", {
   expect_equal(length(dataset), dataset$num$shape[0])
   expect_equal(
     length(dataset),
-    dplyr::n_distinct(mappedData$covariates %>%
-                        dplyr::collect() %>% dplyr::pull(rowId))
+    dplyr::n_distinct(mappedData$covariates %>%
+                        dplyr::collect() %>%
+                        dplyr::pull(.data$rowId))
   )
 })
 
 test_that(".getbatch works", {
-  batch_size <- 16
+  batchSize <- 16
   # get one sample
   out <- dataset[10]
-
-  # output should be a list of two items, the batch in pos 1 and targets in pos 2,
+
+  # output should be a list of two items,
+  # the batch in pos 1 and targets in pos 2,
   # the batch is what goes to the model
   expect_equal(length(out), 2)
-
+
   # targets should be binary
   expect_true(out[[2]]$item() %in% c(0, 1))
-
+
   # shape of batch is correct
   expect_equal(length(out[[1]]), 2)
   expect_equal(out[[1]]$cat$shape[0], 1)
   expect_equal(out[[1]]$num$shape[0], 1)
-
+
   # shape of target
   expect_equal(out[[2]]$shape$numel(), 1)
-
+
   # get a whole batch
-  out <- dataset[10:(10 + batch_size)]
-
+  out <- dataset[10:(10 + batchSize)]
+
   expect_equal(length(out), 2)
   expect_true(all(out[[2]]$numpy() %in% c(0, 1)))
-
+
   expect_equal(length(out[[1]]), 2)
   expect_equal(out[[1]]$cat$shape[0], 16)
   expect_equal(out[[1]]$num$shape[0], 16)
-
+
   expect_equal(out[[2]]$shape[0], 16)
 })
+
+test_that("Column order is preserved when features are missing", {
+  # important for transfer learning and external validation
+
+  reducedCovData <- Andromeda::copyAndromeda(trainData$Train$covariateData)
+
+  # remove one numerical and one categorical
+  numFeature <- 1002 # continuous age
+  catFeature <- 4285898210 # a random common cat feature
+  reducedCovData$covariates <- trainData$Train$covariateData$covariates %>%
+    dplyr::filter(!(covariateId %in% c(numFeature, catFeature)))
+
+  mappedReducedData <- PatientLevelPrediction::MapIds(
+    reducedCovData,
+    mapping = mappedData$mapping
+  )
+
+  catColumn <- mappedData$mapping %>%
+    dplyr::filter(covariateId == catFeature) %>%
+    dplyr::pull("columnId")
+  numColumn <- mappedData$mapping %>%
+    dplyr::filter(covariateId == numFeature) %>%
+    dplyr::pull("columnId")
+
+  reducedDataset <- datasetClass$Data(
+    data =
+      reticulate::r_to_py(normalizePath(attributes(mappedReducedData)$dbname)),
+    labels = reticulate::r_to_py(trainData$Train$labels$outcomeCount),
+    numerical_features = dataset$numerical_features$to_list()
+  )
+
+  # should have same number of columns
+  expect_equal(dataset$num$shape[[1]], reducedDataset$num$shape[[1]])
+
+  # all zeros in column with removed feature, -1 because r to py
+  expect_true(reducedDataset$num[, numColumn - 1]$sum()$item() == 0)
+
+  # all other columns are same
+  indexReduced <- 
!torch$isin(torch$arange(reducedDataset$num$shape[[1]]), + numColumn - 1) + index <- !torch$isin(torch$arange(dataset$num$shape[[1]]), + numColumn - 1) + + expect_equal(reducedDataset$num[, indexReduced]$mean()$item(), + dataset$num[, index]$mean()$item()) + + # cat data should have same counts of all columnIds + # expect the one that was removed + # not same counts for removed feature + expect_false(isTRUE(all.equal((reducedDataset$cat == catColumn)$sum()$item(), + (dataset$cat == catColumn)$sum()$item()))) + + # get counts + counts <- as.array(torch$unique(dataset$cat, + return_counts = TRUE)[[2]]$numpy()) + counts <- counts[-(catColumn + 1)] # +1 because py_to_r + counts <- counts[-1] + + reducedCounts <- as.array(torch$unique(reducedDataset$cat, + return_counts = TRUE)[[2]]$numpy()) + reducedCounts <- reducedCounts[-(catColumn + 1)] # +1 because py_to_r + reducedCounts <- reducedCounts[-1] + + expect_false(isTRUE(all.equal(counts, reducedCounts))) + expect_equal(dataset$get_cat_features()$max(), + reducedDataset$get_cat_features()$max()) + +}) diff --git a/tests/testthat/test-Estimator.R b/tests/testthat/test-Estimator.R index b4dd0a4..ba82835 100644 --- a/tests/testthat/test-Estimator.R +++ b/tests/testthat/test-Estimator.R @@ -1,5 +1,5 @@ -catFeatures <- small_dataset$dataset$get_cat_features()$shape[[1]] -numFeatures <- small_dataset$dataset$get_numerical_features()$shape[[1]] +catFeatures <- smallDataset$dataset$get_cat_features()$max() +numFeatures <- smallDataset$dataset$get_numerical_features()$max() modelParameters <- list( cat_features = catFeatures, @@ -10,18 +10,20 @@ modelParameters <- list( hidden_factor = 2 ) -estimatorSettings <- setEstimator(learningRate = 3e-4, - weightDecay = 0.0, - batchSize = 128, - epochs = 5, - device = 'cpu', - seed=42, - optimizer=torch$optim$AdamW, - criterion=torch$nn$BCEWithLogitsLoss, - metric='loss', - scheduler= list(fun=torch$optim$lr_scheduler$ReduceLROnPlateau, - params=list(patience=1)), - earlyStopping=NULL) +estimatorSettings <- + setEstimator(learningRate = 3e-4, + weightDecay = 0.0, + batchSize = 128, + epochs = 5, + device = "cpu", + seed = 42, + optimizer = torch$optim$AdamW, + criterion = torch$nn$BCEWithLogitsLoss, + metric = "loss", + scheduler = + list(fun = torch$optim$lr_scheduler$ReduceLROnPlateau, + params = list(patience = 1)), + earlyStopping = NULL) modelType <- "ResNet" estimator <- createEstimator(modelType = modelType, @@ -32,11 +34,12 @@ test_that("Estimator initialization works", { # count parameters in both instances path <- system.file("python", package = "DeepPatientLevelPrediction") - ResNet <- reticulate::import_from_path(modelType, path=path)[[modelType]] + resNet <- reticulate::import_from_path(modelType, path = path)[[modelType]] testthat::expect_equal( - sum(reticulate::iterate(estimator$model$parameters(), function(x) x$numel())), - sum(reticulate::iterate(do.call(ResNet, modelParameters)$parameters(), + sum(reticulate::iterate(estimator$model$parameters(), + function(x) x$numel())), + sum(reticulate::iterate(do.call(resNet, modelParameters)$parameters(), function(x) x$numel())) ) @@ -47,8 +50,8 @@ test_that("Estimator initialization works", { }) test_that("Estimator detects wrong inputs", { - - testthat::expect_error(setEstimator(learningRate='notAuto')) + + testthat::expect_error(setEstimator(learningRate = "notAuto")) testthat::expect_error(setEstimator(weightDecay = -1)) testthat::expect_error(setEstimator(weightDecay = "text")) testthat::expect_error(setEstimator(batchSize = 0)) @@ -61,7 +64,7 @@ 
test_that("Estimator detects wrong inputs", { }) sink(nullfile()) -estimator$fit(small_dataset, small_dataset) +estimator$fit(smallDataset, smallDataset) sink() test_that("estimator fitting works", { @@ -70,72 +73,74 @@ test_that("estimator fitting works", { expect_true(!is.null(estimator$best_score$loss)) expect_true(!is.null(estimator$best_score$auc)) - old_weights <- estimator$model$head$weight$mean()$item() + oldWeights <- estimator$model$head$weight$mean()$item() sink(nullfile()) - estimator$fit_whole_training_set(small_dataset, estimator$learn_rate_schedule) + estimator$fit_whole_training_set(smallDataset, estimator$learn_rate_schedule) sink() - expect_equal(estimator$optimizer$param_groups[[1]]$lr, tail(estimator$learn_rate_schedule, 1)[[1]]) + expect_equal(estimator$optimizer$param_groups[[1]]$lr, + tail(estimator$learn_rate_schedule, 1)[[1]]) - new_weights <- estimator$model$head$weight$mean()$item() + newWeights <- estimator$model$head$weight$mean()$item() # model should be updated when refitting - expect_true(old_weights != new_weights) + expect_true(oldWeights != newWeights) estimator$save(testLoc, "estimator.pt") expect_true(file.exists(file.path(testLoc, "estimator.pt"))) - + sink(nullfile()) preds <- estimator$predict_proba(dataset) sink() - + expect_lt(max(preds), 1) expect_gt(min(preds), 0) - + sink(nullfile()) - classes <- estimator$predict(small_dataset, threshold = 0.5) + classes <- estimator$predict(smallDataset, threshold = 0.5) sink() expect_equal(all(unique(classes) %in% c(0, 1)), TRUE) - + sink(nullfile()) - classes <- estimator$predict(small_dataset$dataset) + classes <- estimator$predict(smallDataset$dataset) sink() expect_equal(all(unique(classes) %in% c(0, 1)), TRUE) - + estimatorSettings <- setEstimator(learningRate = 3e-4, weightDecay = 0.0, batchSize = 128, epochs = 5, - device = 'cpu', - metric= "loss") - - estimator <- createEstimator(modelType=modelType, - modelParameters=modelParameters, - estimatorSettings=estimatorSettings) - + device = "cpu", + metric = "loss") + + estimator <- createEstimator(modelType = modelType, + modelParameters = modelParameters, + estimatorSettings = estimatorSettings) + sink(nullfile()) - estimator$fit(small_dataset, small_dataset) + estimator$fit(smallDataset, smallDataset) sink() - + expect_equal(estimator$metric$mode, "min") expect_equal(estimator$metric$name, "loss") - + sink(nullfile()) - estimator$fit_whole_training_set(small_dataset, estimator$learn_rate_schedule) + estimator$fit_whole_training_set(smallDataset, estimator$learn_rate_schedule) sink() - + expect_equal(estimator$learn_rate_schedule[[estimator$best_epoch]], estimator$optimizer$param_groups[[1]]$lr) - + }) test_that("early stopping works", { - EarlyStopping <- reticulate::import_from_path("Estimator", path=path)$EarlyStopping - earlyStop <- EarlyStopping(patience = 3, delta = 0, verbose = FALSE) - - + earlyStopping <- + reticulate::import_from_path("Estimator", path = path)$EarlyStopping + earlyStop <- earlyStopping(patience = 3, delta = 0, verbose = FALSE) + + testthat::expect_true(is.null(earlyStop$best_score)) testthat::expect_false(earlyStop$early_stop) earlyStop(0.5) @@ -153,15 +158,19 @@ test_that("Estimator fit function works", { expect_equal(attr(fitEstimatorResults, "modelType"), "binary") expect_equal(attr(fitEstimatorResults, "saveType"), "file") fakeTrainData <- trainData - fakeTrainData$train$covariateData <- list(fakeCovData <- c("Fake")) - expect_error(fitEstimator(fakeTrainData$train, modelSettings, analysisId = 1, analysisPath = testLoc)) + 
fakeTrainData$train$covariateData <- list(fakeCovData = c("Fake")) + expect_error(fitEstimator(fakeTrainData$train, + modelSettings, analysisId = 1, + analysisPath = testLoc)) }) test_that("predictDeepEstimator works", { # input is an estimator and a dataset sink(nullfile()) - predictions <- predictDeepEstimator(estimator, small_dataset, cohort = trainData$Train$labels) + predictions <- predictDeepEstimator(estimator, + smallDataset, + cohort = trainData$Train$labels) sink() expect_lt(max(predictions$value), 1) @@ -181,22 +190,23 @@ test_that("predictDeepEstimator works", { }) test_that("batchToDevice works", { - batch_to_device <- reticulate::import_from_path("Estimator", path=path)$batch_to_device + batchToDevice <- reticulate::import_from_path("Estimator", + path = path)$batch_to_device # since we can't test moving to gpu there is a fake device called meta # which we can use to test of the device is updated estimator$device <- "meta" b <- 1:10 - batch <- batch_to_device(dataset[b], device=estimator$device) - + batch <- batchToDevice(dataset[b], device = estimator$device) + devices <- lapply( lapply(unlist(batch, recursive = TRUE), function(x) x$device), function(x) x == torch$device(type = "meta") ) # test that all are meta expect_true(all(devices == TRUE)) - - numDevice <- batch_to_device(dataset[b][[1]]$num, device=estimator$device) - expect_true(numDevice$device==torch$device(type="meta")) + + numDevice <- batchToDevice(dataset[b][[1]]$num, device = estimator$device) + expect_true(numDevice$device == torch$device(type = "meta")) }) test_that("Estimator without earlyStopping works", { @@ -205,19 +215,19 @@ test_that("Estimator without earlyStopping works", { weightDecay = 0.0, batchSize = 128, epochs = 1, - device = 'cpu', + device = "cpu", earlyStopping = NULL) - + estimator2 <- createEstimator(modelType = modelType, modelParameters = modelParameters, - estimatorSettings=estimatorSettings) + estimatorSettings = estimatorSettings) sink(nullfile()) - estimator2$fit(small_dataset, small_dataset) + estimator2$fit(smallDataset, smallDataset) sink() - + expect_null(estimator2$early_stopper) expect_true(!is.null(estimator2$best_epoch)) - + }) test_that("Early stopper can use loss and stops early", { @@ -225,78 +235,89 @@ test_that("Early stopper can use loss and stops early", { weightDecay = 0.0, batchSize = 128, epochs = 10, - device = 'cpu', - earlyStopping =list(useEarlyStopping=TRUE, - params = list(mode=c('min'), - patience=1)), - metric = 'loss', - seed=42) - + device = "cpu", + earlyStopping = + list(useEarlyStopping = TRUE, + params = list(mode = c("min"), + patience = 1)), + metric = "loss", + seed = 42) + estimator <- createEstimator(modelType = modelType, modelParameters = modelParameters, estimatorSettings = estimatorSettings) sink(nullfile()) - estimator$fit(small_dataset, small_dataset) + estimator$fit(smallDataset, smallDataset) sink() - + expect_true(estimator$best_epoch < estimator$epochs) - + }) -test_that('Custom metric in estimator works', { - - metric_fun <- function(predictions, labels) { - pr <- PRROC::pr.curve(scores.class0 = torch$sigmoid(predictions)$numpy(), +test_that("Custom metric in estimator works", { + + metricFun <- function(predictions, labels) { + pr <- PRROC::pr.curve(scores.class0 = torch$sigmoid(predictions)$numpy(), weights.class0 = labels$numpy()) auprc <- pr$auc.integral reticulate::r_to_py(auprc) } - + estimatorSettings <- setEstimator(learningRate = 3e-4, weightDecay = 0.0, batchSize = 128, device = "cpu", epochs = 1, - metric=list(fun=metric_fun, 
- name="auprc", - mode="max")) + metric = list(fun = metricFun, + name = "auprc", + mode = "max")) estimator <- createEstimator(modelType = modelType, modelParameters = modelParameters, estimatorSettings = estimatorSettings) expect_true(is.function(estimator$metric$fun)) expect_true(is.character(estimator$metric$name)) expect_true(estimator$metric$mode %in% c("min", "max")) - + sink(nullfile()) - estimator$fit(small_dataset, small_dataset) + estimator$fit(smallDataset, smallDataset) sink() - - expect_true(estimator$best_score[["auprc"]]>0) + + expect_true(estimator$best_score[["auprc"]] > 0) }) -test_that("setEstimator with paramsToTune is correctly added to hyperparameters", { - estimatorSettings <- setEstimator(learningRate=c(3e-4,1e-3), - batchSize=128, - epochs=1, - device="cpu", - metric=c("auc", "auprc"), - earlyStopping = list(useEarlyStopping=TRUE, - params=list(patience=c(4,10)))) +test_that("setEstimator with hyperparameters", { + estimatorSettings <- + setEstimator(learningRate = c(3e-4, 1e-3), + batchSize = 128, + epochs = 1, + device = "cpu", + metric = c("auc", "auprc"), + earlyStopping = list(useEarlyStopping = TRUE, + params = list(patience = c(4, 10)))) modelSettings <- setResNet(numLayers = 1, sizeHidden = 64, hiddenFactor = 1, residualDropout = 0.2, - hiddenDropout = 0.2,sizeEmbedding = 128, + hiddenDropout = 0.2, sizeEmbedding = 128, estimatorSettings = estimatorSettings, hyperParamSearch = "grid") - + expect_true(length(modelSettings$param) == 8) - expect_true(length(unique(lapply(modelSettings$param, function(x) x$estimator.metric)))==2) - expect_true(length(unique(lapply(modelSettings$param, function(x) x$estimator.learningRate)))==2) - expect_true(length(unique(lapply(modelSettings$param, function(x) x$estimator.earlyStopping.patience)))==2) - fitParams <- names(modelSettings$param[[1]])[grepl("^estimator", names(modelSettings$param[[1]]))] - - estimatorSettings2 <- fillEstimatorSettings(estimatorSettings, fitParams, paramSearch=modelSettings$param[[8]]) - + expect_true(length(unique(lapply(modelSettings$param, + function(x) x$estimator.metric))) == 2) + expect_true(length(unique(lapply(modelSettings$param, + function(x) x$estimator.learningRate))) == 2) + expect_true(length(unique(lapply(modelSettings$param, function(x) { + x$estimator.earlyStopping.patience + }))) == 2) + + fitParams <- + names(modelSettings$param[[1]])[grepl("^estimator", + names(modelSettings$param[[1]]))] + + estimatorSettings2 <- + fillEstimatorSettings(estimatorSettings, fitParams, + paramSearch = modelSettings$param[[8]]) + expect_equal(estimatorSettings2$learningRate, 1e-3) expect_equal(as.character(estimatorSettings2$metric), "auprc") expect_equal(estimatorSettings2$earlyStopping$params$patience, 10) @@ -304,56 +325,69 @@ test_that("setEstimator with paramsToTune is correctly added to hyperparameters" test_that("device as a function argument works", { getDevice <- function() { - dev <- Sys.getenv("testDeepPLPDevice") - if (dev == ""){ - dev = "cpu" - } else{ + dev <- Sys.getenv("testDeepPLPDevice") + if (dev == "") { + dev <- "cpu" + } else { dev } } - estimatorSettings <- setEstimator(device=getDevice, + estimatorSettings <- setEstimator(device = getDevice, learningRate = 3e-4) - - model <- setDefaultResNet(estimatorSettings = estimatorSettings) + + model <- setDefaultResNet(estimatorSettings = estimatorSettings) model$param[[1]]$catFeatures <- 10 estimator <- createEstimator(modelType = modelType, modelParameters = model$param[[1]], estimatorSettings = estimatorSettings) - + 
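+  # the device function is evaluated when the estimator is constructed,
+  # so the environment variable set below changes the resolved device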
expect_equal(estimator$device, "cpu") - + Sys.setenv("testDeepPLPDevice" = "meta") - - estimatorSettings <- setEstimator(device=getDevice, + + estimatorSettings <- setEstimator(device = getDevice, learningRate = 3e-4) - - model <- setDefaultResNet(estimatorSettings = estimatorSettings) + + model <- setDefaultResNet(estimatorSettings = estimatorSettings) model$param[[1]]$catFeatures <- 10 - + estimator <- createEstimator(modelType = modelType, modelParameters = model$param[[1]], estimatorSettings = estimatorSettings) - + expect_equal(estimator$device, "meta") - + Sys.unsetenv("testDeepPLPDevice") - - }) -test_that("estimatorSettings can be saved and loaded with correct python objects", { +}) + +test_that("estimatorSettings can be saved and loaded with python objects", { settings <- setEstimator() - saveRDS(settings,file=file.path(testLoc, 'settings.RDS')) - - loadedSettings <- readRDS(file = file.path(testLoc, 'settings.RDS')) - + saveRDS(settings, file = file.path(testLoc, "settings.RDS")) + + loadedSettings <- readRDS(file = file.path(testLoc, "settings.RDS")) optimizer <- loadedSettings$optimizer() scheduler <- loadedSettings$scheduler() criterion <- loadedSettings$criterion() - + testthat::expect_false(reticulate::py_is_null_xptr(optimizer)) testthat::expect_false(reticulate::py_is_null_xptr(scheduler$fun)) testthat::expect_false(reticulate::py_is_null_xptr(criterion)) }) + +test_that("evaluation works on predictDeepEstimator output", { + + prediction <- predictDeepEstimator(plpModel = fitEstimatorResults, + data = trainData$Test, + cohort = trainData$Test$labels) + prediction$evaluationType <- 'Validation' + + evaluation <- evaluatePlp(prediction, "evaluationType") + + expect_length(evaluation, 5) + expect_s3_class(evaluation, "plpEvaluation") + + }) \ No newline at end of file diff --git a/tests/testthat/test-LRFinder.R b/tests/testthat/test-LRFinder.R index 0a5d700..a42416b 100644 --- a/tests/testthat/test-LRFinder.R +++ b/tests/testthat/test-LRFinder.R @@ -1,77 +1,84 @@ -ResNet <- reticulate::import_from_path("ResNet", path)$ResNet -lrFinderClass <- reticulate::import_from_path("LrFinder", path=path)$LrFinder +resNet <- reticulate::import_from_path("ResNet", path)$ResNet test_that("LR scheduler that changes per batch works", { - - model <- ResNet(cat_features = 10L, num_features = 1L, + + model <- resNet(cat_features = 10L, num_features = 1L, size_embedding = 32L, size_hidden = 64L, num_layers = 1L, hidden_factor = 1L) - optimizer <- torch$optim$AdamW(model$parameters(), lr=1e-7) - - ExponentialSchedulerPerBatch <- reticulate::import_from_path("LrFinder", - path=path)$ExponentialSchedulerPerBatch - scheduler <- ExponentialSchedulerPerBatch(optimizer, - end_lr = 1e-2, - num_iter = 5) + optimizer <- torch$optim$AdamW(model$parameters(), lr = 1e-7) + + + exponentialSchedulerPerBatch <- + reticulate::import_from_path("LrFinder", + path = path)$ExponentialSchedulerPerBatch + scheduler <- exponentialSchedulerPerBatch(optimizer, + end_lr = 1e-2, + num_iter = 5) expect_equal(scheduler$last_epoch, 0) expect_equal(scheduler$optimizer$param_groups[[1]]$lr, 1e-7) - + for (i in 1:5) { optimizer$step() scheduler$step() } - + expect_equal(scheduler$last_epoch, 5) - expect_equal(scheduler$optimizer$param_groups[[1]]$lr, (1e-7 * (0.01 / 1e-7) ^ (5 / 4))) - + expect_equal(scheduler$optimizer$param_groups[[1]]$lr, + (1e-7 * (0.01 / 1e-7) ^ (5 / 4))) + }) test_that("LR finder works", { - lrFinder <- createLRFinder(modelType="ResNet", - modelParameters = 
list(cat_features=length(dataset$get_cat_features()),
-                                                    num_features=length(dataset$get_numerical_features()),
-                                                    size_embedding=32L,
-                                                    size_hidden=64L,
-                                                    num_layers=1L,
-                                                    hidden_factor=1L),
-                             estimatorSettings = setEstimator(batchSize = 32L,
-                                                              seed=42),
-                             lrSettings = list(minLr=3e-4,
-                                               maxLr=10.0,
-                                               numLr=20L,
-                                               divergenceThreshold=1.1))
-  
+  lrFinder <-
+    createLRFinder(modelType = "ResNet",
+                   modelParameters =
+                     list(cat_features =
+                            dataset$get_cat_features()$max(),
+                          num_features =
+                            dataset$get_numerical_features()$max(),
+                          size_embedding = 32L,
+                          size_hidden = 64L,
+                          num_layers = 1L,
+                          hidden_factor = 1L),
+                   estimatorSettings = setEstimator(batchSize = 32L,
+                                                    seed = 42),
+                   lrSettings = list(minLr = 3e-4,
+                                     maxLr = 10.0,
+                                     numLr = 20L,
+                                     divergenceThreshold = 1.1))
+
   lr <- lrFinder$get_lr(dataset)
-  
-  expect_true(lr<=10.0)
-  expect_true(lr>=3e-4)
+
+  expect_true(lr <= 10.0)
+  expect_true(lr >= 3e-4)
 })
 
 test_that("LR finder works with device specified by a function", {
-  
-  deviceFun <- function(){
-    dev = "cpu"
+
+  deviceFun <- function() {
+    dev <- "cpu"
+    dev
   }
-  lrFinder <- createLRFinder(model = "ResNet",
-                             modelParameters = list(cat_features=length(dataset$get_cat_features()),
-                                                    num_features=length(dataset$get_numerical_features()),
-                                                    size_embedding=8L,
-                                                    size_hidden=16L,
-                                                    num_layers=1L,
-                                                    hidden_factor=1L),
-                             estimatorSettings = setEstimator(batchSize=32L,
-                                                              seed = 42,
-                                                              device = deviceFun),
-                             lrSettings = list(minLr=3e-4,
-                                               maxLr=10.0,
-                                               numLr=20L,
-                                               divergenceThreshold=1.1)
-  )
+  lrFinder <- createLRFinder(
+    modelType = "ResNet",
+    modelParameters =
+      list(cat_features = dataset$get_cat_features()$max(),
+           num_features = dataset$get_numerical_features()$max(),
+           size_embedding = 8L,
+           size_hidden = 16L,
+           num_layers = 1L,
+           hidden_factor = 1L),
+    estimatorSettings = setEstimator(batchSize = 32L,
+                                     seed = 42,
+                                     device = deviceFun),
+    lrSettings = list(minLr = 3e-4,
+                      maxLr = 10.0,
+                      numLr = 20L,
+                      divergenceThreshold = 1.1)
+  )
   lr <- lrFinder$get_lr(dataset)
-  
-  expect_true(lr<=10.0)
-  expect_true(lr>=3e-4)
-  
-  
-})
\ No newline at end of file
+
+  expect_true(lr <= 10.0)
+  expect_true(lr >= 3e-4)
+})
diff --git a/tests/testthat/test-MLP.R b/tests/testthat/test-MLP.R
index 5cbd7a5..a470664 100644
--- a/tests/testthat/test-MLP.R
+++ b/tests/testthat/test-MLP.R
@@ -5,11 +5,11 @@ modelSettings <- setMultiLayerPerceptron(
   dropout = c(0.1),
   sizeEmbedding = 32,
   estimatorSettings = setEstimator(
-    learningRate=c(3e-4),
+    learningRate = c(3e-4),
     weightDecay = c(1e-6),
-    seed=42,
-    batchSize=128,
-    epochs=1
+    seed = 42,
+    batchSize = 128,
+    epochs = 1
   ),
   hyperParamSearch = "random",
   randomSample = 1
@@ -21,12 +21,13 @@ test_that("setMultiLayerPerceptron works", {
 
   testthat::expect_equal(modelSettings$fitFunction, "fitEstimator")
   testthat::expect_true(length(modelSettings$param) > 0)
-  
-  expect_error(setMultiLayerPerceptron(numLayers=1,
+
+  expect_error(setMultiLayerPerceptron(numLayers = 1,
                                        sizeHidden = 128,
-                                       dropout= 0.2,
+                                       dropout = 0.2,
                                        sizeEmbedding = 128,
-                                       estimatorSettings = setEstimator(learningRate=3e-4),
+                                       estimatorSettings =
+                                         setEstimator(learningRate = 3e-4),
                                        randomSample = 2))
 })
 
@@ -41,16 +42,17 @@ results <- tryCatch(
       analysisName = "Testing Deep Learning",
       populationSettings = populationSet,
       splitSettings = PatientLevelPrediction::createDefaultSplitSetting(),
-      sampleSettings = PatientLevelPrediction::createSampleSettings(), # none
-      featureEngineeringSettings = PatientLevelPrediction::createFeatureEngineeringSettings(), # none
+      sampleSettings = 
PatientLevelPrediction::createSampleSettings(), + featureEngineeringSettings = + PatientLevelPrediction::createFeatureEngineeringSettings(), preprocessSettings = PatientLevelPrediction::createPreprocessSettings(), executeSettings = PatientLevelPrediction::createExecuteSettings( - runSplitData = T, - runSampleData = F, - runfeatureEngineering = F, - runPreprocessData = T, - runModelDevelopment = T, - runCovariateSummary = F + runSplitData = TRUE, + runSampleData = FALSE, + runfeatureEngineering = FALSE, + runPreprocessData = TRUE, + runModelDevelopment = TRUE, + runCovariateSummary = FALSE ), saveDirectory = file.path(testLoc, "MLP") ) @@ -73,7 +75,9 @@ test_that("MLP with runPlp working checks", { # check prediction same size as pop testthat::expect_equal(nrow(results$prediction %>% - dplyr::filter(evaluationType %in% c("Train", "Test"))), nrow(population)) + dplyr::filter(evaluationType %in% c("Train", + "Test"))), + nrow(population)) # check prediction between 0 and 1 testthat::expect_gte(min(results$prediction$value), 0) @@ -82,15 +86,15 @@ test_that("MLP with runPlp working checks", { test_that("MLP nn-module works ", { - MLP <- reticulate::import_from_path("MLP", path=path)$MLP - model <- MLP( - cat_features = 5, - num_features = 1, + mlp <- reticulate::import_from_path("MLP", path = path)$MLP + model <- mlp( + cat_features = 5, + num_features = 1, size_embedding = 5, - size_hidden = 16, + size_hidden = 16, num_layers = 1, activation = torch$nn$ReLU, - normalization = torch$nn$BatchNorm1d, + normalization = torch$nn$BatchNorm1d, dropout = 0.3 ) @@ -100,8 +104,8 @@ test_that("MLP nn-module works ", { expect_equal(pars, 489) input <- list() - input$cat <- torch$randint(0L, 5L, c(10L, 5L), dtype=torch$long) - input$num <- torch$randn(10L, 1L, dtype=torch$float32) + input$cat <- torch$randint(0L, 5L, c(10L, 5L), dtype = torch$long) + input$num <- torch$randn(10L, 1L, dtype = torch$float32) output <- model(input) @@ -110,14 +114,14 @@ test_that("MLP nn-module works ", { expect_equal(output$shape[0], 10L) input$num <- NULL - model <- MLP( - cat_features = 5L, - num_features = 0, + model <- mlp( + cat_features = 5L, + num_features = 0, size_embedding = 5L, - size_hidden = 16L, + size_hidden = 16L, num_layers = 1L, activation = torch$nn$ReLU, - normalization = torch$nn$BatchNorm1d, + normalization = torch$nn$BatchNorm1d, dropout = 0.3, d_out = 1L ) @@ -129,49 +133,48 @@ test_that("MLP nn-module works ", { test_that("Errors are produced by settings function", { randomSample <- 2 - + expect_error(setMultiLayerPerceptron( - numLayers = 1, - sizeHidden = 128, - dropout = 0.0, - sizeEmbedding = 128, - hyperParamSearch = 'random', - estimatorSettings = setEstimator( - learningRate = 'auto', - weightDecay = c(1e-3), - batchSize = 1024, - epochs = 30, - device="cpu"))) - + numLayers = 1, + sizeHidden = 128, + dropout = 0.0, + sizeEmbedding = 128, + hyperParamSearch = "random", + estimatorSettings = + setEstimator( + learningRate = "auto", + weightDecay = c(1e-3), + batchSize = 1024, + epochs = 30, + device = "cpu"))) }) test_that("Can upload results to database", { - cohortDefinitions = data.frame( - cohortName = c('blank1'), - cohortId = c(1), - json = c('json') + cohortDefinitions <- data.frame( + cohortName = c("blank1"), + cohortId = c(1), + json = c("json") ) - + sink(nullfile()) - sqliteFile <- insertResultsToSqlite(resultLocation = file.path(testLoc, "MLP"), - cohortDefinitions = cohortDefinitions) + sqliteFile <- + insertResultsToSqlite(resultLocation = file.path(testLoc, "MLP"), + 
cohortDefinitions = cohortDefinitions) sink() - + testthat::expect_true(file.exists(sqliteFile)) - - cdmDatabaseSchema <- 'main' - ohdsiDatabaseSchema <- 'main' + + cdmDatabaseSchema <- "main" + ohdsiDatabaseSchema <- "main" connectionDetails <- DatabaseConnector::createConnectionDetails( - dbms = 'sqlite', + dbms = "sqlite", server = sqliteFile ) conn <- DatabaseConnector::connect(connectionDetails = connectionDetails) - targetDialect <- 'sqlite' - + targetDialect <- "sqlite" + # check the results table is populated - sql <- 'select count(*) as N from main.performances;' + sql <- "select count(*) as N from main.performances;" res <- DatabaseConnector::querySql(conn, sql) - testthat::expect_true(res$N[1]>0) - - -}) \ No newline at end of file + testthat::expect_true(res$N[1] > 0) +}) diff --git a/tests/testthat/test-ResNet.R b/tests/testthat/test-ResNet.R index 29dfb8e..76e3d9f 100644 --- a/tests/testthat/test-ResNet.R +++ b/tests/testthat/test-ResNet.R @@ -6,33 +6,34 @@ resSet <- setResNet( residualDropout = 0.1, hiddenDropout = 0.1, sizeEmbedding = 32, - estimatorSettings = setEstimator(learningRate="auto", + estimatorSettings = setEstimator(learningRate = "auto", weightDecay = c(1e-6), - seed=42, + seed = 42, batchSize = 128, - epochs=1), + epochs = 1), hyperParamSearch = "random", randomSample = 1, ) test_that("setResNet works", { testthat::expect_s3_class(object = resSet, class = "modelSettings") - + testthat::expect_equal(resSet$fitFunction, "fitEstimator") - + testthat::expect_true(length(resSet$param) > 0) - + expect_error(setResNet(numLayers = 2, sizeHidden = 32, hiddenFactor = 2, residualDropout = 0.1, hiddenDropout = 0.1, sizeEmbedding = 32, - estimatorSettings = setEstimator(learningRate=c(3e-4), - weightDecay = c(1e-6), - seed=42, - batchSize = 128, - epochs=1), + estimatorSettings = + setEstimator(learningRate = c(3e-4), + weightDecay = c(1e-6), + seed = 42, + batchSize = 128, + epochs = 1), hyperParamSearch = "random", randomSample = 2)) }) @@ -48,16 +49,17 @@ res2 <- tryCatch( analysisName = "Testing Deep Learning", populationSettings = populationSet, splitSettings = PatientLevelPrediction::createDefaultSplitSetting(), - sampleSettings = PatientLevelPrediction::createSampleSettings(), # none - featureEngineeringSettings = PatientLevelPrediction::createFeatureEngineeringSettings(), # none + sampleSettings = PatientLevelPrediction::createSampleSettings(), + featureEngineeringSettings = + PatientLevelPrediction::createFeatureEngineeringSettings(), preprocessSettings = PatientLevelPrediction::createPreprocessSettings(), executeSettings = PatientLevelPrediction::createExecuteSettings( - runSplitData = T, - runSampleData = F, - runfeatureEngineering = F, - runPreprocessData = T, - runModelDevelopment = T, - runCovariateSummary = F + runSplitData = TRUE, + runSampleData = FALSE, + runfeatureEngineering = FALSE, + runPreprocessData = TRUE, + runModelDevelopment = TRUE, + runCovariateSummary = FALSE ), saveDirectory = file.path(testLoc, "ResNet") ) @@ -71,17 +73,19 @@ sink() test_that("ResNet with runPlp working checks", { testthat::expect_false(is.null(res2)) - + # check structure testthat::expect_true("prediction" %in% names(res2)) testthat::expect_true("model" %in% names(res2)) testthat::expect_true("covariateSummary" %in% names(res2)) testthat::expect_true("performanceEvaluation" %in% names(res2)) - + # check prediction same size as pop testthat::expect_equal(nrow(res2$prediction %>% - dplyr::filter(evaluationType %in% c("Train", "Test"))), nrow(population)) - + 
dplyr::filter(evaluationType %in% c("Train", + "Test"))), + nrow(population)) + # check prediction between 0 and 1 testthat::expect_gte(min(res2$prediction$value), 0) testthat::expect_lte(max(res2$prediction$value), 1) @@ -89,45 +93,45 @@ test_that("ResNet with runPlp working checks", { test_that("ResNet nn-module works ", { - ResNet <- reticulate::import_from_path("ResNet", path=path)$ResNet - model <- ResNet( - cat_features = 5, - num_features = 1, + resNet <- reticulate::import_from_path("ResNet", path = path)$ResNet + model <- resNet( + cat_features = 5, + num_features = 1, size_embedding = 5, - size_hidden = 16, - num_layers = 1, + size_hidden = 16, + num_layers = 1, hidden_factor = 2, activation = torch$nn$ReLU, - normalization = torch$nn$BatchNorm1d, + normalization = torch$nn$BatchNorm1d, hidden_dropout = 0.3, residual_dropout = 0.3 ) - + pars <- sum(reticulate::iterate(model$parameters(), function(x) x$numel())) - + # expected number of parameters expect_equal(pars, 1295) - + input <- list() input$cat <- torch$randint(0L, 5L, c(10L, 5L), dtype = torch$long) input$num <- torch$randn(10L, 1L, dtype = torch$float32) - - + + output <- model(input) - + # output is correct shape expect_equal(output$shape[0], 10L) - + input$num <- NULL - model <- ResNet( - cat_features = 5, - num_features = 0, + model <- resNet( + cat_features = 5, + num_features = 0, size_embedding = 5, - size_hidden = 16, - num_layers = 1, + size_hidden = 16, + num_layers = 1, hidden_factor = 2, activation = torch$nn$ReLU, - normalization = torch$nn$BatchNorm1d, + normalization = torch$nn$BatchNorm1d, hidden_dropout = 0.3, residual_dropout = 0.3 ) @@ -139,60 +143,59 @@ test_that("ResNet nn-module works ", { test_that("Default Resnet works", { defaultResNet <- setDefaultResNet() params <- defaultResNet$param[[1]] - + expect_equal(params$numLayers, 6) expect_equal(params$sizeHidden, 512) expect_equal(params$hiddenFactor, 2) expect_equal(params$residualDropout, 0.1) expect_equal(params$hiddenDropout, 0.4) expect_equal(params$sizeEmbedding, 256) - -}) + +}) test_that("Errors are produced by settings function", { randomSample <- 2 - - expect_error(setResNet( - numLayers = 1, - sizeHidden = 128, - hiddenFactor = 1, - residualDropout = 0.0, - hiddenDropout = 0.0, - sizeEmbedding = 128, - estimatorSettings = setEstimator(weightDecay = 1e-6, - learningRate = 0.01, - seed = 42), - hyperParamSearch = 'random', - randomSample = randomSample)) + + expect_error(setResNet(numLayers = 1, + sizeHidden = 128, + hiddenFactor = 1, + residualDropout = 0.0, + hiddenDropout = 0.0, + sizeEmbedding = 128, + estimatorSettings = setEstimator(weightDecay = 1e-6, + learningRate = 0.01, + seed = 42), + hyperParamSearch = "random", + randomSample = randomSample)) }) -test_that("Can upload results to database", { - cohortDefinitions = data.frame( - cohortName = c('blank1'), - cohortId = c(1), - json = c('json') +test_that("Can upload results to database", { + cohortDefinitions <- data.frame( + cohortName = c("blank1"), + cohortId = c(1), + json = c("json") ) - + sink(nullfile()) - sqliteFile <- insertResultsToSqlite(resultLocation = file.path(testLoc, "ResNet"), - cohortDefinitions = cohortDefinitions) + sqliteFile <- + insertResultsToSqlite(resultLocation = file.path(testLoc, "ResNet"), + cohortDefinitions = cohortDefinitions) sink() - + testthat::expect_true(file.exists(sqliteFile)) - - cdmDatabaseSchema <- 'main' - ohdsiDatabaseSchema <- 'main' + + cdmDatabaseSchema <- "main" + ohdsiDatabaseSchema <- "main" connectionDetails <- 
DatabaseConnector::createConnectionDetails( - dbms = 'sqlite', + dbms = "sqlite", server = sqliteFile ) conn <- DatabaseConnector::connect(connectionDetails = connectionDetails) - targetDialect <- 'sqlite' - + targetDialect <- "sqlite" + # check the results table is populated - sql <- 'select count(*) as N from main.performances;' + sql <- "select count(*) as N from main.performances;" res <- DatabaseConnector::querySql(conn, sql) - testthat::expect_true(res$N[1]>0) + testthat::expect_true(res$N[1] > 0) }) - diff --git a/tests/testthat/test-TrainingCache.R b/tests/testthat/test-TrainingCache.R index debe95c..ec5b063 100644 --- a/tests/testthat/test-TrainingCache.R +++ b/tests/testthat/test-TrainingCache.R @@ -4,75 +4,83 @@ resNetSettings <- setResNet(numLayers = c(1, 2, 4), residualDropout = 0.5, hiddenDropout = 0.5, sizeEmbedding = 64, - estimatorSettings = setEstimator(learningRate=3e-4, - weightDecay=1e-3, - device='cpu', - batchSize=64, - epochs=1, - seed=NULL), + estimatorSettings = + setEstimator(learningRate = 3e-4, + weightDecay = 1e-3, + device = "cpu", + batchSize = 64, + epochs = 1, + seed = NULL), hyperParamSearch = "random", randomSample = 3, randomSampleSeed = NULL) -trainCache <- TrainingCache$new(testLoc) +trainCache <- trainingCache$new(testLoc) paramSearch <- resNetSettings$param test_that("Training cache exists on disk", { testthat::expect_true( - file.exists(file.path(testLoc, "paramPersistence.rds"))) + file.exists(file.path(testLoc, "paramPersistence.rds"))) }) test_that("Grid search can be cached", { gridSearchPredictons <- list() length(gridSearchPredictons) <- length(paramSearch) trainCache$saveGridSearchPredictions(gridSearchPredictons) - + index <- 1 - + gridSearchPredictons[[index]] <- list( prediction = list(NULL), param = paramSearch[[index]] ) trainCache$saveGridSearchPredictions(gridSearchPredictons) - testthat::expect_identical(trainCache$getGridSearchPredictions(), gridSearchPredictons) - testthat::expect_equal(trainCache$getLastGridSearchIndex(), index+1) + testthat::expect_identical(trainCache$getGridSearchPredictions(), + gridSearchPredictons) + testthat::expect_equal(trainCache$getLastGridSearchIndex(), index + 1) }) test_that("Param grid predictions can be cached", { testthat::expect_false(trainCache$isParamGridIdentical(paramSearch)) - + trainCache$saveModelParams(paramSearch) testthat::expect_true(trainCache$isParamGridIdentical(paramSearch)) }) test_that("Estimator can resume training from cache", { trainCache <- readRDS(file.path(fitEstimatorPath, "paramPersistence.rds")) - newPath <- file.path(testLoc, 'resume') + newPath <- file.path(testLoc, "resume") dir.create(newPath) - + # remove last row trainCache$gridSearchPredictions[[2]] <- NULL length(trainCache$gridSearchPredictions) <- 2 - + # save new cache - saveRDS(trainCache, file=file.path(newPath, "paramPersistence.rds")) - + saveRDS(trainCache, file = file.path(newPath, "paramPersistence.rds")) + sink(nullfile()) - fitEstimatorResults <- fitEstimator(trainData$Train, - modelSettings = modelSettings, - analysisId = 1, + fitEstimatorResults <- fitEstimator(trainData$Train, + modelSettings = modelSettings, + analysisId = 1, analysisPath = newPath) sink() - newCache <- readRDS(file.path(newPath, "paramPersistence.rds")) - testthat::expect_equal(nrow(newCache$gridSearchPredictions[[2]]$gridPerformance$hyperSummary), 4) + cS <- nrow(newCache$gridSearchPredictions[[2]]$gridPerformance$hyperSummary) + testthat::expect_equal(cS, 4) }) test_that("Prediction is cached for optimal parameters", { 
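+  # only the best performing hyperparameter combination should keep its full
+  # prediction data.frame in the cache; the others are stored as NULL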
  testCache <- readRDS(file.path(fitEstimatorPath, "paramPersistence.rds"))
-  indexOfMax <- which.max(unlist(lapply(testCache$gridSearchPredictions, function(x) x$gridPerformance$cvPerformance)))
-  indexOfMin <- which.min(unlist(lapply(testCache$gridSearchPredictions, function(x) x$gridPerformance$cvPerformance)))
-  testthat::expect_equal(class(testCache$gridSearchPredictions[[indexOfMax]]$prediction), class(data.frame()))
-  testthat::expect_null(testCache$gridSearchPredictions[[indexOfMin]]$prediction[[1]])
+  indexOfMax <-
+    which.max(unlist(lapply(testCache$gridSearchPredictions,
+                            function(x) x$gridPerformance$cvPerformance)))
+  indexOfMin <-
+    which.min(unlist(lapply(testCache$gridSearchPredictions,
+                            function(x) x$gridPerformance$cvPerformance)))
+  bestPrediction <- testCache$gridSearchPredictions[[indexOfMax]]$prediction
+  testthat::expect_equal(class(bestPrediction), class(data.frame()))
+  worstPrediction <-
+    testCache$gridSearchPredictions[[indexOfMin]]$prediction[[1]]
+  testthat::expect_null(worstPrediction)
 })
diff --git a/tests/testthat/test-Transformer.R b/tests/testthat/test-Transformer.R
index 043cbe7..62fdf12 100644
--- a/tests/testthat/test-Transformer.R
+++ b/tests/testthat/test-Transformer.R
@@ -1,15 +1,15 @@
 settings <- setTransformer(
-  numBlocks = 1, 
-  dimToken = 8, 
+  numBlocks = 1,
+  dimToken = 8,
   dimOut = 1,
-  numHeads = 2, 
-  attDropout = 0.0, 
+  numHeads = 2,
+  attDropout = 0.0,
   ffnDropout = 0.2,
-  resDropout = 0.0, 
-  dimHidden = 32, 
+  resDropout = 0.0,
+  dimHidden = 32,
   estimatorSettings = setEstimator(learningRate = 3e-4,
-                                   batchSize=64,
-                                   epochs=1),
+                                   batchSize = 64,
+                                   epochs = 1),
   randomSample = 1
 )
 
@@ -32,7 +32,11 @@ test_that("Transformer settings work", {
 })
 
 test_that("fitEstimator with Transformer works", {
-  results <- fitEstimator(trainData$Train, settings, analysisId = 1, analysisPath = testLoc)
+  results <-
+    fitEstimator(trainData$Train,
+                 settings,
+                 analysisId = 1,
+                 analysisPath = testLoc)
 
   expect_equal(class(results), "plpModel")
   expect_equal(attr(results, "modelType"), "binary")
@@ -44,16 +48,17 @@ test_that("fitEstimator with Transformer works", {
 })
 
 test_that("transformer nn-module works", {
-  Transformer <- reticulate::import_from_path("Transformer", path=path)$Transformer
-  model <- Transformer(
-    cat_features = 5, 
-    num_features = 1, 
+  transformer <-
+    reticulate::import_from_path("Transformer", path = path)$Transformer
+  model <- transformer(
+    cat_features = 5,
+    num_features = 1,
     num_blocks = 2,
-    dim_token = 16, 
-    num_heads = 2, 
-    att_dropout = 0, 
+    dim_token = 16,
+    num_heads = 2,
+    att_dropout = 0,
     ffn_dropout = 0,
-    res_dropout = 0, 
+    res_dropout = 0,
     dim_hidden = 32
   )
 
@@ -73,15 +78,15 @@ test_that("transformer nn-module works", {
 
   input$num <- NULL
 
-  model <- Transformer(
-    cat_features = 5, 
-    num_features = 0, 
+  model <- transformer(
+    cat_features = 5,
+    num_features = 0,
     num_blocks = 2,
-    dim_token = 16, 
-    num_heads = 2, 
-    att_dropout = 0, 
+    dim_token = 16,
+    num_heads = 2,
+    att_dropout = 0,
     ffn_dropout = 0,
-    res_dropout = 0, 
+    res_dropout = 0,
     dim_hidden = 32
   )
   output <- model(input)
@@ -89,26 +94,26 @@
   input$num <- reticulate::py_none()
   output <- model(input)
   expect_equal(output$shape[0], 10L)
 })
 
 test_that("Default Transformer works", {
   defaultTransformer <- setDefaultTransformer()
   params <- defaultTransformer$param[[1]]
-  
+
   expect_equal(params$numBlocks, 3)
   expect_equal(params$dimToken, 192)
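+  # the remaining defaults appear to match the FT-Transformer configuration
+  # described by Gorishniy et al. (2021)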
expect_equal(params$numHeads, 8) expect_equal(params$resDropout, 0.0) expect_equal(params$attDropout, 0.2) - - settings <- attr(defaultTransformer, 'settings') - - expect_equal(settings$name, 'defaultTransformer') -}) + + settings <- attr(defaultTransformer, "settings") + + expect_equal(settings$name, "defaultTransformer") +}) test_that("Errors are produced by settings function", { randomSample <- 2 - + expect_error(setTransformer(randomSample = randomSample)) }) @@ -126,7 +134,37 @@ test_that("dimHidden ratio works as expected", { testthat::expect_error(setTransformer(dimHidden = NULL, dimHiddenRatio = NULL)) testthat::expect_error(setTransformer(dimHidden = 256, - dimHiddenRatio = 4/3)) + dimHiddenRatio = 4 / 3)) + +}) + +test_that("numerical embedding works as expected", { + embeddings <- 32L # size of embeddings + features <- 2L # number of numerical features + patients <- 9L + + numTensor <- torch$randn(c(patients, features)) + + numericalEmbeddingClass <- + reticulate::import_from_path("ResNet", path = path)$NumericalEmbedding + numericalEmbedding <- numericalEmbeddingClass(num_embeddings = features, + embedding_dim = embeddings, + bias = TRUE) + out <- numericalEmbedding(numTensor) + + # should be patients x features x embedding size + expect_equal(out$shape[[0]], patients) + expect_equal(out$shape[[1]], features) + expect_equal(out$shape[[2]], embeddings) + + numericalEmbedding <- numericalEmbeddingClass(num_embeddings = features, + embedding_dim = embeddings, + bias = FALSE) + + out <- numericalEmbedding(numTensor) + expect_equal(out$shape[[0]], patients) + expect_equal(out$shape[[1]], features) + expect_equal(out$shape[[2]], embeddings) })