From 4430e1358dfcb8af7f768cb790e09454bbf6dd94 Mon Sep 17 00:00:00 2001 From: Egill Fridgeirsson Date: Wed, 7 Sep 2022 14:55:17 +0200 Subject: [PATCH 01/13] Fix randomSample default value for transformer and add error messages if randomSamples is higher than amount of hyperparameters --- R/MLP.R | 7 ++++++- R/ResNet.R | 6 ++++++ R/Transformer.R | 8 +++++++- 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/R/MLP.R b/R/MLP.R index dc5f56a..29659c1 100644 --- a/R/MLP.R +++ b/R/MLP.R @@ -66,7 +66,12 @@ setMultiLayerPerceptron <- function(numLayers = c(1:8), ) param <- PatientLevelPrediction::listCartesian(paramGrid) - + if (randomSamples>length(param)) { + stop(paste("\n Chosen amount of randomSamples is higher than the amount of possible hyperparameter combinations.", + "\n randomSamples:", randomSamples,"\n Possible hyperparameter combinations:", length(param), + "\n Please lower the amount of randomSamples")) + } + if (hyperParamSearch == "random") { param <- param[sample(length(param), randomSample)] } diff --git a/R/ResNet.R b/R/ResNet.R index e8ce322..a19660e 100644 --- a/R/ResNet.R +++ b/R/ResNet.R @@ -73,6 +73,12 @@ setResNet <- function(numLayers = c(1:8), param <- PatientLevelPrediction::listCartesian(paramGrid) + if (randomSamples>length(param)) { + stop(paste("\n Chosen amount of randomSamples is higher than the amount of possible hyperparameter combinations.", + "\n randomSamples:", randomSamples,"\n Possible hyperparameter combinations:", length(param), + "\n Please lower the amount of randomSamples")) + } + if (hyperParamSearch == "random") { param <- param[sample(length(param), randomSample)] } diff --git a/R/Transformer.R b/R/Transformer.R index 0ab7227..d799b9d 100644 --- a/R/Transformer.R +++ b/R/Transformer.R @@ -26,7 +26,7 @@ setTransformer <- function(numBlocks = 3, dimToken = 96, dimOut = 1, resDropout = 0, dimHidden = 512, weightDecay = 1e-6, learningRate = 3e-4, batchSize = 1024, epochs = 10, device = "cpu", hyperParamSearch = "random", - randomSamples = 100, seed = NULL) { + randomSamples = 1, seed = NULL) { if (is.null(seed)) { seed <- as.integer(sample(1e5, 1)) } @@ -54,6 +54,12 @@ setTransformer <- function(numBlocks = 3, dimToken = 96, dimOut = 1, param <- PatientLevelPrediction::listCartesian(paramGrid) + if (randomSamples>length(param)) { + stop(paste("\n Chosen amount of randomSamples is higher than the amount of possible hyperparameter combinations.", + "\n randomSamples:", randomSamples,"\n Possible hyperparameter combinations:", length(param), + "\n Please lower the amount of randomSamples")) + } + if (hyperParamSearch == "random") { param <- param[sample(length(param), randomSamples)] } From 9dd84bde981b82e29ec8bcc28d610e1abf64f0e8 Mon Sep 17 00:00:00 2001 From: Egill Fridgeirsson Date: Wed, 7 Sep 2022 14:55:17 +0200 Subject: [PATCH 02/13] Fix randomSample default value for transformer and add error messages if randomSamples is higher than amount of hyperparameters --- R/MLP.R | 7 ++++++- R/ResNet.R | 6 ++++++ R/Transformer.R | 8 +++++++- 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/R/MLP.R b/R/MLP.R index dc5f56a..7df166a 100644 --- a/R/MLP.R +++ b/R/MLP.R @@ -66,7 +66,12 @@ setMultiLayerPerceptron <- function(numLayers = c(1:8), ) param <- PatientLevelPrediction::listCartesian(paramGrid) - + if (randomSample>length(param)) { + stop(paste("\n Chosen amount of randomSamples is higher than the amount of possible hyperparameter combinations.", + "\n randomSample:", randomSample,"\n Possible hyperparameter combinations:", 
length(param), + "\n Please lower the amount of randomSamples")) + } + if (hyperParamSearch == "random") { param <- param[sample(length(param), randomSample)] } diff --git a/R/ResNet.R b/R/ResNet.R index e8ce322..fafb40b 100644 --- a/R/ResNet.R +++ b/R/ResNet.R @@ -73,6 +73,12 @@ setResNet <- function(numLayers = c(1:8), param <- PatientLevelPrediction::listCartesian(paramGrid) + if (randomSample>length(param)) { + stop(paste("\n Chosen amount of randomSamples is higher than the amount of possible hyperparameter combinations.", + "\n randomSample:", randomSample,"\n Possible hyperparameter combinations:", length(param), + "\n Please lower the amount of randomSamples")) + } + if (hyperParamSearch == "random") { param <- param[sample(length(param), randomSample)] } diff --git a/R/Transformer.R b/R/Transformer.R index 0ab7227..ec4d469 100644 --- a/R/Transformer.R +++ b/R/Transformer.R @@ -26,7 +26,7 @@ setTransformer <- function(numBlocks = 3, dimToken = 96, dimOut = 1, resDropout = 0, dimHidden = 512, weightDecay = 1e-6, learningRate = 3e-4, batchSize = 1024, epochs = 10, device = "cpu", hyperParamSearch = "random", - randomSamples = 100, seed = NULL) { + randomSample = 1, seed = NULL) { if (is.null(seed)) { seed <- as.integer(sample(1e5, 1)) } @@ -54,6 +54,12 @@ setTransformer <- function(numBlocks = 3, dimToken = 96, dimOut = 1, param <- PatientLevelPrediction::listCartesian(paramGrid) + if (randomSample>length(param)) { + stop(paste("\n Chosen amount of randomSamples is higher than the amount of possible hyperparameter combinations.", + "\n randomSample:", randomSample,"\n Possible hyperparameter combinations:", length(param), + "\n Please lower the amount of randomSample")) + } + if (hyperParamSearch == "random") { param <- param[sample(length(param), randomSamples)] } From d3f35ae434da6acf1c63196f8012421bd7430b33 Mon Sep 17 00:00:00 2001 From: Egill Fridgeirsson Date: Wed, 7 Sep 2022 16:15:11 +0200 Subject: [PATCH 03/13] fix docs --- man/setTransformer.Rd | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/man/setTransformer.Rd b/man/setTransformer.Rd index 54927b0..500f548 100644 --- a/man/setTransformer.Rd +++ b/man/setTransformer.Rd @@ -19,7 +19,7 @@ setTransformer( epochs = 10, device = "cpu", hyperParamSearch = "random", - randomSamples = 100, + randomSample = 1, seed = NULL ) } @@ -52,9 +52,9 @@ setTransformer( \item{hyperParamSearch}{what kind of hyperparameter search to do, default 'random'} -\item{randomSamples}{How many samples to use in hyperparameter search if random} - \item{seed}{Random seed to use} + +\item{randomSamples}{How many samples to use in hyperparameter search if random} } \description{ A transformer model From 4b77866880b3b9bb834983f6efb1a38c84cf674b Mon Sep 17 00:00:00 2001 From: Egill Fridgeirsson Date: Wed, 7 Sep 2022 16:27:23 +0200 Subject: [PATCH 04/13] removed lingering randomSamples --- R/Transformer.R | 4 ++-- man/setTransformer.Rd | 2 +- tests/testthat/test-Transformer.R | 2 +- vignettes/BuildingDeepModels.Rmd | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/R/Transformer.R b/R/Transformer.R index ec4d469..15ce545 100644 --- a/R/Transformer.R +++ b/R/Transformer.R @@ -17,7 +17,7 @@ #' @param epochs How many epochs to run the model for #' @param device Which device to use, cpu or cuda #' @param hyperParamSearch what kind of hyperparameter search to do, default 'random' -#' @param randomSamples How many samples to use in hyperparameter search if random +#' @param randomSample How many samples to use in 
hyperparameter search if random #' @param seed Random seed to use #' #' @export @@ -61,7 +61,7 @@ setTransformer <- function(numBlocks = 3, dimToken = 96, dimOut = 1, } if (hyperParamSearch == "random") { - param <- param[sample(length(param), randomSamples)] + param <- param[sample(length(param), randomSample)] } attr(param, "settings") <- list( diff --git a/man/setTransformer.Rd b/man/setTransformer.Rd index 500f548..9ef10ea 100644 --- a/man/setTransformer.Rd +++ b/man/setTransformer.Rd @@ -54,7 +54,7 @@ setTransformer( \item{seed}{Random seed to use} -\item{randomSamples}{How many samples to use in hyperparameter search if random} +\item{randomSample}{How many samples to use in hyperparameter search if random} } \description{ A transformer model diff --git a/tests/testthat/test-Transformer.R b/tests/testthat/test-Transformer.R index 913e687..60547de 100644 --- a/tests/testthat/test-Transformer.R +++ b/tests/testthat/test-Transformer.R @@ -2,7 +2,7 @@ settings <- setTransformer( numBlocks = 1, dimToken = 8, dimOut = 1, numHeads = 2, attDropout = 0.0, ffnDropout = 0.2, resDropout = 0.0, dimHidden = 32, batchSize = 64, - epochs = 1, randomSamples = 1 + epochs = 1, randomSample = 1 ) test_that("Transformer settings work", { diff --git a/vignettes/BuildingDeepModels.Rmd b/vignettes/BuildingDeepModels.Rmd index 8ac9925..41ee944 100644 --- a/vignettes/BuildingDeepModels.Rmd +++ b/vignettes/BuildingDeepModels.Rmd @@ -454,7 +454,7 @@ modelSettings <- setTransformer(numBlocks = 3, batchSize = 128, epochs = 10, device = 'cpu', # or 'cuda' for GPU - randomSamples = 1) + randomSample = 1) TransformerResult <- PatientLevelPrediction::runPlp( From 437699755fcc5d341bf6fcc4c6d48dae34b754e3 Mon Sep 17 00:00:00 2001 From: Egill Fridgeirsson Date: Sun, 9 Oct 2022 14:34:38 +0200 Subject: [PATCH 05/13] Resnet work both for concating numerics to embeddings or adding embeddings of numerics to other embeddings --- R/ResNet.R | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/R/ResNet.R b/R/ResNet.R index fafb40b..ac9200f 100644 --- a/R/ResNet.R +++ b/R/ResNet.R @@ -112,14 +112,17 @@ ResNet <- torch::nn_module( initialize = function(catFeatures, numFeatures = 0, sizeEmbedding, sizeHidden, numLayers, hiddenFactor, activation = torch::nn_relu, normalization = torch::nn_batch_norm1d, hiddenDropout = NULL, - residualDropout = NULL, d_out = 1) { + residualDropout = NULL, d_out = 1, concatNum=TRUE) { self$embedding <- torch::nn_embedding_bag( num_embeddings = catFeatures + 1, embedding_dim = sizeEmbedding, padding_idx = 1 ) - if (numFeatures != 0) { + if (numFeatures != 0 & concatNum != TRUE) { self$numEmbedding <- numericalEmbedding(numFeatures, sizeEmbedding) + } else { + self$numEmbedding <- NULL + sizeEmbedding <- sizeEmbedding + numFeatures } self$first_layer <- torch::nn_linear(sizeEmbedding, sizeHidden) @@ -146,9 +149,10 @@ ResNet <- torch::nn_module( x_cat <- x$cat x_num <- x$num x_cat <- self$embedding(x_cat + 1L) # padding_idx is 1 - if (!is.null(x_num)) { + if (!is.null(x_num) & (!is.null(self$numEmbedding))) { x <- (x_cat + self$numEmbedding(x_num)$mean(dim = 2)) / 2 - # x <- torch::torch_cat(list(x_cat, x_num), dim = 2L) + } else if (!is.null(x_num) & is.null(self$numEmbedding)) { + x <- torch::torch_cat(list(x_cat, x_num), dim = 2L) } else { x <- x_cat } From 5a92835b596aec46ac34fc5282b4f08d43974368 Mon Sep 17 00:00:00 2001 From: Egill Fridgeirsson Date: Sun, 9 Oct 2022 14:35:57 +0200 Subject: [PATCH 06/13] update description --- DESCRIPTION | 6 +++--- R/Estimator.R | 18 
+++++++++--------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 8fa61a7..b135257 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -2,7 +2,7 @@ Package: DeepPatientLevelPrediction Type: Package Title: Deep Learning For Patient Level Prediction Using Data In The OMOP Common Data Model Version: 1.0.0 -Date: 29-08-2022 +Date: 09-10-2022 Authors@R: c( person("Jenna", "Reps", email = "jreps@its.jnj.com", role = c("aut")), person("Egill", "Fridgeirsson", email = "e.fridgeirsson@erasmusmc.nl", role = c("aut", "cre")), @@ -23,7 +23,7 @@ Imports: data.table, FeatureExtraction (>= 3.0.0), ParallelLogger (>= 2.0.0), - PatientLevelPrediction, + PatientLevelPrediction, (>= 6.0.4) rlang, torch (>= 0.8.0) Suggests: @@ -34,7 +34,7 @@ Suggests: plyr, testthat Remotes: - ohdsi/PatientLevelPrediction@develop, + ohdsi/PatientLevelPrediction, ohdsi/FeatureExtraction, ohdsi/Eunomia RoxygenNote: 7.2.1 diff --git a/R/Estimator.R b/R/Estimator.R index 8abb048..0ad632b 100644 --- a/R/Estimator.R +++ b/R/Estimator.R @@ -27,7 +27,6 @@ #' @param optimizer which optimizer to use #' @param scheduler which learning rate scheduler to use #' @param criterion loss function to use -#' @param posWeight If more weight should be added to positive labels during training - will result in miscalibrated models #' @param earlyStopping If earlyStopping should be used which stops the training of your metric is not improving #' @param earlyStoppingMetric Which parameter to use for early stopping #' @param patience patience for earlyStopper @@ -247,7 +246,6 @@ gridCvDeep <- function(mappedData, ParallelLogger::logInfo(paste0("Fold ", i)) trainDataset <- torch::dataset_subset(dataset, indices = which(fold != i)) testDataset <- torch::dataset_subset(dataset, indices = which(fold == i)) - # fitParams$posWeight <- trainDataset$dataset$posWeight estimator <- Estimator$new( baseModel = baseModel, modelParameters = modelParams, @@ -301,7 +299,6 @@ gridCvDeep <- function(mappedData, fitParams <- finalParam[fitParamNames] fitParams$epochs <- finalParam$learnSchedule$bestEpoch fitParams$batchSize <- batchSize - fitParams$posWeight <- dataset$posWeight # create the dir if (!dir.exists(file.path(modelLocation))) { dir.create(file.path(modelLocation), recursive = T) @@ -388,7 +385,6 @@ Estimator <- R6::R6Class( self$learningRate <- self$itemOrDefaults(fitParameters, "learningRate", 1e-3) self$l2Norm <- self$itemOrDefaults(fitParameters, "weightDecay", 1e-5) self$batchSize <- self$itemOrDefaults(fitParameters, "batchSize", 1024) - self$posWeight <- self$itemOrDefaults(fitParameters, "posWeight", 1) self$prefix <- self$itemOrDefaults(fitParameters, "prefix", self$model$name) self$previousEpochs <- self$itemOrDefaults(fitParameters, "previousEpochs", 0) @@ -399,9 +395,7 @@ Estimator <- R6::R6Class( lr = self$learningRate, weight_decay = self$l2Norm ) - self$criterion <- criterion(torch::torch_tensor(self$posWeight, - device = self$device - )) + self$criterion <- criterion() self$scheduler <- scheduler(self$optimizer, patience = 1, @@ -615,14 +609,20 @@ Estimator <- R6::R6Class( batchIndex <- 1:length(dataset) batchIndex <- split(batchIndex, ceiling(seq_along(batchIndex) / self$batchSize)) torch::with_no_grad({ - predictions <- c() + predictions <- torch::torch_empty(length(dataset), device=self$device) self$model$eval() + progressBar <- utils::txtProgressBar(style = 3) + ix <- 1 coro::loop(for (b in batchIndex) { batch <- self$batchToDevice(dataset[b]) target <- batch$target pred <- self$model(batch$batch) 
- predictions <- c(predictions, as.array(torch::torch_sigmoid(pred$cpu()))) + predictions[b] <- torch::torch_sigmoid(pred) + utils::setTxtProgressBar(progressBar, ix / length(batchIndex)) + ix <- ix + 1 }) + predictions <- as.array(predictions$cpu()) + close(progressBar) }) return(predictions) }, From 17c630a71d686f5660b65b1a6a0617cf5d2f195b Mon Sep 17 00:00:00 2001 From: Egill Fridgeirsson Date: Sun, 9 Oct 2022 16:18:30 +0200 Subject: [PATCH 07/13] prepare for release --- DESCRIPTION | 2 +- man/setEstimator.Rd | 2 -- man/setTransformer.Rd | 4 ++-- tests/testthat/test-ResNet.R | 2 +- 4 files changed, 4 insertions(+), 6 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index b135257..cdb11c6 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -23,7 +23,7 @@ Imports: data.table, FeatureExtraction (>= 3.0.0), ParallelLogger (>= 2.0.0), - PatientLevelPrediction, (>= 6.0.4) + PatientLevelPrediction (>= 6.0.4), rlang, torch (>= 0.8.0) Suggests: diff --git a/man/setEstimator.Rd b/man/setEstimator.Rd index d3001e3..50919af 100644 --- a/man/setEstimator.Rd +++ b/man/setEstimator.Rd @@ -14,8 +14,6 @@ \item{criterion}{loss function to use} -\item{posWeight}{If more weight should be added to positive labels during training - will result in miscalibrated models} - \item{earlyStopping}{If earlyStopping should be used which stops the training of your metric is not improving} \item{earlyStoppingMetric}{Which parameter to use for early stopping} diff --git a/man/setTransformer.Rd b/man/setTransformer.Rd index 9ef10ea..f46a66a 100644 --- a/man/setTransformer.Rd +++ b/man/setTransformer.Rd @@ -52,9 +52,9 @@ setTransformer( \item{hyperParamSearch}{what kind of hyperparameter search to do, default 'random'} -\item{seed}{Random seed to use} - \item{randomSample}{How many samples to use in hyperparameter search if random} + +\item{seed}{Random seed to use} } \description{ A transformer model diff --git a/tests/testthat/test-ResNet.R b/tests/testthat/test-ResNet.R index cb9a397..6fb0f91 100644 --- a/tests/testthat/test-ResNet.R +++ b/tests/testthat/test-ResNet.R @@ -87,7 +87,7 @@ test_that("ResNet nn-module works ", { pars <- sum(sapply(model$parameters, function(x) prod(x$shape))) # expected number of parameters - expect_equal(pars, 1289) + expect_equal(pars, 1295) input <- list() input$cat <- torch::torch_randint(0, 5, c(10, 5), dtype = torch::torch_long()) From c0c3d3029518808cd584796adcee33704812c82b Mon Sep 17 00:00:00 2001 From: Egill Fridgeirsson Date: Sun, 9 Oct 2022 17:59:51 +0200 Subject: [PATCH 08/13] website action only on main branch --- .github/workflows/pkgdown.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml index 7a5e8ac..d22bb32 100644 --- a/.github/workflows/pkgdown.yaml +++ b/.github/workflows/pkgdown.yaml @@ -2,7 +2,7 @@ # Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help on: push: - branches: [main, develop] + branches: [main] release: types: [published] workflow_dispatch: From 7d21ed7931a6354a3f2cc37a2e9a38ff60080226 Mon Sep 17 00:00:00 2001 From: Egill Fridgeirsson Date: Sun, 9 Oct 2022 18:51:58 +0200 Subject: [PATCH 09/13] move Estimator class to a separate file --- R/Estimator-class.R | 403 +++++++++++++++++++++++++++++++++++++++++++ R/Estimator.R | 388 +---------------------------------------- man/EarlyStopping.Rd | 2 +- man/Estimator.Rd | 2 +- 4 files changed, 406 insertions(+), 389 deletions(-) create mode 100644 R/Estimator-class.R diff --git a/R/Estimator-class.R b/R/Estimator-class.R new file mode 100644 index 0000000..33980e1 --- /dev/null +++ b/R/Estimator-class.R @@ -0,0 +1,403 @@ +# @file Estimator-class.R +# +# Copyright 2022 Observational Health Data Sciences and Informatics +# +# This file is part of DeepPatientLevelPrediction +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#' Estimator +#' @description +#' A generic R6 class that wraps around a torch nn module and can be used to +#' fit and predict the model defined in that module. +#' @export +Estimator <- R6::R6Class( + classname = "Estimator", + lock_objects = FALSE, + public = list( + #' @description + #' Creates a new estimator + #' @param baseModel The torch nn module to use as model + #' @param modelParameters Parameters to initialize the baseModel + #' @param fitParameters Parameters required for the estimator fitting + #' @param optimizer A torch optimizer to use, default is Adam + #' @param criterion The torch loss function to use, defaults to + #' binary cross entropy with logits + #' @param scheduler learning rate scheduler to use + #' @param device Which device to use for fitting, default is cpu + #' @param patience Patience to use for early stopping + initialize = function(baseModel, + modelParameters, + fitParameters, + optimizer = torch::optim_adam, + criterion = torch::nn_bce_with_logits_loss, + scheduler = torch::lr_reduce_on_plateau, + device = "cpu", + patience = 4) { + self$device <- device + self$model <- do.call(baseModel, modelParameters) + self$modelParameters <- modelParameters + self$fitParameters <- fitParameters + self$epochs <- self$itemOrDefaults(fitParameters, "epochs", 10) + self$learningRate <- self$itemOrDefaults(fitParameters, "learningRate", 1e-3) + self$l2Norm <- self$itemOrDefaults(fitParameters, "weightDecay", 1e-5) + self$batchSize <- self$itemOrDefaults(fitParameters, "batchSize", 1024) + self$prefix <- self$itemOrDefaults(fitParameters, "prefix", self$model$name) + + self$previousEpochs <- self$itemOrDefaults(fitParameters, "previousEpochs", 0) + self$model$to(device = self$device) + + self$optimizer <- optimizer( + params = self$model$parameters, + lr = self$learningRate, + weight_decay = self$l2Norm + ) + self$criterion <- criterion() + + self$scheduler <- scheduler(self$optimizer, + patience = 1, + verbose = FALSE, mode = "max" + ) + + # gradient accumulation is useful when training 
large numbers where + # you can only fit few samples on the GPU in each batch. + self$gradAccumulationIter <- 1 + + if (!is.null(patience)) { + self$earlyStopper <- EarlyStopping$new(patience = patience) + } else { + self$earlyStopper <- NULL + } + + self$bestScore <- NULL + self$bestEpoch <- NULL + }, + + #' @description fits the estimator + #' @param dataset a torch dataset to use for model fitting + #' @param testDataset a torch dataset to use for early stopping + fit = function(dataset, testDataset) { + valLosses <- c() + valAUCs <- c() + batchIndex <- torch::torch_randperm(length(dataset)) + 1L + batchIndex <- split(batchIndex, ceiling(seq_along(batchIndex) / self$batchSize)) + + testBatchIndex <- 1:length(testDataset) + testBatchIndex <- split(testBatchIndex, ceiling(seq_along(testBatchIndex) / self$batchSize)) + + modelStateDict <- list() + epoch <- list() + times <- list() + learnRates <- list() + for (epochI in 1:self$epochs) { + startTime <- Sys.time() + trainLoss <- self$fitEpoch(dataset, batchIndex) + endTime <- Sys.time() + + # predict on test data + scores <- self$score(testDataset, testBatchIndex) + delta <- endTime - startTime + currentEpoch <- epochI + self$previousEpochs + lr <- self$optimizer$param_groups[[1]]$lr + ParallelLogger::logInfo( + "Epochs: ", currentEpoch, + " | Val AUC: ", round(scores$auc, 3), + " | Val Loss: ", round(scores$loss, 3), + " | Train Loss: ", round(trainLoss, 3), + " | Time: ", round(delta, 3), " ", + units(delta), + " | LR: ", lr + ) + self$scheduler$step(scores$auc) + valLosses <- c(valLosses, scores$loss) + valAUCs <- c(valAUCs, scores$auc) + learnRates <- c(learnRates, lr) + times <- c(times, round(delta, 3)) + if (!is.null(self$earlyStopper)) { + self$earlyStopper$call(scores$auc) + if (self$earlyStopper$improved) { + # here it saves the results to lists rather than files + modelStateDict[[epochI]] <- lapply(self$model$state_dict(), function(x) x$detach()$cpu()) + epoch[[epochI]] <- currentEpoch + } + if (self$earlyStopper$earlyStop) { + ParallelLogger::logInfo("Early stopping, validation AUC stopped improving") + ParallelLogger::logInfo("Average time per epoch was: ", round(mean(as.numeric(times)), 3), " ", units(delta)) + self$finishFit(valAUCs, modelStateDict, valLosses, epoch, learnRates) + return(invisible(self)) + } + } else { + modelStateDict[[epochI]] <- lapply(self$model$state_dict(), function(x) x$detach()$cpu()) + epoch[[epochI]] <- currentEpoch + } + } + ParallelLogger::logInfo("Average time per epoch was: ", round(mean(as.numeric(times)), 3), " ", units(delta)) + self$finishFit(valAUCs, modelStateDict, valLosses, epoch, learnRates) + invisible(self) + }, + + #' @description + #' fits estimator for one epoch (one round through the data) + #' @param dataset torch dataset to use for fitting + #' @param batchIndex indices of batches + fitEpoch = function(dataset, batchIndex) { + trainLosses <- torch::torch_empty(length(batchIndex)) + ix <- 1 + self$model$train() + progressBar <- utils::txtProgressBar(style = 3) + coro::loop(for (b in batchIndex) { + self$optimizer$zero_grad() + batch <- self$batchToDevice(dataset[b]) + out <- self$model(batch[[1]]) + loss <- self$criterion(out, batch[[2]]) + loss$backward() + + self$optimizer$step() + trainLosses[ix] <- loss$detach() + utils::setTxtProgressBar(progressBar, ix / length(batchIndex)) + ix <- ix + 1 + }) + close(progressBar) + trainLosses$mean()$item() + }, + + #' @description + #' calculates loss and auc after training for one epoch + #' @param dataset The torch dataset to use to 
evaluate loss and auc + #' @param batchIndex Indices of batches in the dataset + #' @return list with average loss and auc in the dataset + score = function(dataset, batchIndex) { + torch::with_no_grad({ + loss <- torch::torch_empty(c(length(batchIndex))) + predictions <- list() + targets <- list() + self$model$eval() + ix <- 1 + coro::loop(for (b in batchIndex) { + batch <- self$batchToDevice(dataset[b]) + pred <- self$model(batch[[1]]) + predictions <- c(predictions, pred) + targets <- c(targets, batch[[2]]) + loss[ix] <- self$criterion(pred, batch[[2]]) + ix <- ix + 1 + }) + mean_loss <- loss$mean()$item() + predictionsClass <- data.frame( + value = as.matrix(torch::torch_sigmoid(torch::torch_cat(predictions)$cpu())), + outcomeCount = as.matrix(torch::torch_cat(targets)$cpu()) + ) + attr(predictionsClass, "metaData")$modelType <- "binary" + auc <- PatientLevelPrediction::computeAuc(predictionsClass) + }) + return(list(loss = mean_loss, auc = auc)) + }, + + #' @description + #' operations that run when fitting is finished + #' @param valAUCs validation AUC values + #' @param modelStateDict fitted model parameters + #' @param valLosses validation losses + #' @param epoch list of epochs fit + #' @param learnRates learning rate sequence used so far + finishFit = function(valAUCs, modelStateDict, valLosses, epoch, learnRates) { + bestEpochInd <- which.max(valAUCs) # change this if a different metric is used + + bestModelStateDict <- lapply(modelStateDict[[bestEpochInd]], function(x) x$to(device = self$device)) + self$model$load_state_dict(bestModelStateDict) + + bestEpoch <- epoch[[bestEpochInd]] + self$bestEpoch <- bestEpoch + self$bestScore <- list( + loss = valLosses[bestEpochInd], + auc = valAUCs[bestEpochInd] + ) + self$learnRateSchedule <- learnRates[1:bestEpochInd] + + ParallelLogger::logInfo("Loaded best model (based on AUC) from epoch ", bestEpoch) + ParallelLogger::logInfo("ValLoss: ", self$bestScore$loss) + ParallelLogger::logInfo("valAUC: ", self$bestScore$auc) + }, + + #' @description + #' Fits whole training set on a specific number of epochs + #' TODO What happens when learning rate changes per epochs? + #' Ideally I would copy the learning rate strategy from before + #' and adjust for different sizes ie more iterations/updates??? 
+ #' @param dataset torch dataset + #' @param learnRates learnRateSchedule from CV + fitWholeTrainingSet = function(dataset, learnRates = NULL) { + if (is.null(self$bestEpoch)) { + self$bestEpoch <- self$epochs + } + + batchIndex <- torch::torch_randperm(length(dataset)) + 1L + batchIndex <- split(batchIndex, ceiling(seq_along(batchIndex) / self$batchSize)) + for (epoch in 1:self$bestEpoch) { + self$optimizer$param_groups[[1]]$lr <- learnRates[[epoch]] + self$fitEpoch(dataset, batchIndex) + } + }, + + #' @description + #' save model and those parameters needed to reconstruct it + #' @param path where to save the model + #' @param name name of file + #' @return the path to saved model + save = function(path, name) { + savePath <- file.path(path, name) + torch::torch_save( + list( + modelStateDict = self$model$state_dict(), + modelParameters = self$modelParameters, + fitParameters = self$fitParameters, + epoch = self$epochs + ), + savePath + ) + return(savePath) + }, + + + #' @description + #' predicts and outputs the probabilities + #' @param dataset Torch dataset to create predictions for + #' @return predictions as probabilities + predictProba = function(dataset) { + batchIndex <- 1:length(dataset) + batchIndex <- split(batchIndex, ceiling(seq_along(batchIndex) / self$batchSize)) + torch::with_no_grad({ + predictions <- torch::torch_empty(length(dataset), device=self$device) + self$model$eval() + progressBar <- utils::txtProgressBar(style = 3) + ix <- 1 + coro::loop(for (b in batchIndex) { + batch <- self$batchToDevice(dataset[b]) + target <- batch$target + pred <- self$model(batch$batch) + predictions[b] <- torch::torch_sigmoid(pred) + utils::setTxtProgressBar(progressBar, ix / length(batchIndex)) + ix <- ix + 1 + }) + predictions <- as.array(predictions$cpu()) + close(progressBar) + }) + return(predictions) + }, + + #' @description + #' predicts and outputs the class + #' @param dataset A torch dataset to create predictions for + #' @param threshold Which threshold to use for predictions + #' @return The predicted class for the data in the dataset + predict = function(dataset, threshold = NULL) { + predictions <- self$predictProba(dataset) + + if (is.null(threshold)) { + # use outcome rate + threshold <- dataset$target$sum()$item() / length(dataset) + } + predicted_class <- as.integer(predictions > threshold) + return(predicted_class) + }, + + #' @description + #' sends a batch of data to device + #' assumes batch includes lists of tensors to arbitrary nested depths + #' @param batch the batch to send, usually a list of torch tensors + #' @return the batch on the required device + batchToDevice = function(batch) { + if (class(batch)[1] == "torch_tensor") { + batch <- batch$to(device = self$device) + } else { + ix <- 1 + for (b in batch) { + if (class(b)[1] == "torch_tensor") { + b <- b$to(device = self$device) + } else { + b <- self$batchToDevice(b) + } + if (!is.null(b)) { + batch[[ix]] <- b + } + ix <- ix + 1 + } + } + return(batch) + }, + + #' @description + #' select item from list, and if it's null sets a default + #' @param list A list with items + #' @param item Which list item to retrieve + #' @param default The value to return if list doesn't have item + #' @return the list item or default + itemOrDefaults = function(list, item, default = NULL) { + value <- list[[item]] + if (is.null(value)) default else value + } + ) +) + +#' Earlystopping class +#' @description +#' Stops training if a loss or metric has stopped improving +EarlyStopping <- R6::R6Class( + classname = 
"EarlyStopping", + lock_objects = FALSE, + public = list( + #' @description + #' Creates a new earlystopping object + #' @param patience Stop after this number of epochs if loss doesn't improve + #' @param delta How much does the loss need to improve to count as improvement + #' @param verbose If information should be printed out + #' @return a new earlystopping object + initialize = function(patience = 3, delta = 0, verbose = TRUE) { + self$patience <- patience + self$counter <- 0 + self$verbose <- verbose + self$bestScore <- NULL + self$earlyStop <- FALSE + self$improved <- FALSE + self$delta <- delta + self$previousScore <- 0 + }, + #' @description + #' call the earlystopping object and increment a counter if loss is not + #' improving + #' @param metric the current metric value + call = function(metric) { + score <- metric + if (is.null(self$bestScore)) { + self$bestScore <- score + self$improved <- TRUE + } else if (score < self$bestScore + self$delta) { + self$counter <- self$counter + 1 + self$improved <- FALSE + if (self$verbose) { + ParallelLogger::logInfo( + "EarlyStopping counter: ", self$counter, + " out of ", self$patience + ) + } + if (self$counter >= self$patience) { + self$earlyStop <- TRUE + } + } else { + self$bestScore <- score + self$counter <- 0 + self$improved <- TRUE + } + self$previousScore <- score + } + ) +) diff --git a/R/Estimator.R b/R/Estimator.R index 0ad632b..00d78c5 100644 --- a/R/Estimator.R +++ b/R/Estimator.R @@ -347,390 +347,4 @@ gridCvDeep <- function(mappedData, numericalIndex = numericalIndex ) ) -} - -#' Estimator -#' @description -#' A generic R6 class that wraps around a torch nn module and can be used to -#' fit and predict the model defined in that module. -#' @export -Estimator <- R6::R6Class( - classname = "Estimator", - lock_objects = FALSE, - public = list( - #' @description - #' Creates a new estimator - #' @param baseModel The torch nn module to use as model - #' @param modelParameters Parameters to initialize the baseModel - #' @param fitParameters Parameters required for the estimator fitting - #' @param optimizer A torch optimizer to use, default is Adam - #' @param criterion The torch loss function to use, defaults to - #' binary cross entropy with logits - #' @param scheduler learning rate scheduler to use - #' @param device Which device to use for fitting, default is cpu - #' @param patience Patience to use for early stopping - initialize = function(baseModel, - modelParameters, - fitParameters, - optimizer = torch::optim_adam, - criterion = torch::nn_bce_with_logits_loss, - scheduler = torch::lr_reduce_on_plateau, - device = "cpu", - patience = 4) { - self$device <- device - self$model <- do.call(baseModel, modelParameters) - self$modelParameters <- modelParameters - self$fitParameters <- fitParameters - self$epochs <- self$itemOrDefaults(fitParameters, "epochs", 10) - self$learningRate <- self$itemOrDefaults(fitParameters, "learningRate", 1e-3) - self$l2Norm <- self$itemOrDefaults(fitParameters, "weightDecay", 1e-5) - self$batchSize <- self$itemOrDefaults(fitParameters, "batchSize", 1024) - self$prefix <- self$itemOrDefaults(fitParameters, "prefix", self$model$name) - - self$previousEpochs <- self$itemOrDefaults(fitParameters, "previousEpochs", 0) - self$model$to(device = self$device) - - self$optimizer <- optimizer( - params = self$model$parameters, - lr = self$learningRate, - weight_decay = self$l2Norm - ) - self$criterion <- criterion() - - self$scheduler <- scheduler(self$optimizer, - patience = 1, - verbose = FALSE, mode = 
"max" - ) - - # gradient accumulation is useful when training large numbers where - # you can only fit few samples on the GPU in each batch. - self$gradAccumulationIter <- 1 - - if (!is.null(patience)) { - self$earlyStopper <- EarlyStopping$new(patience = patience) - } else { - self$earlyStopper <- NULL - } - - self$bestScore <- NULL - self$bestEpoch <- NULL - }, - - #' @description fits the estimator - #' @param dataset a torch dataset to use for model fitting - #' @param testDataset a torch dataset to use for early stopping - fit = function(dataset, testDataset) { - valLosses <- c() - valAUCs <- c() - batchIndex <- torch::torch_randperm(length(dataset)) + 1L - batchIndex <- split(batchIndex, ceiling(seq_along(batchIndex) / self$batchSize)) - - testBatchIndex <- 1:length(testDataset) - testBatchIndex <- split(testBatchIndex, ceiling(seq_along(testBatchIndex) / self$batchSize)) - - modelStateDict <- list() - epoch <- list() - times <- list() - learnRates <- list() - for (epochI in 1:self$epochs) { - startTime <- Sys.time() - trainLoss <- self$fitEpoch(dataset, batchIndex) - endTime <- Sys.time() - - # predict on test data - scores <- self$score(testDataset, testBatchIndex) - delta <- endTime - startTime - currentEpoch <- epochI + self$previousEpochs - lr <- self$optimizer$param_groups[[1]]$lr - ParallelLogger::logInfo( - "Epochs: ", currentEpoch, - " | Val AUC: ", round(scores$auc, 3), - " | Val Loss: ", round(scores$loss, 3), - " | Train Loss: ", round(trainLoss, 3), - " | Time: ", round(delta, 3), " ", - units(delta), - " | LR: ", lr - ) - self$scheduler$step(scores$auc) - valLosses <- c(valLosses, scores$loss) - valAUCs <- c(valAUCs, scores$auc) - learnRates <- c(learnRates, lr) - times <- c(times, round(delta, 3)) - if (!is.null(self$earlyStopper)) { - self$earlyStopper$call(scores$auc) - if (self$earlyStopper$improved) { - # here it saves the results to lists rather than files - modelStateDict[[epochI]] <- lapply(self$model$state_dict(), function(x) x$detach()$cpu()) - epoch[[epochI]] <- currentEpoch - } - if (self$earlyStopper$earlyStop) { - ParallelLogger::logInfo("Early stopping, validation AUC stopped improving") - ParallelLogger::logInfo("Average time per epoch was: ", round(mean(as.numeric(times)), 3), " ", units(delta)) - self$finishFit(valAUCs, modelStateDict, valLosses, epoch, learnRates) - return(invisible(self)) - } - } else { - modelStateDict[[epochI]] <- lapply(self$model$state_dict(), function(x) x$detach()$cpu()) - epoch[[epochI]] <- currentEpoch - } - } - ParallelLogger::logInfo("Average time per epoch was: ", round(mean(as.numeric(times)), 3), " ", units(delta)) - self$finishFit(valAUCs, modelStateDict, valLosses, epoch, learnRates) - invisible(self) - }, - - #' @description - #' fits estimator for one epoch (one round through the data) - #' @param dataset torch dataset to use for fitting - #' @param batchIndex indices of batches - fitEpoch = function(dataset, batchIndex) { - trainLosses <- torch::torch_empty(length(batchIndex)) - ix <- 1 - self$model$train() - progressBar <- utils::txtProgressBar(style = 3) - coro::loop(for (b in batchIndex) { - self$optimizer$zero_grad() - batch <- self$batchToDevice(dataset[b]) - out <- self$model(batch[[1]]) - loss <- self$criterion(out, batch[[2]]) - loss$backward() - - self$optimizer$step() - trainLosses[ix] <- loss$detach() - utils::setTxtProgressBar(progressBar, ix / length(batchIndex)) - ix <- ix + 1 - }) - close(progressBar) - trainLosses$mean()$item() - }, - - #' @description - #' calculates loss and auc after training for 
one epoch - #' @param dataset The torch dataset to use to evaluate loss and auc - #' @param batchIndex Indices of batches in the dataset - #' @return list with average loss and auc in the dataset - score = function(dataset, batchIndex) { - torch::with_no_grad({ - loss <- torch::torch_empty(c(length(batchIndex))) - predictions <- list() - targets <- list() - self$model$eval() - ix <- 1 - coro::loop(for (b in batchIndex) { - batch <- self$batchToDevice(dataset[b]) - pred <- self$model(batch[[1]]) - predictions <- c(predictions, pred) - targets <- c(targets, batch[[2]]) - loss[ix] <- self$criterion(pred, batch[[2]]) - ix <- ix + 1 - }) - mean_loss <- loss$mean()$item() - predictionsClass <- data.frame( - value = as.matrix(torch::torch_sigmoid(torch::torch_cat(predictions)$cpu())), - outcomeCount = as.matrix(torch::torch_cat(targets)$cpu()) - ) - attr(predictionsClass, "metaData")$modelType <- "binary" - auc <- PatientLevelPrediction::computeAuc(predictionsClass) - }) - return(list(loss = mean_loss, auc = auc)) - }, - - #' @description - #' operations that run when fitting is finished - #' @param valAUCs validation AUC values - #' @param modelStateDict fitted model parameters - #' @param valLosses validation losses - #' @param epoch list of epochs fit - #' @param learnRates learning rate sequence used so far - finishFit = function(valAUCs, modelStateDict, valLosses, epoch, learnRates) { - bestEpochInd <- which.max(valAUCs) # change this if a different metric is used - - bestModelStateDict <- lapply(modelStateDict[[bestEpochInd]], function(x) x$to(device = self$device)) - self$model$load_state_dict(bestModelStateDict) - - bestEpoch <- epoch[[bestEpochInd]] - self$bestEpoch <- bestEpoch - self$bestScore <- list( - loss = valLosses[bestEpochInd], - auc = valAUCs[bestEpochInd] - ) - self$learnRateSchedule <- learnRates[1:bestEpochInd] - - ParallelLogger::logInfo("Loaded best model (based on AUC) from epoch ", bestEpoch) - ParallelLogger::logInfo("ValLoss: ", self$bestScore$loss) - ParallelLogger::logInfo("valAUC: ", self$bestScore$auc) - }, - - #' @description - #' Fits whole training set on a specific number of epochs - #' TODO What happens when learning rate changes per epochs? - #' Ideally I would copy the learning rate strategy from before - #' and adjust for different sizes ie more iterations/updates??? 
- #' @param dataset torch dataset - #' @param learnRates learnRateSchedule from CV - fitWholeTrainingSet = function(dataset, learnRates = NULL) { - if (is.null(self$bestEpoch)) { - self$bestEpoch <- self$epochs - } - - batchIndex <- torch::torch_randperm(length(dataset)) + 1L - batchIndex <- split(batchIndex, ceiling(seq_along(batchIndex) / self$batchSize)) - for (epoch in 1:self$bestEpoch) { - self$optimizer$param_groups[[1]]$lr <- learnRates[[epoch]] - self$fitEpoch(dataset, batchIndex) - } - }, - - #' @description - #' save model and those parameters needed to reconstruct it - #' @param path where to save the model - #' @param name name of file - #' @return the path to saved model - save = function(path, name) { - savePath <- file.path(path, name) - torch::torch_save( - list( - modelStateDict = self$model$state_dict(), - modelParameters = self$modelParameters, - fitParameters = self$fitParameters, - epoch = self$epochs - ), - savePath - ) - return(savePath) - }, - - - #' @description - #' predicts and outputs the probabilities - #' @param dataset Torch dataset to create predictions for - #' @return predictions as probabilities - predictProba = function(dataset) { - batchIndex <- 1:length(dataset) - batchIndex <- split(batchIndex, ceiling(seq_along(batchIndex) / self$batchSize)) - torch::with_no_grad({ - predictions <- torch::torch_empty(length(dataset), device=self$device) - self$model$eval() - progressBar <- utils::txtProgressBar(style = 3) - ix <- 1 - coro::loop(for (b in batchIndex) { - batch <- self$batchToDevice(dataset[b]) - target <- batch$target - pred <- self$model(batch$batch) - predictions[b] <- torch::torch_sigmoid(pred) - utils::setTxtProgressBar(progressBar, ix / length(batchIndex)) - ix <- ix + 1 - }) - predictions <- as.array(predictions$cpu()) - close(progressBar) - }) - return(predictions) - }, - - #' @description - #' predicts and outputs the class - #' @param dataset A torch dataset to create predictions for - #' @param threshold Which threshold to use for predictions - #' @return The predicted class for the data in the dataset - predict = function(dataset, threshold = NULL) { - predictions <- self$predictProba(dataset) - - if (is.null(threshold)) { - # use outcome rate - threshold <- dataset$target$sum()$item() / length(dataset) - } - predicted_class <- as.integer(predictions > threshold) - return(predicted_class) - }, - - #' @description - #' sends a batch of data to device - #' assumes batch includes lists of tensors to arbitrary nested depths - #' @param batch the batch to send, usually a list of torch tensors - #' @return the batch on the required device - batchToDevice = function(batch) { - if (class(batch)[1] == "torch_tensor") { - batch <- batch$to(device = self$device) - } else { - ix <- 1 - for (b in batch) { - if (class(b)[1] == "torch_tensor") { - b <- b$to(device = self$device) - } else { - b <- self$batchToDevice(b) - } - if (!is.null(b)) { - batch[[ix]] <- b - } - ix <- ix + 1 - } - } - return(batch) - }, - - #' @description - #' select item from list, and if it's null sets a default - #' @param list A list with items - #' @param item Which list item to retrieve - #' @param default The value to return if list doesn't have item - #' @return the list item or default - itemOrDefaults = function(list, item, default = NULL) { - value <- list[[item]] - if (is.null(value)) default else value - } - ) -) - -#' Earlystopping class -#' @description -#' Stops training if a loss or metric has stopped improving -EarlyStopping <- R6::R6Class( - classname = 
"EarlyStopping", - lock_objects = FALSE, - public = list( - #' @description - #' Creates a new earlystopping object - #' @param patience Stop after this number of epochs if loss doesn't improve - #' @param delta How much does the loss need to improve to count as improvement - #' @param verbose If information should be printed out - #' @return a new earlystopping object - initialize = function(patience = 3, delta = 0, verbose = TRUE) { - self$patience <- patience - self$counter <- 0 - self$verbose <- verbose - self$bestScore <- NULL - self$earlyStop <- FALSE - self$improved <- FALSE - self$delta <- delta - self$previousScore <- 0 - }, - #' @description - #' call the earlystopping object and increment a counter if loss is not - #' improving - #' @param metric the current metric value - call = function(metric) { - score <- metric - if (is.null(self$bestScore)) { - self$bestScore <- score - self$improved <- TRUE - } else if (score < self$bestScore + self$delta) { - self$counter <- self$counter + 1 - self$improved <- FALSE - if (self$verbose) { - ParallelLogger::logInfo( - "EarlyStopping counter: ", self$counter, - " out of ", self$patience - ) - } - if (self$counter >= self$patience) { - self$earlyStop <- TRUE - } - } else { - self$bestScore <- score - self$counter <- 0 - self$improved <- TRUE - } - self$previousScore <- score - } - ) -) +} \ No newline at end of file diff --git a/man/EarlyStopping.Rd b/man/EarlyStopping.Rd index 1087705..0baf50a 100644 --- a/man/EarlyStopping.Rd +++ b/man/EarlyStopping.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/Estimator.R +% Please edit documentation in R/Estimator-class.R \name{EarlyStopping} \alias{EarlyStopping} \title{Earlystopping class} diff --git a/man/Estimator.Rd b/man/Estimator.Rd index a6fc4bc..e825cf9 100644 --- a/man/Estimator.Rd +++ b/man/Estimator.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/Estimator.R +% Please edit documentation in R/Estimator-class.R \name{Estimator} \alias{Estimator} \title{Estimator} From 5356946305d01b3563ea42232e382d064b1a2358 Mon Sep 17 00:00:00 2001 From: Egill Fridgeirsson Date: Sun, 9 Oct 2022 18:52:13 +0200 Subject: [PATCH 10/13] add a defaultResNet with tests --- NAMESPACE | 1 + R/ResNet.R | 40 +++++++++++++++++++++++++++++++++++- man/setDefaultResNet.Rd | 24 ++++++++++++++++++++++ man/setResNet.Rd | 2 +- tests/testthat/test-ResNet.R | 13 ++++++++++++ 5 files changed, 78 insertions(+), 2 deletions(-) create mode 100644 man/setDefaultResNet.Rd diff --git a/NAMESPACE b/NAMESPACE index a102062..dd5fbf9 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -5,6 +5,7 @@ export(Estimator) export(fitEstimator) export(gridCvDeep) export(predictDeepEstimator) +export(setDefaultResNet) export(setMultiLayerPerceptron) export(setResNet) export(setTransformer) diff --git a/R/ResNet.R b/R/ResNet.R index ac9200f..d0364a4 100644 --- a/R/ResNet.R +++ b/R/ResNet.R @@ -16,6 +16,44 @@ # See the License for the specific language governing permissions and # limitations under the License. +#' setDefaultResNet +#' +#' @description +#' Creates settings for a default ResNet model +#' +#' @details +#' Model architecture from by https://arxiv.org/abs/2106.11959 . +#' Hyperparameters chosen by a experience on a few prediction problems. 
+#' +#' @param device Which device to run analysis on, either 'cpu' or 'cuda', default: 'cpu' +#' @param batchSize Size of batch, default: 1024 +#' @param epochs Number of epochs to run, default: 10 +#' @param seed Random seed to use + +#' @export +setDefaultResNet <- function(device='cpu', + batchSize=1024, + epochs=10, + seed=NULL) { + + resnetSettings <- setResNet(numLayers = 6, + sizeHidden = 512, + hiddenFactor = 2, + residualDropout = 0.1, + hiddenDropout = 0.4, + sizeEmbedding = 256, + weightDecay = 1e-6, + learningRate = 0.01, + hyperParamSearch = 'random', + randomSample = 1, + device = device, + batchSize = batchSize, + seed = seed) + attr(resnetSettings, 'settings')$name <- 'defaultResnet' + return(resnetSettings) +} + + #' setResNet #' #' @description @@ -42,7 +80,7 @@ #' #' @export setResNet <- function(numLayers = c(1:8), - sizeHidden = c(2^(6:9)), + sizeHidden = c(2^(6:10)), hiddenFactor = c(1:4), residualDropout = c(seq(0, 0.5, 0.05)), hiddenDropout = c(seq(0, 0.5, 0.05)), diff --git a/man/setDefaultResNet.Rd b/man/setDefaultResNet.Rd new file mode 100644 index 0000000..28c29df --- /dev/null +++ b/man/setDefaultResNet.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ResNet.R +\name{setDefaultResNet} +\alias{setDefaultResNet} +\title{setDefaultResNet} +\usage{ +setDefaultResNet(device = "cpu", batchSize = 1024, epochs = 10, seed = NULL) +} +\arguments{ +\item{device}{Which device to run analysis on, either 'cpu' or 'cuda', default: 'cpu'} + +\item{batchSize}{Size of batch, default: 1024} + +\item{epochs}{Number of epochs to run, default: 10} + +\item{seed}{Random seed to use} +} +\description{ +Creates settings for a default ResNet model +} +\details{ +Model architecture from by https://arxiv.org/abs/2106.11959 . +Hyperparameters chosen by a experience on a few prediction problems. 
+} diff --git a/man/setResNet.Rd b/man/setResNet.Rd index d0ced0d..5f66d45 100644 --- a/man/setResNet.Rd +++ b/man/setResNet.Rd @@ -6,7 +6,7 @@ \usage{ setResNet( numLayers = c(1:8), - sizeHidden = c(2^(6:9)), + sizeHidden = c(2^(6:10)), hiddenFactor = c(1:4), residualDropout = c(seq(0, 0.5, 0.05)), hiddenDropout = c(seq(0, 0.5, 0.05)), diff --git a/tests/testthat/test-ResNet.R b/tests/testthat/test-ResNet.R index 6fb0f91..626d39d 100644 --- a/tests/testthat/test-ResNet.R +++ b/tests/testthat/test-ResNet.R @@ -111,3 +111,16 @@ test_that("ResNet nn-module works ", { # model works without numeric variables expect_equal(output$shape, 10) }) + +test_that("Default Resnet works", { + defaultResNet <- setDefaultResNet() + params <- defaultResNet$param[[1]] + + expect_equal(params$numLayers, 6) + expect_equal(params$sizeHidden, 512) + expect_equal(params$hiddenFactor, 2) + expect_equal(params$residualDropout, 0.1) + expect_equal(params$hiddenDropout, 0.4) + expect_equal(params$sizeEmbedding, 256) + +}) From 599737e5ee7e0adb7875f0f6f52b340296d962a6 Mon Sep 17 00:00:00 2001 From: Egill Fridgeirsson Date: Sun, 9 Oct 2022 19:09:38 +0200 Subject: [PATCH 11/13] Add default transformer with tests --- NAMESPACE | 1 + R/Transformer.R | 53 +++++++++++++++++++++++++++++++ man/setDefaultTransformer.Rd | 29 +++++++++++++++++ tests/testthat/test-Transformer.R | 15 +++++++++ 4 files changed, 98 insertions(+) create mode 100644 man/setDefaultTransformer.Rd diff --git a/NAMESPACE b/NAMESPACE index dd5fbf9..a65b67a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -6,6 +6,7 @@ export(fitEstimator) export(gridCvDeep) export(predictDeepEstimator) export(setDefaultResNet) +export(setDefaultTransformer) export(setMultiLayerPerceptron) export(setResNet) export(setTransformer) diff --git a/R/Transformer.R b/R/Transformer.R index 15ce545..ea353ca 100644 --- a/R/Transformer.R +++ b/R/Transformer.R @@ -1,3 +1,56 @@ +# @file Transformer.R +# +# Copyright 2022 Observational Health Data Sciences and Informatics +# +# This file is part of DeepPatientLevelPrediction +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +#' Create default settings for a non-temporal transformer +#' +#' @description A transformer model with default hyperparameters +#' @details from https://arxiv.org/abs/2106.11959 +#' Default hyperparameters from paper +#' @param device Which device to run analysis on, either 'cpu' or 'cuda', default: 'cpu' +#' @param batchSize Size of batch, default: 512 +#' @param epochs Number of epochs to run, default: 10 +#' @param seed random seed to use +#' +#' @export +setDefaultTransformer <- function(device='cpu', + batchSize=512, + epochs=10, + seed=NULL) { + transformerSettings <- setTransformer(numBlocks = 3, + dimToken = 192, + dimOut = 1, + numHeads = 8, + attDropout = 0.2, + ffnDropout = 0.1, + resDropout = 0.0, + dimHidden = 256, + weightDecay = 1e-5, + learningRate = 1e-4, + batchSize = 512, + epochs = 30, + device = device, + hyperParamSearch = 'random', + randomSample = 1, + seed = seed) + attr(transformerSettings, 'settings')$name <- 'defaultTransformer' + return(transformerSettings) +} + #' create settings for training a non-temporal transformer #' #' @description A transformer model diff --git a/man/setDefaultTransformer.Rd b/man/setDefaultTransformer.Rd new file mode 100644 index 0000000..d3b54fb --- /dev/null +++ b/man/setDefaultTransformer.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Transformer.R +\name{setDefaultTransformer} +\alias{setDefaultTransformer} +\title{Create default settings for a non-temporal transformer} +\usage{ +setDefaultTransformer( + device = "cpu", + batchSize = 512, + epochs = 10, + seed = NULL +) +} +\arguments{ +\item{device}{Which device to run analysis on, either 'cpu' or 'cuda', default: 'cpu'} + +\item{batchSize}{Size of batch, default: 512} + +\item{epochs}{Number of epochs to run, default: 10} + +\item{seed}{random seed to use} +} +\description{ +A transformer model with default hyperparameters +} +\details{ +from https://arxiv.org/abs/2106.11959 +Default hyperparameters from paper +} diff --git a/tests/testthat/test-Transformer.R b/tests/testthat/test-Transformer.R index 60547de..b8e1e22 100644 --- a/tests/testthat/test-Transformer.R +++ b/tests/testthat/test-Transformer.R @@ -59,3 +59,18 @@ test_that("transformer nn-module works", { output <- model(input) expect_equal(output$shape, 10) }) + +test_that("Default Transformer works", { + defaultTransformer <- setDefaultTransformer() + params <- defaultTransformer$param[[1]] + + expect_equal(params$numBlocks, 3) + expect_equal(params$dimToken, 192) + expect_equal(params$numHeads, 8) + expect_equal(params$resDropout, 0.0) + expect_equal(params$attDropout, 0.2) + + settings <- attr(defaultTransformer, 'settings') + + expect_equal(settings$name, 'defaultTransformer') +}) From ee09f83dd312d456ad19de36895b0e486828b10d Mon Sep 17 00:00:00 2001 From: egillax Date: Mon, 10 Oct 2022 11:32:43 +0200 Subject: [PATCH 12/13] add tests for if random sample is higher than possible combinations --- tests/testthat/test-MLP.R | 17 +++++++++++++++++ tests/testthat/test-ResNet.R | 17 +++++++++++++++++ tests/testthat/test-Transformer.R | 6 ++++++ 3 files changed, 40 insertions(+) diff --git a/tests/testthat/test-MLP.R b/tests/testthat/test-MLP.R index 3bc0d8f..97d7103 100644 --- a/tests/testthat/test-MLP.R +++ b/tests/testthat/test-MLP.R @@ -108,3 +108,20 @@ test_that("MLP nn-module works ", { # model works without numeric variables expect_equal(output$shape, 10) }) + + +test_that("Errors are produced by settings function", { + randomSample <- 2 + + 
expect_error(setMultiLayerPerceptron( + numLayers = 1, + sizeHidden = 128, + dropout = 0.0, + sizeEmbedding = 128, + weightDecay = 1e-6, + learningRate = 0.01, + seed = 42, + hyperParamSearch = 'random', + randomSample = randomSample)) + +}) \ No newline at end of file diff --git a/tests/testthat/test-ResNet.R b/tests/testthat/test-ResNet.R index 626d39d..8f5a12f 100644 --- a/tests/testthat/test-ResNet.R +++ b/tests/testthat/test-ResNet.R @@ -124,3 +124,20 @@ test_that("Default Resnet works", { expect_equal(params$sizeEmbedding, 256) }) + +test_that("Errors are produced by settings function", { + randomSample <- 2 + + expect_error(setResNet( + numLayers = 1, + sizeHidden = 128, + hiddenFactor = 1, + residualDropout = 0.0, + hiddenDropout = 0.0, + sizeEmbedding = 128, + weightDecay = 1e-6, + learningRate = 0.01, + seed = 42, + hyperParamSearch = 'random', + randomSample = randomSample)) +}) diff --git a/tests/testthat/test-Transformer.R b/tests/testthat/test-Transformer.R index b8e1e22..228e881 100644 --- a/tests/testthat/test-Transformer.R +++ b/tests/testthat/test-Transformer.R @@ -74,3 +74,9 @@ test_that("Default Transformer works", { expect_equal(settings$name, 'defaultTransformer') }) + +test_that("Errors are produced by settings function", { + randomSample <- 2 + + expect_error(setTransformer(randomSample = randomSample)) +}) From 99ece529fe0c2e5204766e5b3cbfa2dea5a04494 Mon Sep 17 00:00:00 2001 From: egillax Date: Mon, 10 Oct 2022 11:50:17 +0200 Subject: [PATCH 13/13] update NEWS --- NEWS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/NEWS.md b/NEWS.md index c0bdb3a..08ed504 100644 --- a/NEWS.md +++ b/NEWS.md @@ -3,4 +3,5 @@ DeepPatientLevelPrediction 1.0.0 - created an Estimator R6 class to handle the model fitting - Added three non-temporal models. An MLP, a ResNet and a Transformer +- ResNet and Transformer have default versions of hyperparameters - Created tests and documentation for the package
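As a usage sketch (assuming the package is installed as DeepPatientLevelPrediction): the default-settings functions added in this series are intended to be passed to PatientLevelPrediction::runPlp() as the modelSettings argument, in the same way setTransformer() is used in vignettes/BuildingDeepModels.Rmd; the calls below only use parameters documented in the patches above.

library(DeepPatientLevelPrediction)

# ResNet with the fixed default hyperparameters (numLayers = 6, sizeHidden = 512, ...)
resnetSettings <- setDefaultResNet(device = "cpu")   # or "cuda" for GPU

# Transformer with the defaults taken from https://arxiv.org/abs/2106.11959
transformerSettings <- setDefaultTransformer(device = "cpu")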