diff --git a/.Rbuildignore b/.Rbuildignore
index 307ed11..9ba1d8b 100644
--- a/.Rbuildignore
+++ b/.Rbuildignore
@@ -8,4 +8,8 @@
 ^extras$
 ^deploy.sh$
 ^compare_versions$
-^.mypy_cache$
\ No newline at end of file
+^.mypy_cache$
+^inst/python/__pycache__
+^.*\.pt$
+^doc$
+^Meta$
diff --git a/.github/workflows/R_CDM_check_hades.yaml b/.github/workflows/R_CDM_check_hades.yaml
index a8e6186..fab1ec5 100644
--- a/.github/workflows/R_CDM_check_hades.yaml
+++ b/.github/workflows/R_CDM_check_hades.yaml
@@ -67,10 +67,11 @@ jobs:
           while read -r cmd
           do
             eval sudo $cmd
-          done < <(Rscript -e 'writeLines(remotes::system_requirements("ubuntu", "20.04"))')
+          done < <(Rscript -e 'writeLines(remotes::system_requirements("ubuntu", "22.04"))')

       - uses: r-lib/actions/setup-r-dependencies@v2
         with:
+          cache: always
           extra-packages: any::rcmdcheck
           needs: check

@@ -78,11 +79,10 @@
         shell: Rscript {0}
         run: |
           python_packages <-
-            c("polars", "tqdm", "connectorx", "pyarrow", "scikit-learn")
+            c("polars", "tqdm", "connectorx", "pyarrow", "pynvml", "numpy==1.26.4")

           library(reticulate)
-          virtualenv_create("r-reticulate", Sys.which("python"))
-          virtualenv_install("r-reticulate", python_packages)
+          virtualenv_create("r-reticulate", Sys.which("python"), packages=python_packages)
           virtualenv_install("r-reticulate", "torch", pip_options = c("--index-url https://download.pytorch.org/whl/cpu"))

           path_to_python <- virtualenv_python("r-reticulate")
@@ -95,24 +95,24 @@
           error-on: '"warning"'
           check-dir: '"check"'

-      - name: Upload source package
-        if: success() && runner.os == 'macOS' && github.event_name != 'pull_request' && github.ref == 'refs/heads/main'
-        uses: actions/upload-artifact@v2
-        with:
-          name: package_tarball
-          path: check/*.tar.gz
-
       - name: Install covr
-        if: runner.os == 'Windows'
+        if: runner.os == 'ubuntu-22.04'
         run: |
           remotes::install_cran("covr")
         shell: Rscript {0}

       - name: Test coverage
-        if: runner.os == 'Windows'
-        run: covr::codecov()
+        if: runner.os == 'ubuntu-22.04'
+        run: covr::codecov(token = "${{ secrets.CODECOV_TOKEN }}")
         shell: Rscript {0}

+      - name: Upload source package
+        if: success() && runner.os == 'macOS' && github.event_name != 'pull_request' && github.ref == 'refs/heads/main'
+        uses: actions/upload-artifact@v2
+        with:
+          name: package_tarball
+          path: check/*.tar.gz
+
   Release:
     needs: R-CMD-Check

diff --git a/.github/workflows/release_docker.yaml b/.github/workflows/release_docker.yaml
new file mode 100644
index 0000000..c0fe978
--- /dev/null
+++ b/.github/workflows/release_docker.yaml
@@ -0,0 +1,84 @@
+# When a new release is published,
+# upload image to Dockerhub.
+#
+# Requires the following repository secrets:
+# - DOCKER_IMAGE - Configured as a secret so it can be configured per fork.
+# - DOCKER_HUB_USERNAME
+# - DOCKER_HUB_ACCESS_TOKEN
+# - GITHUBPAT - The github account to use for downloading CRAN dependencies.
+#   Needed to avoid "API rate limit exceeded" from github.
+name: Release Docker
+
+on:
+  push:
+    branches:
+      - 'main'
+    tags:
+      - 'v*'
+  workflow_dispatch:
+
+jobs:
+  docker:
+    runs-on: ubuntu-latest
+    env:
+      DOCKER_IMAGE: 'ohdsi/deep_plp'
+    steps:
+      - uses: actions/checkout@v4
+
+      # ------------------------------------
+      # The pattern for the following steps is specified
+      # in OHDSI/WebAPI.
+
+      # Add Docker labels and tags
+      - name: Docker meta
+        id: docker_meta
+        uses: docker/metadata-action@v5
+        with:
+          images: ${{ env.DOCKER_IMAGE }}
+          tags: |
+            type=semver,pattern={{version}}
+      # Setup docker build environment
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@v3
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Login to DockerHub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKER_HUB_USERNAME }}
+          password: ${{ secrets.DOCKER_HUB_ACCESS_TOKEN }}
+
+      - name: Set build parameters
+        id: build_params
+        run: |
+          echo "SHA8=${GITHUB_SHA::8}" >> $GITHUB_ENV
+      - name: Build and push
+        id: docker_build
+        uses: docker/build-push-action@v6
+        with:
+          context: ./
+          cache-from: type=gha
+          cache-to: type=gha, mode=max
+          file: Dockerfile
+          platforms: linux/amd64
+          push: true
+          secrets: |
+            build_github_pat=${{ secrets.GH_TOKEN }}
+          build-args: |
+            GIT_BRANCH=${{ steps.docker_meta.outputs.version }}
+            GIT_COMMIT_ID_ABBREV=${{ env.SHA8 }}
+          tags: ${{ steps.docker_meta.outputs.tags }}
+          # Use runtime labels from docker_meta as well as fixed labels
+          labels: |
+            ${{ steps.docker_meta.outputs.labels }}
+            maintainer=Egill A. Fridgeirsson
+            org.opencontainers.image.authors=Egill A. Fridgeirsson , Henrik John
+            org.opencontainers.image.vendor=OHDSI
+            org.opencontainers.image.licenses=Apache-2.0
+
+      - name: Inspect image
+        run: |
+          docker pull ${{ env.DOCKER_IMAGE }}:${{ steps.docker_meta.outputs.version }}
+          docker image inspect ${{ env.DOCKER_IMAGE }}:${{ steps.docker_meta.outputs.version }}
diff --git a/.gitignore b/.gitignore
index feec6ee..aec36f9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,3 +11,6 @@ extras/
 .Renviron
 inst/python/__pycache__
 .mypy_cache
+/doc/
+/Meta/
+*.pt
\ No newline at end of file
diff --git a/DESCRIPTION b/DESCRIPTION
index f1e2d9f..06cf7da 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,8 +1,8 @@
 Package: DeepPatientLevelPrediction
 Type: Package
 Title: Deep Learning For Patient Level Prediction Using Data In The OMOP Common Data Model
-Version: 2.0.3
-Date: 22-12-2023
+Version: 2.1.0
+Date: 08-07-2024
 Authors@R: c(
   person("Egill", "Fridgeirsson", email = "e.fridgeirsson@erasmusmc.nl", role = c("aut", "cre")),
   person("Jenna", "Reps", email = "jreps@its.jnj.com", role = c("aut")),
@@ -20,31 +20,28 @@ Depends:
   R (>= 4.0.0)
 Imports:
   dplyr,
-  FeatureExtraction (>= 3.0.0),
   ParallelLogger (>= 2.0.0),
   PatientLevelPrediction (>= 6.3.2),
   rlang,
   withr,
   reticulate (>= 1.31)
 Suggests:
-  devtools,
   Eunomia,
   knitr,
-  markdown,
-  plyr,
+  rmarkdown,
   testthat,
   PRROC,
+  FeatureExtraction (>= 3.0.0),
   ResultModelManager (>= 0.2.0),
   DatabaseConnector (>= 6.0.0),
   Andromeda
 Remotes:
   ohdsi/PatientLevelPrediction,
-  ohdsi/FeatureExtraction,
-  ohdsi/Eunomia,
   ohdsi/ResultModelManager
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.1
 Encoding: UTF-8
 Config/testthat/edition: 3
+Config/testthat/parallel: TRUE
 Config/reticulate:
   list(
     packages = list(
@@ -52,6 +49,7 @@ Config/reticulate:
       list(package = "polars"),
       list(package = "tqdm"),
       list(package = "connectorx"),
-      list(package = "pyarrow")
+      list(package = "pyarrow"),
+      list(package = "pynvml")
     )
   )
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..26e7082
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,64 @@
+FROM docker.io/rocker/r-ver:4.4.1 AS build
+
+RUN --mount=type=secret,id=build_github_pat export GITHUB_PAT=$(cat /run/secrets/build_github_pat)
+
+ARG GIT_BRANCH='main'
+ARG GIT_COMMIT_ID_ABBREV
+
+RUN apt-get -y update && apt-get install -y \
+    default-jre \
+    default-jdk \
+    libssl-dev \
+    python3-pip \
+    python3-dev \
+    --no-install-recommends \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+RUN R CMD javareconf
+
+RUN install2.r -n -1 \
+    remotes \
+    CirceR \
+    Eunomia \
+    duckdb \
+    && installGithub.r \
+    OHDSI/CohortGenerator \
+    OHDSI/ROhdsiWebApi \
+    OHDSI/ResultModelManager
+
+RUN Rscript -e "DatabaseConnector::downloadJdbcDrivers(dbms='all', pathToDriver='/database_drivers/')"
+ENV DATABASECONNECTOR_JAR_FOLDER=/database_drivers/
+
+# install Python packages
+RUN pip3 install uv \
+    && uv pip install --system --no-cache-dir \
+    connectorx \
+    polars \
+    pyarrow \
+    torch \
+    tqdm \
+    pynvml \
+    && rm -rf /root/.cache/pip
+
+RUN Rscript -e "ref <- Sys.getenv('GIT_COMMIT_ID_ABBREV', unset = Sys.getenv('GIT_BRANCH')); remotes::install_github('ohdsi/DeepPatientLevelPrediction', ref=ref)"
+
+
+FROM docker.io/rocker/rstudio:4.4.1
+#
+COPY --from=build /usr/local/lib/python3.10/dist-packages /usr/local/lib/python3.10/dist-packages
+COPY --from=build /database_drivers /database_drivers
+COPY --from=build /usr/local/lib/R/site-library /usr/local/lib/R/site-library
+COPY --from=build /usr/local/lib/R/library /usr/local/lib/R/library
+
+ENV RETICULATE_PYTHON=/usr/bin/python3
+# runtime dependencies
+RUN apt-get -y update && apt-get install -y \
+    default-jre \
+    default-jdk \
+    libssl3 \
+    python3-dev \
+    --no-install-recommends \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/* \
+    && R CMD javareconf
+
diff --git a/NAMESPACE b/NAMESPACE
index 5f8c13e..3b2cef0 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -6,9 +6,11 @@ export(predictDeepEstimator)
 export(setDefaultResNet)
 export(setDefaultTransformer)
 export(setEstimator)
+export(setFinetuner)
 export(setMultiLayerPerceptron)
 export(setResNet)
 export(setTransformer)
+export(torch)
 export(trainingCache)
 importFrom(dplyr,"%>%")
 importFrom(reticulate,py_to_r)
diff --git a/NEWS.md b/NEWS.md
index 41b7e71..69d551c 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,15 @@
+DeepPatientLevelPrediction 2.1.0
+======================
+ - Added basic transfer learning functionality. See vignette("TransferLearning")
+ - Added a GPU memory cleaner to clean cached memory after an out-of-memory error
+ - The Python module torch is now accessed through an exported function instead of loading the module at package load
+ - Added gradient accumulation. Studies running at different sites using different hardware can now use the same effective batch size by accumulating gradients.
+ - Refactored out the cross validation from the hyperparameter tuning + - Remove predictions from non-optimal hyperparameter combinations to save space + - Only use html vignettes + - Rename MLP to MultiLayerPerceptron + + DeepPatientLevelPrediction 2.0.3 ====================== - Hotfix: Fix count for polars v0.20.x diff --git a/R/Dataset.R b/R/Dataset.R index 4ba4e81..3bba33e 100644 --- a/R/Dataset.R +++ b/R/Dataset.R @@ -22,14 +22,21 @@ createDataset <- function(data, labels, plpModel = NULL) { # sqlite object attributes(data)$path <- attributes(data)$dbname } - if (is.null(plpModel)) { + if (is.null(plpModel) && is.null(data$numericalIndex)) { data <- dataset(r_to_py(normalizePath(attributes(data)$path)), r_to_py(labels$outcomeCount)) + } else if (!is.null(data$numericalIndex)) { + numericalIndex <- + r_to_py(as.array(data$numericalIndex %>% dplyr::pull())) + data <- dataset(r_to_py(normalizePath(attributes(data)$path)), + r_to_py(labels$outcomeCount), + numericalIndex) } else { numericalFeatures <- r_to_py(as.array(which(plpModel$covariateImportance$isNumeric))) data <- dataset(r_to_py(normalizePath(attributes(data)$path)), - numerical_features = numericalFeatures) + numerical_features = numericalFeatures + ) } return(data) diff --git a/R/DeepPatientLevelPrediction.R b/R/DeepPatientLevelPrediction.R index 137b53a..f1c3c7a 100644 --- a/R/DeepPatientLevelPrediction.R +++ b/R/DeepPatientLevelPrediction.R @@ -21,12 +21,25 @@ #' @description A package containing deep learning extensions for developing #' prediction models using data in the OMOP CDM #' -#' @docType package #' @name DeepPatientLevelPrediction #' @importFrom dplyr %>% #' @importFrom reticulate r_to_py py_to_r #' @importFrom rlang .data -NULL +"_PACKAGE" + +# package level global state +.globals <- new.env(parent = emptyenv()) + +#' Pytorch module +#' +#' The `torch` module object is the equivalent of +#' `reticulate::import("torch")` and provided mainly as a convenience. +#' +#' @returns the torch Python module +#' @export +#' @usage NULL +#' @format An object of class `python.builtin.module` +torch <- NULL .onLoad <- function(libname, pkgname) { # use superassignment to update global reference diff --git a/R/Estimator.R b/R/Estimator.R index c3c6725..ece685c 100644 --- a/R/Estimator.R +++ b/R/Estimator.R @@ -27,8 +27,7 @@ #' @param batchSize batchSize to use #' @param epochs how many epochs to train for #' @param device what device to train on, can be a string or a function to -#' that evaluates -#' to the device during runtime +#' that evaluates to the device during runtime #' @param optimizer which optimizer to use #' @param scheduler which learning rate scheduler to use #' @param criterion loss function to use @@ -40,26 +39,32 @@ #' `name`, #' `fun` needs to be a function that takes in prediction and labels and #' outputs a score. 
+#' @param accumulationSteps how many steps to accumulate gradients before +#' updating weights, can also be a function that is evaluated during runtime #' @param seed seed to initialize weights of model with #' @export -setEstimator <- function(learningRate = "auto", - weightDecay = 0.0, - batchSize = 512, - epochs = 30, - device = "cpu", - optimizer = torch$optim$AdamW, - scheduler = list(fun = torch$optim$lr_scheduler$ReduceLROnPlateau, - params = list(patience = 1)), - criterion = torch$nn$BCEWithLogitsLoss, - earlyStopping = list(useEarlyStopping = TRUE, - params = list(patience = 4)), - metric = "auc", - seed = NULL -) { - +setEstimator <- function( + learningRate = "auto", + weightDecay = 0.0, + batchSize = 512, + epochs = 30, + device = "cpu", + optimizer = torch$optim$AdamW, + scheduler = list( + fun = torch$optim$lr_scheduler$ReduceLROnPlateau, + params = list(patience = 1) + ), + criterion = torch$nn$BCEWithLogitsLoss, + earlyStopping = list( + useEarlyStopping = TRUE, + params = list(patience = 4) + ), + metric = "auc", + accumulationSteps = NULL, + seed = NULL) { checkIsClass(learningRate, c("numeric", "character")) if (inherits(learningRate, "character") && learningRate != "auto") { - stop(paste0('Learning rate should be either a numeric or "auto", + stop(paste0('Learning rate should be either a numeric or "auto", you provided: ', learningRate)) } checkIsClass(weightDecay, "numeric") @@ -71,7 +76,14 @@ setEstimator <- function(learningRate = "auto", checkIsClass(earlyStopping, c("list", "NULL")) checkIsClass(metric, c("character", "list")) checkIsClass(seed, c("numeric", "integer", "NULL")) - + + if (!is.null(accumulationSteps) && !is.function(accumulationSteps)) { + checkHigher(accumulationSteps, 0) + checkIsClass(accumulationSteps, c("numeric", "integer")) + if (batchSize %% accumulationSteps != 0) { + stop("Batch size should be divisible by accumulation steps") + } + } if (length(learningRate) == 1 && learningRate == "auto") { findLR <- TRUE @@ -81,34 +93,52 @@ setEstimator <- function(learningRate = "auto", if (is.null(seed)) { seed <- as.integer(sample(1e5, 1)) } - estimatorSettings <- list(learningRate = learningRate, - weightDecay = weightDecay, - batchSize = batchSize, - epochs = epochs, - device = device, - earlyStopping = earlyStopping, - findLR = findLR, - metric = metric, - seed = seed[1]) + estimatorSettings <- list( + learningRate = learningRate, + weightDecay = weightDecay, + batchSize = batchSize, + epochs = epochs, + device = device, + earlyStopping = earlyStopping, + findLR = findLR, + metric = metric, + accumulationSteps = accumulationSteps, + seed = seed[1] + ) optimizer <- rlang::enquo(optimizer) estimatorSettings$optimizer <- function() rlang::eval_tidy(optimizer) - class(estimatorSettings$optimizer) <- c("delayed", - class(estimatorSettings$optimizer)) + class(estimatorSettings$optimizer) <- c( + "delayed", + class(estimatorSettings$optimizer) + ) criterion <- rlang::enquo(criterion) estimatorSettings$criterion <- function() rlang::eval_tidy(criterion) - class(estimatorSettings$criterion) <- c("delayed", - class(estimatorSettings$criterion)) + class(estimatorSettings$criterion) <- c( + "delayed", + class(estimatorSettings$criterion) + ) scheduler <- rlang::enquo(scheduler) estimatorSettings$scheduler <- function() rlang::eval_tidy(scheduler) - class(estimatorSettings$scheduler) <- c("delayed", - class(estimatorSettings$scheduler)) + class(estimatorSettings$scheduler) <- c( + "delayed", + class(estimatorSettings$scheduler) + ) if (is.function(device)) { 
- class(estimatorSettings$device) <- c("delayed", - class(estimatorSettings$device)) + class(estimatorSettings$device) <- c( + "delayed", + class(estimatorSettings$device) + ) + } + + if (is.function(accumulationSteps)) { + class(estimatorSettings$accumulationSteps) <- c( + "delayed", + class(estimatorSettings$accumulationSteps) + ) } estimatorSettings$paramsToTune <- extractParamsToTune(estimatorSettings) @@ -133,19 +163,29 @@ fitEstimator <- function(trainData, analysisPath, ...) { start <- Sys.time() - - # check covariate data - if (!FeatureExtraction::isCovariateData(trainData$covariateData)) { - stop("Needs correct covariateData") - } - if (!is.null(trainData$folds)) { trainData$labels <- merge(trainData$labels, trainData$fold, by = "rowId") } - mappedCovariateData <- PatientLevelPrediction::MapIds( - covariateData = trainData$covariateData, - cohort = trainData$labels - ) + + if (modelSettings$modelType == "Finetuner") { + # make sure to use same mapping from covariateIds to columns if finetuning + path <- modelSettings$param[[1]]$modelPath + oldCovImportance <- utils::read.csv(file.path(path, + "covariateImportance.csv")) + mapping <- oldCovImportance %>% dplyr::select("columnId", "covariateId") + numericalIndex <- which(oldCovImportance %>% dplyr::pull("isNumeric")) + mappedCovariateData <- PatientLevelPrediction::MapIds( + covariateData = trainData$covariateData, + cohort = trainData$labels, + mapping = mapping + ) + mappedCovariateData$numericalIndex <- as.data.frame(numericalIndex) + } else { + mappedCovariateData <- PatientLevelPrediction::MapIds( + covariateData = trainData$covariateData, + cohort = trainData$labels + ) + } covariateRef <- mappedCovariateData$covariateRef @@ -161,13 +201,16 @@ fitEstimator <- function(trainData, ) ) - hyperSummary <- do.call(rbind, lapply(cvResult$paramGridSearch, - function(x) x$hyperSummary)) + hyperSummary <- do.call(rbind, lapply( + cvResult$paramGridSearch, + function(x) x$hyperSummary + )) prediction <- cvResult$prediction incs <- rep(1, covariateRef %>% - dplyr::tally() %>% - dplyr::collect() %>% - as.integer()) + dplyr::tally() %>% + dplyr::collect() %>% + as.integer() + ) covariateRef <- covariateRef %>% dplyr::arrange("columnId") %>% dplyr::collect() %>% @@ -180,26 +223,35 @@ fitEstimator <- function(trainData, comp <- start - Sys.time() result <- list( model = cvResult$estimator, - preprocessing = list( - featureEngineering = attr(trainData$covariateData, - "metaData")$featureEngineering, - tidyCovariates = attr(trainData$covariateData, - "metaData")$tidyCovariateDataSettings, + featureEngineering = attr( + trainData$covariateData, + "metaData" + )$featureEngineering, + tidyCovariates = attr( + trainData$covariateData, + "metaData" + )$tidyCovariateDataSettings, requireDenseMatrix = FALSE ), prediction = prediction, modelDesign = PatientLevelPrediction::createModelDesign( targetId = attr(trainData, "metaData")$targetId, outcomeId = attr(trainData, "metaData")$outcomeId, - restrictPlpDataSettings = attr(trainData, - "metaData")$restrictPlpDataSettings, + restrictPlpDataSettings = attr( + trainData, + "metaData" + )$restrictPlpDataSettings, covariateSettings = attr(trainData, "metaData")$covariateSettings, populationSettings = attr(trainData, "metaData")$populationSettings, - featureEngineeringSettings = attr(trainData$covariateData, - "metaData")$featureEngineeringSettings, - preprocessSettings = attr(trainData$covariateData, - "metaData")$preprocessSettings, + featureEngineeringSettings = attr( + trainData$covariateData, + 
"metaData" + )$featureEngineeringSettings, + preprocessSettings = attr( + trainData$covariateData, + "metaData" + )$preprocessSettings, modelSettings = modelSettings, splitSettings = attr(trainData, "metaData")$splitSettings, sampleSettings = attr(trainData, "metaData")$sampleSettings @@ -219,7 +271,8 @@ fitEstimator <- function(trainData, ) class(result) <- "plpModel" - attr(result, "predictionFunction") <- "predictDeepEstimator" + attr(result, "predictionFunction") <- + "DeepPatientLevelPrediction::predictDeepEstimator" attr(result, "modelType") <- "binary" attr(result, "saveType") <- modelSettings$saveType @@ -255,13 +308,20 @@ predictDeepEstimator <- function(plpModel, # get predictions prediction <- cohort if (is.character(plpModel$model)) { - modelSettings <- plpModel$modelDesign$modelSettings model <- torch$load(file.path(plpModel$model, - "DeepEstimatorModel.pt"), + "DeepEstimatorModel.pt"), map_location = "cpu") - estimator <- createEstimator(modelType = modelSettings$modelType, - modelParameters = model$model_parameters, - estimatorSettings = model$estimator_settings) + if (is.null(model$model_parameters$model_type)) { + # for backwards compatibility + model$model_parameters$model_type <- plpModel$modelDesign$modelSettings$modelType + } + model$estimator_settings$device <- + plpModel$modelDesign$modelSettings$estimatorSettings$device + estimator <- + createEstimator(modelParameters = + snakeCaseToCamelCaseNames(model$model_parameters), + estimatorSettings = + snakeCaseToCamelCaseNames(model$estimator_settings)) estimator$model$load_state_dict(model$model_state_dict) prediction$value <- estimator$predict_proba(data) } else { @@ -292,152 +352,81 @@ gridCvDeep <- function(mappedData, modelLocation, analysisPath) { ParallelLogger::logInfo(paste0("Running hyperparameter search for ", - modelSettings$modelType, " model")) - + modelSettings$modelType, + " model")) ########################################################################### paramSearch <- modelSettings$param - # TODO below chunk should be in a setupCache function - trainCache <- trainingCache$new(analysisPath) - if (trainCache$isParamGridIdentical(paramSearch)) { - gridSearchPredictons <- trainCache$getGridSearchPredictions() - } else { - gridSearchPredictons <- list() - length(gridSearchPredictons) <- length(paramSearch) - trainCache$saveGridSearchPredictions(gridSearchPredictons) - trainCache$saveModelParams(paramSearch) - } + # setup cache for hyperparameterResults + trainCache <- setupCache(analysisPath, paramSearch) + hyperparameterResults <- trainCache$getGridSearchPredictions() dataset <- createDataset(data = mappedData, labels = labels) - fitParams <- names(paramSearch[[1]])[grepl("^estimator", - names(paramSearch[[1]]))] - findLR <- modelSettings$estimatorSettings$findLR if (!trainCache$isFull()) { for (gridId in trainCache$getLastGridSearchIndex():length(paramSearch)) { - ParallelLogger::logInfo(paste0("Running hyperparameter combination no ", - gridId)) + ParallelLogger::logInfo(paste0( + "Running hyperparameter combination no ", + gridId + )) ParallelLogger::logInfo(paste0("HyperParameters: ")) ParallelLogger::logInfo(paste(names(paramSearch[[gridId]]), - paramSearch[[gridId]], collapse = " | ")) - currentModelParams <- paramSearch[[gridId]][modelSettings$modelParamNames] - - currentEstimatorSettings <- - fillEstimatorSettings(modelSettings$estimatorSettings, - fitParams, - paramSearch[[gridId]]) - currentEstimatorSettings$modelType <- modelSettings$modelType - currentModelParams$catFeatures <- 
dataset$get_cat_features()$max() - currentModelParams$numFeatures <- - dataset$get_numerical_features()$len() - if (findLR) { - lrFinder <- createLRFinder(modelType = modelSettings$modelType, - modelParameters = currentModelParams, - estimatorSettings = currentEstimatorSettings + paramSearch[[gridId]], + collapse = " | " + )) + hyperparameterResults[[gridId]] <- + doCrossValidation(dataset, + labels = labels, + parameters = paramSearch[[gridId]], + modelSettings = modelSettings ) - lr <- lrFinder$get_lr(dataset) - ParallelLogger::logInfo(paste0("Auto learning rate selected as: ", lr)) - currentEstimatorSettings$learningRate <- lr - } - - crossValidationResults <- - doCrossvalidation(dataset, - labels = labels, - modelSettings = currentModelParams, - estimatorSettings = currentEstimatorSettings) - learnRates <- crossValidationResults$learnRates - prediction <- crossValidationResults$prediction - - gridPerformance <- - PatientLevelPrediction::computeGridPerformance(prediction, - paramSearch[[gridId]]) - maxIndex <- which.max(unlist(sapply(learnRates, `[`, 2))) - gridSearchPredictons[[gridId]] <- list( - prediction = prediction, - param = paramSearch[[gridId]], - gridPerformance = gridPerformance - ) - gridSearchPredictons[[gridId]]$gridPerformance$hyperSummary$learnRates <- - rep(list(unlist(learnRates[[maxIndex]]$LRs)), - nrow(gridSearchPredictons[[gridId]]$gridPerformance$hyperSummary)) - gridSearchPredictons[[gridId]]$param$learnSchedule <- - learnRates[[maxIndex]] - # remove all predictions that are not the max performance - indexOfMax <- - which.max(unlist(lapply(gridSearchPredictons, - function(x) x$gridPerformance$cvPerformance))) - for (i in seq_along(gridSearchPredictons)) { - if (!is.null(gridSearchPredictons[[i]]) && i != indexOfMax) { - gridSearchPredictons[[i]]$prediction <- list(NULL) - } - } - ParallelLogger::logInfo(paste0("Caching all grid search results and - prediction for best combination ", - indexOfMax)) - trainCache$saveGridSearchPredictions(gridSearchPredictons) + # remove all predictions that are not the max performance + hyperparameterResults <- trainCache$trimPerformance(hyperparameterResults) + trainCache$saveGridSearchPredictions(hyperparameterResults) } } - paramGridSearch <- lapply(gridSearchPredictons, function(x) x$gridPerformance) + paramGridSearch <- lapply(hyperparameterResults, + function(x) x$gridPerformance) # get best params indexOfMax <- - which.max(unlist(lapply(gridSearchPredictons, - function(x) x$gridPerformance$cvPerformance))) - finalParam <- gridSearchPredictons[[indexOfMax]]$param + which.max(unlist(lapply( + hyperparameterResults, + function(x) x$gridPerformance$cvPerformance + ))) + if (length(indexOfMax) == 0) { + stop("No hyperparameter combination has valid results") + } + finalParam <- hyperparameterResults[[indexOfMax]]$param - paramGridSearch <- lapply(gridSearchPredictons, function(x) x$gridPerformance) + paramGridSearch <- lapply(hyperparameterResults, + function(x) x$gridPerformance) # get best CV prediction - cvPrediction <- gridSearchPredictons[[indexOfMax]]$prediction + cvPrediction <- hyperparameterResults[[indexOfMax]]$prediction cvPrediction$evaluationType <- "CV" ParallelLogger::logInfo("Training final model using optimal parameters") - # get the params - modelParams <- finalParam[modelSettings$modelParamNames] - - - # create the dir - if (!dir.exists(file.path(modelLocation))) { - dir.create(file.path(modelLocation), recursive = TRUE) - } - - modelParams$catFeatures <- dataset$get_cat_features()$max() - 
modelParams$numFeatures <- dataset$get_numerical_features()$len() - - - estimatorSettings <- fillEstimatorSettings(modelSettings$estimatorSettings, - fitParams, - finalParam) - estimatorSettings$learningRate <- finalParam$learnSchedule$LRs[[1]] - estimator <- createEstimator(modelType = modelSettings$modelType, - modelParameters = modelParams, - estimatorSettings = estimatorSettings) - - numericalIndex <- dataset$get_numerical_features() - estimator$fit_whole_training_set(dataset, finalParam$learnSchedule$LRs) - - ParallelLogger::logInfo("Calculating predictions on all train data...") - prediction <- predictDeepEstimator( - plpModel = estimator, - data = dataset, - cohort = labels - ) - prediction$evaluationType <- "Train" - + trainPrediction <- trainFinalModel(dataset, + finalParam, + modelSettings, + labels) prediction <- rbind( - prediction, + trainPrediction$prediction, cvPrediction ) - # modify prediction + # remove fold index from predictions and remove cohortStartDate prediction <- prediction %>% dplyr::select(-"index") - prediction$cohortStartDate <- as.Date(prediction$cohortStartDate, - origin = "1970-01-01") - - + origin = "1970-01-01") + numericalIndex <- dataset$get_numerical_features() + # save torch code here - estimator$save(modelLocation, "DeepEstimatorModel.pt") + if (!dir.exists(file.path(modelLocation))) { + dir.create(file.path(modelLocation), recursive = TRUE) + } + trainPrediction$estimator$save(modelLocation, "DeepEstimatorModel.pt") return( list( estimator = modelLocation, @@ -476,48 +465,159 @@ evalEstimatorSettings <- function(estimatorSettings) { estimatorSettings } -createEstimator <- function(modelType, - modelParameters, +createEstimator <- function(modelParameters, estimatorSettings) { path <- system.file("python", package = "DeepPatientLevelPrediction") - model <- reticulate::import_from_path(modelType, path = path)[[modelType]] + if (modelParameters$modelType == "Finetuner") { + estimatorSettings$finetune <- TRUE + plpModel <- PatientLevelPrediction::loadPlpModel(modelParameters$modelPath) + estimatorSettings$finetuneModelPath <- + normalizePath(file.path(plpModel$model, "DeepEstimatorModel.pt")) + modelParameters$modelType <- + plpModel$modelDesign$modelSettings$modelType + } + + model <- + reticulate::import_from_path(modelParameters$modelType, + path = path)[[modelParameters$modelType]] estimator <- reticulate::import_from_path("Estimator", path = path)$Estimator modelParameters <- camelCaseToSnakeCaseNames(modelParameters) estimatorSettings <- camelCaseToSnakeCaseNames(estimatorSettings) estimatorSettings <- evalEstimatorSettings(estimatorSettings) - estimator <- estimator(model = model, - model_parameters = modelParameters, - estimator_settings = estimatorSettings) + estimator <- estimator( + model = model, + model_parameters = modelParameters, + estimator_settings = estimatorSettings + ) return(estimator) } -doCrossvalidation <- function(dataset, +doCrossValidation <- function(dataset, labels, - modelSettings, - estimatorSettings) { + parameters, + modelSettings + ) { + crossValidationResults <- + tryCatch(doCrossValidationImpl(dataset, + labels, + parameters, + modelSettings), + error = function(e) { + if (inherits(e, "torch.cuda.OutOfMemoryError")) { + ParallelLogger::logError( + "Out of memory error during cross validation, + trying to continue with next hyperparameter combination" + ) + crossValidationResults <- list() + crossValidationResults$prediction <- labels + crossValidationResults$prediction <- + cbind(crossValidationResults$prediction, 
value = NA) + attr(crossValidationResults$prediction, + "metaData")$modelType <- "binary" + crossValidationResults$param <- parameters + crossValidationResults$param$learnSchedule <- list( + LRs = NA, + bestEpoch = NA + ) + nFolds <- max(labels$index) + hyperSummary <- + data.frame(metric = rep("computeAuc", nFolds + 1), + fold = c("CV", as.character(1:nFolds)), + value = NA) + hyperSummary <- cbind(hyperSummary, parameters) + hyperSummary$learnRates <- NA + + gridPerformance <- list( + metric = "computeAuc", + cvPerformance = NA, + cvPerformancePerFold = rep(NA, nFolds), + param = parameters, + hyperSummary = hyperSummary + ) + crossValidationResults$gridPerformance <- gridPerformance + learnRates <- list() + for (i in 1:nFolds) { + learnRates[[i]] <- list( + LRs = NA, + bestEpoch = NA + ) + } + crossValidationResults$learnRates <- learnRates + return(crossValidationResults) + } else { + stop(e) + } + } + ) + gridSearchPredictions <- list( + prediction = crossValidationResults$prediction, + param = parameters, + gridPerformance = crossValidationResults$gridPerformance + ) + maxIndex <- which.max(unlist(sapply(crossValidationResults$learnRates, + `[`, 2))) + if (length(maxIndex) != 0) { + gridSearchPredictions$gridPerformance$hyperSummary$learnRates <- + rep( + list(unlist(crossValidationResults$learnRates[[maxIndex]]$LRs)), + nrow(gridSearchPredictions$gridPerformance$hyperSummary) + ) + gridSearchPredictions$param$learnSchedule <- + crossValidationResults$learnRates[[maxIndex]] + } + return(gridSearchPredictions) +} + +doCrossValidationImpl <- function(dataset, + labels, + parameters, + modelSettings) { + fitParams <- names(parameters)[grepl( + "^estimator", + names(parameters) + )] + currentModelParams <- parameters[modelSettings$modelParamNames] + attr(currentModelParams, "metaData")$names <- + modelSettings$modelParamNameCH + currentModelParams$modelType <- modelSettings$modelType + currentEstimatorSettings <- + fillEstimatorSettings(modelSettings$estimatorSettings, + fitParams, + parameters) + currentModelParams$catFeatures <- dataset$get_cat_features()$max() + currentModelParams$numFeatures <- dataset$get_numerical_features()$len() + if (currentEstimatorSettings$findLR) { + lr <- getLR(currentModelParams, currentEstimatorSettings, dataset) + ParallelLogger::logInfo(paste0("Auto learning rate selected as: ", lr)) + currentEstimatorSettings$learningRate <- lr + } + fold <- labels$index ParallelLogger::logInfo(paste0("Max fold: ", max(fold))) learnRates <- list() prediction <- NULL + path <- system.file("python", package = "DeepPatientLevelPrediction") + fit_estimator <- reticulate::import_from_path("Estimator", + path = path)$fit_estimator for (i in 1:max(fold)) { ParallelLogger::logInfo(paste0("Fold ", i)) # -1 for python 0-based indexing trainDataset <- torch$utils$data$Subset(dataset, - indices = - as.integer(which(fold != i) - 1)) + indices = + as.integer(which(fold != i) - 1) + ) # -1 for python 0-based indexing testDataset <- torch$utils$data$Subset(dataset, indices = as.integer(which(fold == i) - 1)) - estimator <- createEstimator(modelType = estimatorSettings$modelType, - modelParameters = modelSettings, - estimatorSettings = estimatorSettings) - estimator$fit(trainDataset, testDataset) + estimator <- createEstimator(modelParameters = currentModelParams, + estimatorSettings = currentEstimatorSettings) + fit_estimator(estimator, trainDataset, testDataset) ParallelLogger::logInfo("Calculating predictions on left out fold set...") @@ -534,11 +634,16 @@ doCrossvalidation <- 
function(dataset, bestEpoch = estimator$best_epoch ) } - return(results = list(prediction = prediction, - learnRates = learnRates)) - + gridPerformance <- PatientLevelPrediction::computeGridPerformance(prediction, + parameters) + return(results = list( + prediction = prediction, + learnRates = learnRates, + gridPerformance = gridPerformance + )) } + extractParamsToTune <- function(estimatorSettings) { paramsToTune <- list() for (name in names(estimatorSettings)) { @@ -557,3 +662,34 @@ extractParamsToTune <- function(estimatorSettings) { } return(paramsToTune) } + +trainFinalModel <- function(dataset, finalParam, modelSettings, labels) { + # get the params + modelParams <- finalParam[modelSettings$modelParamNames] + + fitParams <- names(finalParam)[grepl("^estimator", names(finalParam))] + + modelParams$catFeatures <- dataset$get_cat_features()$max() + modelParams$numFeatures <- dataset$get_numerical_features()$len() + modelParams$modelType <- modelSettings$modelType + + estimatorSettings <- fillEstimatorSettings( + modelSettings$estimatorSettings, + fitParams, + finalParam + ) + estimatorSettings$learningRate <- finalParam$learnSchedule$LRs[[1]] + estimator <- createEstimator(modelParameters = modelParams, + estimatorSettings = estimatorSettings) + estimator$fit_whole_training_set(dataset, finalParam$learnSchedule$LRs) + + ParallelLogger::logInfo("Calculating predictions on all train data...") + prediction <- predictDeepEstimator( + plpModel = estimator, + data = dataset, + cohort = labels + ) + prediction$evaluationType <- "Train" + return(list(prediction = prediction, + estimator = estimator)) +} diff --git a/R/HelperFunctions.R b/R/HelperFunctions.R index a1aa43c..08a18db 100644 --- a/R/HelperFunctions.R +++ b/R/HelperFunctions.R @@ -29,6 +29,33 @@ camelCaseToSnakeCase <- function(string) { return(string) } +#' Convert a camel case string to snake case +#' +#' @param string The string to be converted +#' +#' @return +#' A string +#' +snakeCaseToCamelCase <- function(string) { + string <- tolower(string) + for (letter in letters) { + string <- gsub(paste("_", letter, sep = ""), toupper(letter), string) + } + string <- gsub("_([0-9])", "\\1", string) + return(string) +} + +#' Convert the names of an object from snake case to camel case +#' +#' @param object The object of which the names should be converted +#' +#' @return +#' The same object, but with converted names. +snakeCaseToCamelCaseNames <- function(object) { + names(object) <- snakeCaseToCamelCase(names(object)) + return(object) +} + #' Convert the names of an object from camel case to snake case #' #' @param object The object of which the names should be converted diff --git a/R/LRFinder.R b/R/LRFinder.R index 4ab006c..0bca6dc 100644 --- a/R/LRFinder.R +++ b/R/LRFinder.R @@ -15,32 +15,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-createLRFinder <- function(modelType, - modelParameters, - estimatorSettings, - lrSettings = NULL) { +getLR <- function(modelParameters, + estimatorSettings, + dataset, + lrSettings = NULL) { path <- system.file("python", package = "DeepPatientLevelPrediction") - lrFinderClass <- - reticulate::import_from_path("LrFinder", path = path)$LrFinder - - - - model <- reticulate::import_from_path(modelType, path = path)[[modelType]] - modelParameters <- camelCaseToSnakeCaseNames(modelParameters) - estimatorSettings <- camelCaseToSnakeCaseNames(estimatorSettings) - estimatorSettings <- evalEstimatorSettings(estimatorSettings) - - # estimator <- createEstimator(modelType = estimatorSettings$modelType, - # modelParameters = modelParameters, - # estimatorSettings = estimatorSettings) + estimator <- createEstimator(modelParameters = modelParameters, + estimatorSettings = estimatorSettings) if (!is.null(lrSettings)) { lrSettings <- camelCaseToSnakeCaseNames(lrSettings) } - - lrFinder <- lrFinderClass(model = model, - model_parameters = modelParameters, - estimator_settings = estimatorSettings, - lr_settings = lrSettings) - - return(lrFinder) -} + get_lr <- reticulate::import_from_path("LrFinder", path)$get_lr + lr <- get_lr(estimator, dataset, lrSettings) + return(lr) +} \ No newline at end of file diff --git a/R/MLP.R b/R/MultiLayerPerceptron.R similarity index 77% rename from R/MLP.R rename to R/MultiLayerPerceptron.R index 771e244..b0583d4 100644 --- a/R/MLP.R +++ b/R/MultiLayerPerceptron.R @@ -1,4 +1,4 @@ -# @file MLP.R +# @file MultiLayerPerceptron.R # # Copyright 2022 Observational Health Data Sciences and Informatics # @@ -43,11 +43,13 @@ setMultiLayerPerceptron <- function(numLayers = c(1:8), dropout = c(seq(0, 0.3, 0.05)), sizeEmbedding = c(2^(6:9)), estimatorSettings = - setEstimator(learningRate = "auto", - weightDecay = c(1e-6, 1e-3), - batchSize = 1024, - epochs = 30, - device = "cpu"), + setEstimator( + learningRate = "auto", + weightDecay = c(1e-6, 1e-3), + batchSize = 1024, + epochs = 30, + device = "cpu" + ), hyperParamSearch = "random", randomSample = 100, randomSampleSeed = NULL) { @@ -81,31 +83,36 @@ setMultiLayerPerceptron <- function(numLayers = c(1:8), param <- PatientLevelPrediction::listCartesian(paramGrid) if (hyperParamSearch == "random" && randomSample > length(param)) { - stop(paste("\n Chosen amount of randomSamples is higher than the + stop(paste( + "\n Chosen amount of randomSamples is higher than the amount of possible hyperparameter combinations.", - "\n randomSample:", randomSample, "\n Possible hyperparameter + "\n randomSample:", randomSample, "\n Possible hyperparameter combinations:", length(param), - "\n Please lower the amount of randomSamples")) + "\n Please lower the amount of randomSamples" + )) } if (hyperParamSearch == "random") { - suppressWarnings(withr::with_seed(randomSampleSeed, - {param <- param[sample(length(param), - randomSample)]})) + suppressWarnings(withr::with_seed(randomSampleSeed, { + param <- param[sample( + length(param), + randomSample + )] + })) } - attr(param, "settings")$modelType <- "MLP" - results <- list( - fitFunction = "fitEstimator", + fitFunction = "DeepPatientLevelPrediction::fitEstimator", param = param, estimatorSettings = estimatorSettings, - modelType = "MLP", saveType = "file", modelParamNames = c( "numLayers", "sizeHidden", "dropout", "sizeEmbedding" - ) + ), + modelType = "MultiLayerPerceptron" ) + attr(results$param, "settings")$modelType <- results$modelType + class(results) <- "modelSettings" diff --git a/R/ResNet.R 
b/R/ResNet.R index 32e7f3a..4b62ba8 100644 --- a/R/ResNet.R +++ b/R/ResNet.R @@ -29,21 +29,25 @@ #' @export setDefaultResNet <- function(estimatorSettings = - setEstimator(learningRate = "auto", - weightDecay = 1e-6, - device = "cpu", - batchSize = 1024, - epochs = 50, - seed = NULL)) { - resnetSettings <- setResNet(numLayers = 6, - sizeHidden = 512, - hiddenFactor = 2, - residualDropout = 0.1, - hiddenDropout = 0.4, - sizeEmbedding = 256, - estimatorSettings = estimatorSettings, - hyperParamSearch = "random", - randomSample = 1) + setEstimator( + learningRate = "auto", + weightDecay = 1e-6, + device = "cpu", + batchSize = 1024, + epochs = 50, + seed = NULL + )) { + resnetSettings <- setResNet( + numLayers = 6, + sizeHidden = 512, + hiddenFactor = 2, + residualDropout = 0.1, + hiddenDropout = 0.4, + sizeEmbedding = 256, + estimatorSettings = estimatorSettings, + hyperParamSearch = "random", + randomSample = 1 + ) attr(resnetSettings, "settings")$name <- "defaultResnet" return(resnetSettings) } @@ -83,12 +87,14 @@ setResNet <- function(numLayers = c(1:8), hiddenDropout = c(seq(0, 0.5, 0.05)), sizeEmbedding = c(2^(6:9)), estimatorSettings = - setEstimator(learningRate = "auto", - weightDecay = c(1e-6, 1e-3), - device = "cpu", - batchSize = 1024, - epochs = 30, - seed = NULL), + setEstimator( + learningRate = "auto", + weightDecay = c(1e-6, 1e-3), + device = "cpu", + batchSize = 1024, + epochs = 30, + seed = NULL + ), hyperParamSearch = "random", randomSample = 100, randomSampleSeed = NULL) { @@ -114,39 +120,46 @@ setResNet <- function(numLayers = c(1:8), checkIsClass(randomSampleSeed, c("numeric", "integer", "NULL")) - paramGrid <- list(numLayers = numLayers, - sizeHidden = sizeHidden, - hiddenFactor = hiddenFactor, - residualDropout = residualDropout, - hiddenDropout = hiddenDropout, - sizeEmbedding = sizeEmbedding) + paramGrid <- list( + numLayers = numLayers, + sizeHidden = sizeHidden, + hiddenFactor = hiddenFactor, + residualDropout = residualDropout, + hiddenDropout = hiddenDropout, + sizeEmbedding = sizeEmbedding + ) paramGrid <- c(paramGrid, estimatorSettings$paramsToTune) param <- PatientLevelPrediction::listCartesian(paramGrid) if (hyperParamSearch == "random" && randomSample > length(param)) { - stop(paste("\n Chosen amount of randomSamples is higher than the amount of + stop(paste( + "\n Chosen amount of randomSamples is higher than the amount of possible hyperparameter combinations.", "\n randomSample:", - randomSample, "\n Possible hyperparameter combinations:", - length(param), "\n Please lower the amount of randomSamples")) + randomSample, "\n Possible hyperparameter combinations:", + length(param), "\n Please lower the amount of randomSamples" + )) } if (hyperParamSearch == "random") { - suppressWarnings(withr::with_seed(randomSampleSeed, - {param <- param[sample(length(param), - randomSample)]})) + suppressWarnings(withr::with_seed(randomSampleSeed, { + param <- param[sample( + length(param), + randomSample + )] + })) } - attr(param, "settings")$modelType <- "ResNet" results <- list( - fitFunction = "fitEstimator", + fitFunction = "DeepPatientLevelPrediction::fitEstimator", param = param, estimatorSettings = estimatorSettings, - modelType = "ResNet", saveType = "file", modelParamNames = c("numLayers", "sizeHidden", "hiddenFactor", - "residualDropout", "hiddenDropout", "sizeEmbedding") + "residualDropout", "hiddenDropout", "sizeEmbedding"), + modelType = "ResNet" ) + attr(results$param, "settings")$modelType <- results$modelType class(results) <- "modelSettings" diff --git 
a/R/TrainingCache-class.R b/R/TrainingCache-class.R index fe7dfc7..f56865e 100644 --- a/R/TrainingCache-class.R +++ b/R/TrainingCache-class.R @@ -69,10 +69,12 @@ trainingCache <- R6::R6Class( #' Check if cache is full #' @returns Boolen isFull = function() { - return(all(unlist(lapply(private$.paramPersistence$gridSearchPredictions, - function(x) !is.null(x$gridPerformance))))) + return(all(unlist(lapply( + private$.paramPersistence$gridSearchPredictions, + function(x) !is.null(x$gridPerformance) + )))) }, - + #' @description #' Gets the last index from the cached grid search #' @returns Last grid search index @@ -84,8 +86,10 @@ trainingCache <- R6::R6Class( if (length(private$.paramPersistence$gridSearchPredictions) == 1) { return(1) } else { - return(which(sapply(private$.paramPersistence$gridSearchPredictions, - is.null))[1]) + return(which(sapply( + private$.paramPersistence$gridSearchPredictions, + is.null + ))[1]) } } }, @@ -94,6 +98,47 @@ trainingCache <- R6::R6Class( #' Remove the training cache from the analysis path dropCache = function() { # TODO + }, + + #' @description + #' Trims the performance of the hyperparameter results by removing + #' the predictions from all but the best performing hyperparameter + #' @param hyperparameterResults List of hyperparameter results + trimPerformance = function(hyperparameterResults) { + indexOfMax <- + which.max(unlist( + lapply(hyperparameterResults, + function(x) + x$gridPerformance$cvPerformance) + )) + if (length(indexOfMax) != 0) { + for (i in seq_along(hyperparameterResults)) { + if (!is.null(hyperparameterResults[[i]]) && i != indexOfMax) { + hyperparameterResults[[i]]$prediction <- list(NULL) + } + } + ParallelLogger::logInfo( + paste0( + "Caching all grid search results and + prediction for best combination ", + indexOfMax + ) + ) + } + return(hyperparameterResults) } ) ) + +setupCache <- function(analysisPath, parameters) { + trainCache <- trainingCache$new(analysisPath) + if (trainCache$isParamGridIdentical(parameters)) { + hyperparameterResults <- trainCache$getGridSearchPredictions() + } else { + hyperparameterResults <- list() + length(hyperparameterResults) <- length(parameters) + trainCache$saveGridSearchPredictions(hyperparameterResults) + trainCache$saveModelParams(parameters) + } + return(trainCache) +} \ No newline at end of file diff --git a/R/TransferLearning.R b/R/TransferLearning.R new file mode 100644 index 0000000..95d1c04 --- /dev/null +++ b/R/TransferLearning.R @@ -0,0 +1,62 @@ +# @file TransferLearning.R +# +# Copyright 2023 Observational Health Data Sciences and Informatics +# +# This file is part of DeepPatientLevelPrediction +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +#' setFinetuner +#' +#' @description +#' creates settings for using transfer learning to finetune a model +#' +#' @name setFinetuner +#' @param modelPath path to existing plpModel directory +#' @param estimatorSettings settings created with `setEstimator` +#' @export +setFinetuner <- function(modelPath, + estimatorSettings = setEstimator()) { + + if (!dir.exists(modelPath)) { + stop(paste0("supplied modelPath does not exist, you supplied: modelPath = ", + modelPath)) + } + # TODO check if it's a valid path to a plpModel + if (!dir.exists(file.path(modelPath, "model"))) { + stop(paste0("supplied modelPath does not contain a model directory, you supplied: modelPath = ", + modelPath)) + } + if (!file.exists(file.path(modelPath, "model", "DeepEstimatorModel.pt"))) { + stop(paste0("supplied modelPath does not contain a model file, you supplied: modelPath = ", + modelPath)) + } + + + param <- list() + param[[1]] <- list(modelPath = modelPath) + + results <- list( + fitFunction = "fitEstimator", + param = param, + estimatorSettings = estimatorSettings, + saveType = "file", + modelParamNames = c("modelPath"), + modelType = "Finetuner" + ) + attr(results$param, "settings")$modelType <- results$modelType + + class(results) <- "modelSettings" + + return(results) +} diff --git a/R/Transformer.R b/R/Transformer.R index cd4a968..8cb737f 100644 --- a/R/Transformer.R +++ b/R/Transformer.R @@ -25,24 +25,27 @@ #' #' @export setDefaultTransformer <- function(estimatorSettings = - setEstimator(learningRate = "auto", - weightDecay = 1e-4, - batchSize = 512, - epochs = 10, - seed = NULL, - device = "cpu") -) { - transformerSettings <- setTransformer(numBlocks = 3, - dimToken = 192, - dimOut = 1, - numHeads = 8, - attDropout = 0.2, - ffnDropout = 0.1, - resDropout = 0.0, - dimHidden = 256, - estimatorSettings = estimatorSettings, - hyperParamSearch = "random", - randomSample = 1) + setEstimator( + learningRate = "auto", + weightDecay = 1e-4, + batchSize = 512, + epochs = 10, + seed = NULL, + device = "cpu" + )) { + transformerSettings <- setTransformer( + numBlocks = 3, + dimToken = 192, + dimOut = 1, + numHeads = 8, + attDropout = 0.2, + ffnDropout = 0.1, + resDropout = 0.0, + dimHidden = 256, + estimatorSettings = estimatorSettings, + hyperParamSearch = "random", + randomSample = 1 + ) attr(transformerSettings, "settings")$name <- "defaultTransformer" return(transformerSettings) } @@ -73,22 +76,23 @@ setDefaultTransformer <- function(estimatorSettings = #' #' @export setTransformer <- function(numBlocks = 3, - dimToken = 96, + dimToken = 192, dimOut = 1, numHeads = 8, - attDropout = 0.25, - ffnDropout = 0.25, + attDropout = 0.2, + ffnDropout = 0.1, resDropout = 0, - dimHidden = 512, + dimHidden = 256, dimHiddenRatio = NULL, - estimatorSettings = setEstimator(weightDecay = 1e-6, - batchSize = 1024, - epochs = 10, - seed = NULL), + estimatorSettings = setEstimator( + weightDecay = 1e-6, + batchSize = 1024, + epochs = 10, + seed = NULL + ), hyperParamSearch = "random", randomSample = 1, randomSampleSeed = NULL) { - checkIsClass(numBlocks, c("integer", "numeric")) checkHigherEqual(numBlocks, 1) @@ -127,23 +131,23 @@ setTransformer <- function(numBlocks = 3, checkIsClass(randomSampleSeed, c("numeric", "integer", "NULL")) - if (any(with(expand.grid(dimToken = dimToken, numHeads = numHeads), - dimToken %% numHeads != 0))) { + if (any(with( + expand.grid(dimToken = dimToken, numHeads = numHeads), + dimToken %% numHeads != 0 + ))) { stop(paste( "dimToken needs to divisible by numHeads. 
dimToken =", dimToken, "is not divisible by numHeads =", numHeads )) } - if (is.null(dimHidden) && is.null(dimHiddenRatio) - || !is.null(dimHidden) && !is.null(dimHiddenRatio)) { + if (is.null(dimHidden) && is.null(dimHiddenRatio) || + !is.null(dimHidden) && !is.null(dimHiddenRatio)) { stop(paste( "dimHidden and dimHiddenRatio cannot be both set or both NULL" )) - } else { - if (!is.null(dimHiddenRatio)) { - dimHidden <- dimHiddenRatio - } + } else if (!is.null(dimHiddenRatio)) { + dimHidden <- dimHiddenRatio } paramGrid <- list( @@ -169,30 +173,34 @@ setTransformer <- function(numBlocks = 3, } if (hyperParamSearch == "random" && randomSample > length(param)) { - stop(paste("\n Chosen amount of randomSamples is higher than the amount of + stop(paste( + "\n Chosen amount of randomSamples is higher than the amount of possible hyperparameter combinations.", "\n randomSample:", - randomSample, "\n Possible hyperparameter combinations:", - length(param), "\n Please lower the amount of randomSample")) + randomSample, "\n Possible hyperparameter combinations:", + length(param), "\n Please lower the amount of randomSample" + )) } if (hyperParamSearch == "random") { - suppressWarnings(withr::with_seed(randomSampleSeed, - {param <- param[sample(length(param), - randomSample)]})) + suppressWarnings(withr::with_seed(randomSampleSeed, { + param <- param[sample( + length(param), + randomSample + )] + })) } - attr(param, "settings")$modelType <- "Transformer" results <- list( - fitFunction = "fitEstimator", + fitFunction = "DeepPatientLevelPrediction::fitEstimator", param = param, estimatorSettings = estimatorSettings, - modelType = "Transformer", saveType = "file", modelParamNames = c( "numBlocks", "dimToken", "dimOut", "numHeads", "attDropout", "ffnDropout", "resDropout", "dimHidden" - ) + ), + modelType = "Transformer" ) - + attr(results$param, "settings")$modelType <- results$modelType class(results) <- "modelSettings" return(results) } diff --git a/_pkgdown.yml b/_pkgdown.yml index 411f5c7..2f5fbb5 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -33,6 +33,9 @@ navbar: firstModel: text: My first deep learning model href: articles/FirstModel.html + transferlearning: + text: Transfer learning + href: articles/TransferLearning.html news: text: Changelog href: news/index.html diff --git a/inst/python/Dataset.py b/inst/python/Dataset.py index 6d105b8..ed3c3bd 100644 --- a/inst/python/Dataset.py +++ b/inst/python/Dataset.py @@ -1,6 +1,6 @@ import time import pathlib -import urllib +from urllib.parse import quote import polars as pl import torch @@ -16,9 +16,9 @@ def __init__(self, data, labels=None, numerical_features=None): """ start = time.time() if pathlib.Path(data).suffix == ".sqlite": - data = urllib.parse.quote(data) - data = pl.read_database( - "SELECT * from covariates", connection=f"sqlite://{data}" + data = quote(data) + data = pl.read_database_uri( + "SELECT * from covariates", uri=f"sqlite://{data}" ).lazy() else: data = pl.scan_ipc(pathlib.Path(data).joinpath("covariates/*.arrow")) @@ -26,10 +26,11 @@ def __init__(self, data, labels=None, numerical_features=None): # detect features are numeric if numerical_features is None: self.numerical_features = ( - data.groupby(by="columnId") + data.group_by("columnId") .n_unique() .filter(pl.col("covariateValue") > 1) .select("columnId") + .sort("columnId") .collect()["columnId"] ) else: @@ -41,10 +42,7 @@ def __init__(self, data, labels=None, numerical_features=None): self.target = torch.zeros(size=(observations,)) # filter by categorical columns, - # 
sort and group_by columnId - # create newColumnId from 1 (or zero?) until # catColumns - # select rowId and newColumnId - # rename newColumnId to columnId and sort by it + # select rowId and columnId data_cat = ( data.filter(~pl.col("columnId").is_in(self.numerical_features)) .select(pl.col("rowId"), pl.col("columnId")) @@ -52,7 +50,7 @@ def __init__(self, data, labels=None, numerical_features=None): .with_columns(pl.col("rowId") - 1) .collect() ) - cat_tensor = torch.as_tensor(data_cat.to_numpy()) + cat_tensor = torch.tensor(data_cat.to_numpy()) tensor_list = torch.split( cat_tensor[:, 1], torch.unique_consecutive(cat_tensor[:, 0], return_counts=True)[1].tolist(), @@ -72,13 +70,19 @@ def __init__(self, data, labels=None, numerical_features=None): if self.numerical_features.count() == 0: self.num = None else: - map_numerical = dict(zip(self.numerical_features.sort().to_list(), - list(range(len(self.numerical_features))))) + map_numerical = dict( + zip( + self.numerical_features.sort().to_list(), + list(range(len(self.numerical_features))), + ) + ) numerical_data = ( data.filter(pl.col("columnId").is_in(self.numerical_features)) - .with_columns(pl.col("columnId").replace(map_numerical), - pl.col("rowId") - 1) + .sort("columnId") + .with_columns( + pl.col("columnId").replace(map_numerical), pl.col("rowId") - 1 + ) .select( pl.col("rowId"), pl.col("columnId"), @@ -90,7 +94,7 @@ def __init__(self, data, labels=None, numerical_features=None): numerical_data.select(["rowId", "columnId"]).to_numpy(), dtype=torch.long, ) - values = torch.as_tensor( + values = torch.tensor( numerical_data.select("covariateValue").to_numpy(), dtype=torch.float ) self.num = torch.sparse_coo_tensor( @@ -115,7 +119,7 @@ def __getitem__(self, item): batch = {"cat": self.cat[item, :], "num": self.num[item, :]} else: batch = {"cat": self.cat[item, :].squeeze(), "num": None} - if batch["cat"].dim() == 1 and not isinstance(item, list): + if batch["cat"].dim() == 1: batch["cat"] = batch["cat"].unsqueeze(0) if ( batch["num"] is not None @@ -124,3 +128,8 @@ def __getitem__(self, item): ): batch["num"] = batch["num"].unsqueeze(0) return [batch, self.target[item].squeeze()] + + + + + diff --git a/inst/python/Estimator.py b/inst/python/Estimator.py index 983bde6..f0e020d 100644 --- a/inst/python/Estimator.py +++ b/inst/python/Estimator.py @@ -3,9 +3,10 @@ import torch from torch.utils.data import DataLoader, BatchSampler, RandomSampler, SequentialSampler -import torch.nn.functional as F from tqdm import tqdm +from gpu_memory_cleanup import memory_cleanup + class Estimator: """ @@ -18,18 +19,37 @@ def __init__(self, model, model_parameters, estimator_settings): self.device = estimator_settings["device"]() else: self.device = estimator_settings["device"] - torch.manual_seed(seed=self.seed) - self.model = model(**model_parameters) + if "finetune" in estimator_settings.keys() and estimator_settings["finetune"]: + path = estimator_settings["finetune_model_path"] + fitted_estimator = torch.load(path, map_location="cpu") + fitted_parameters = fitted_estimator["model_parameters"] + self.model = model(**fitted_parameters) + self.model.load_state_dict(fitted_estimator["model_state_dict"]) + for param in self.model.parameters(): + param.requires_grad = False + self.model.reset_head() + else: + self.model = model(**model_parameters) self.model_parameters = model_parameters self.estimator_settings = estimator_settings self.epochs = int(estimator_settings.get("epochs", 5)) - self.learning_rate = estimator_settings.get("learning_rate", 3e-4) + 
if estimator_settings["find_l_r"]: + self.learning_rate = 3e-4 + else: + self.learning_rate = estimator_settings.get("learning_rate", 3e-4) self.weight_decay = estimator_settings.get("weight_decay", 1e-5) self.batch_size = int(estimator_settings.get("batch_size", 1024)) self.prefix = estimator_settings.get("prefix", self.model.name) - + + if "accumulation_steps" in estimator_settings.keys() and estimator_settings["accumulation_steps"]: + self.accumulation_steps = int(estimator_settings["accumulation_steps"]) + self.sub_batch_size = self.batch_size // self.accumulation_steps + else: + self.accumulation_steps = 1 + self.sub_batch_size = self.batch_size + self.previous_epochs = int(estimator_settings.get("previous_epochs", 0)) self.model.to(device=self.device) @@ -38,7 +58,7 @@ def __init__(self, model, model_parameters, estimator_settings): lr=self.learning_rate, weight_decay=self.weight_decay, ) - self.criterion = estimator_settings["criterion"]() + self.criterion = estimator_settings["criterion"](reduction="sum") if ( "metric" in estimator_settings.keys() @@ -151,15 +171,22 @@ def fit_epoch(self, dataloader): training_losses = torch.empty(len(dataloader)) self.model.train() index = 0 + self.optimizer.zero_grad() for batch in tqdm(dataloader): - self.optimizer.zero_grad() - batch = batch_to_device(batch, device=self.device) - out = self.model(batch[0]) - loss = self.criterion(out, batch[1]) - loss.backward() - + split_batch = self.split_batch(batch) + accumulated_loss = 0 + all_out = [] + for sub_batch in split_batch: + sub_batch = batch_to_device(sub_batch, device=self.device) + out = self.model(sub_batch[0]) + all_out.append(out.detach()) + loss = self.criterion(out.squeeze(), sub_batch[1]) + loss.backward() + accumulated_loss += loss.detach() + self.optimizer.step() - training_losses[index] = loss.detach() + self.optimizer.zero_grad() + training_losses[index] = accumulated_loss / self.batch_size index += 1 return training_losses.mean().item() @@ -171,11 +198,16 @@ def score(self, dataloader): self.model.eval() index = 0 for batch in tqdm(dataloader): - batch = batch_to_device(batch, device=self.device) - pred = self.model(batch[0]) - predictions.append(pred) - targets.append(batch[1]) - loss[index] = self.criterion(pred, batch[1]) + split_batch = self.split_batch(batch) + accumulated_loss = 0 + for sub_batch in split_batch: + sub_batch = batch_to_device(sub_batch, device=self.device) + pred = self.model(sub_batch[0]) + predictions.append(pred) + targets.append(sub_batch[1]) + accumulated_loss += self.criterion(pred.squeeze(), sub_batch[1]).detach() + loss[index] = accumulated_loss / self.batch_size + index += 1 mean_loss = loss.mean().item() predictions = torch.concat(predictions) @@ -248,6 +280,22 @@ def print_progress(self, scores, training_loss, delta_time, current_epoch): ) return + def split_batch(self, batch): + if self.accumulation_steps > 1 and len(batch[0]["cat"]) > self.sub_batch_size: + data, labels = batch + split_data = {key: list(torch.split(value, self.sub_batch_size)) + for key, value in data.items() if value is not None} + split_labels = list(torch.split(labels, self.sub_batch_size)) + + sub_batches = [] + for i in range(len(split_labels)): + sub_batch = {key: value[i] for key, value in split_data.items()} + sub_batch = [sub_batch, split_labels[i]] + sub_batches.append(sub_batch) + else: + sub_batches = [batch] + return sub_batches + def fit_whole_training_set(self, dataset, learning_rates=None): dataloader = DataLoader( dataset=dataset, @@ -296,9 +344,11 @@ def 
predict_proba(self, dataset): predictions = list() self.model.eval() for batch in tqdm(dataloader): - batch = batch_to_device(batch, device=self.device) - pred = self.model(batch[0]) - predictions.append(torch.sigmoid(pred)) + split_batch = self.split_batch(batch) + for sub_batch in split_batch: + sub_batch = batch_to_device(sub_batch, device=self.device) + pred = self.model(sub_batch[0]) + predictions.append(torch.sigmoid(pred)) predictions = torch.concat(predictions).cpu().numpy() return predictions @@ -395,3 +445,12 @@ def compute_auc(y_true, y_pred): # Compute AUC auc = num_crossings / (n_pos * n_neg) return auc + + +def fit_estimator(estimator, train, test): + try: + estimator.fit(train, test) + except torch.cuda.OutOfMemoryError as e: + memory_cleanup() + raise e + return estimator diff --git a/inst/python/LrFinder.py b/inst/python/LrFinder.py index e7141cd..ebd088f 100644 --- a/inst/python/LrFinder.py +++ b/inst/python/LrFinder.py @@ -1,3 +1,4 @@ +from os import walk import random import torch @@ -5,6 +6,7 @@ from tqdm import tqdm from Estimator import batch_to_device +from gpu_memory_cleanup import memory_cleanup class ExponentialSchedulerPerBatch(_LRScheduler): @@ -19,71 +21,70 @@ def get_lr(self): class LrFinder: - def __init__(self, model, model_parameters, estimator_settings, lr_settings): + def __init__(self, estimator, lr_settings=None): + self.first_batch = None if lr_settings is None: lr_settings = {} - min_lr = lr_settings.get("min_lr", 1e-7) - max_lr = lr_settings.get("max_lr", 1) - num_lr = lr_settings.get("num_lr", 100) - smooth = lr_settings.get("smooth", 0.05) - divergence_threshold = lr_settings.get("divergence_threshold", 4) - torch.manual_seed(seed=estimator_settings["seed"]) - self.seed = estimator_settings["seed"] - self.model = model(**model_parameters) - if callable(estimator_settings["device"]): - self.device = estimator_settings["device"]() + self.min_lr = lr_settings.get("min_lr", 1e-7) + self.max_lr = lr_settings.get("max_lr", 1) + self.num_lr = lr_settings.get("num_lr", 100) + self.smooth = lr_settings.get("smooth", 0.05) + self.divergence_threshold = lr_settings.get("divergence_threshold", 4) + torch.manual_seed(seed=estimator.seed) + self.seed = estimator.seed + if self.num_lr < 20: + self.min_factor = 0 else: - self.device = estimator_settings["device"] - self.model.to(device=self.device) - self.min_lr = min_lr - self.max_lr = max_lr - self.num_lr = num_lr - self.smooth = smooth - self.divergence_threshold = divergence_threshold - - self.optimizer = estimator_settings["optimizer"]( - params=self.model.parameters(), lr=self.min_lr - ) + self.min_factor = 5 - self.scheduler = ExponentialSchedulerPerBatch( - self.optimizer, self.max_lr, self.num_lr - ) + for group in estimator.optimizer.param_groups: + group['lr'] = self.min_lr - self.criterion = estimator_settings["criterion"]() - self.batch_size = int(estimator_settings["batch_size"]) + estimator.scheduler = ExponentialSchedulerPerBatch( + estimator.optimizer, self.max_lr, self.num_lr + ) + if estimator.accumulation_steps > 1: + self.accumulation_steps = estimator.accumulation_steps + else: + self.accumulation_steps = 1 + self.estimator = estimator self.losses = None self.loss_index = None def get_lr(self, dataset): + if len(dataset) < self.estimator.batch_size: + self.estimator.batch_size = len(dataset) batch_index = torch.arange(0, len(dataset), 1).tolist() random.seed(self.seed) losses = torch.empty(size=(self.num_lr,), dtype=torch.float) lrs = torch.empty(size=(self.num_lr,), dtype=torch.float) + 
self.estimator.optimizer.zero_grad() + best_loss = float("inf") for i in tqdm(range(self.num_lr)): - self.optimizer.zero_grad() - random_batch = random.sample(batch_index, self.batch_size) - batch = dataset[random_batch] - batch = batch_to_device(batch, self.device) - - out = self.model(batch[0]) - loss = self.criterion(out, batch[1]) + loss_value = 0 + random_batch = random.sample(batch_index, self.estimator.batch_size) + for j in range(self.accumulation_steps): + batch = dataset[random_batch[j * self.estimator.sub_batch_size : (j + 1) * self.estimator.sub_batch_size]] + batch = batch_to_device(batch, self.estimator.device) + + out = self.estimator.model(batch[0]) + loss = self.estimator.criterion(out, batch[1]) + loss.backward() + loss_value += loss.item() + loss_value = loss_value / self.accumulation_steps if self.smooth is not None and i != 0: losses[i] = ( - self.smooth * loss.item() + (1 - self.smooth) * losses[i - 1] + self.smooth * loss_value + (1 - self.smooth) * losses[i - 1] ) else: - losses[i] = loss.item() - lrs[i] = self.optimizer.param_groups[0]["lr"] + losses[i] = loss_value + lrs[i] = self.estimator.optimizer.param_groups[0]["lr"] - loss.backward() - self.optimizer.step() - self.scheduler.step() + self.estimator.optimizer.step() + self.estimator.scheduler.step() - if i == 0: + if losses[i] < best_loss: best_loss = losses[i] - else: - if losses[i] < best_loss: - best_loss = losses[i] if losses[i] > (self.divergence_threshold * best_loss): print( @@ -93,12 +94,29 @@ def get_lr(self, dataset): # find LR where gradient is highest but before global minimum is reached # I added -5 to make sure it is not still in the minimum - global_minimum = torch.argmin(losses) - gradient = torch.diff(losses[: (global_minimum - 5) + 1]) - smallest_gradient = torch.argmin(gradient) + global_minimum = torch.argmin(losses[:i + 1]) + if global_minimum == 0: + biggest_gradient = 0 + else: + gradient = torch.diff(losses[: (global_minimum - self.min_factor) + 1]) + biggest_gradient = torch.argmax(gradient) - suggested_lr = lrs[smallest_gradient] - self.losses = losses - self.loss_index = smallest_gradient + suggested_lr = lrs[biggest_gradient] + self.losses = losses[:i] + self.loss_index = biggest_gradient self.lrs = lrs return suggested_lr.item() + + +def get_lr(estimator, + dataset, + lr_settings): + try: + lr_finder = LrFinder(estimator, lr_settings=lr_settings) + lr = lr_finder.get_lr(dataset) + except torch.cuda.OutOfMemoryError as e: + memory_cleanup() + raise e + return lr + + diff --git a/inst/python/MLP.py b/inst/python/MultiLayerPerceptron.py similarity index 80% rename from inst/python/MLP.py rename to inst/python/MultiLayerPerceptron.py index cd049a6..7ff1539 100644 --- a/inst/python/MLP.py +++ b/inst/python/MultiLayerPerceptron.py @@ -3,7 +3,7 @@ from ResNet import NumericalEmbedding -class MLP(nn.Module): +class MultiLayerPerceptron(nn.Module): def __init__( self, cat_features: int, @@ -13,17 +13,18 @@ def __init__( num_layers: int, activation=nn.ReLU, normalization=nn.BatchNorm1d, - dropout=None, - d_out: int = 1, + dropout=0.0, + dim_out: int = 1, + model_type="MultiLayerPerceptron" ): - super(MLP, self).__init__() - self.name = "MLP" + super(MultiLayerPerceptron, self).__init__() + self.name = model_type cat_features = int(cat_features) num_features = int(num_features) size_embedding = int(size_embedding) size_hidden = int(size_hidden) num_layers = int(num_layers) - d_out = int(d_out) + dim_out = int(dim_out) self.embedding = nn.EmbeddingBag( cat_features + 1, size_embedding, 
padding_idx=0 @@ -35,7 +36,7 @@ def __init__( self.first_layer = nn.Linear(size_embedding, size_hidden) self.layers = nn.ModuleList( - MLPLayer( + MlpLayer( size_hidden=size_hidden, normalization=normalization, activation=activation, @@ -44,7 +45,9 @@ def __init__( for _ in range(num_layers) ) self.last_norm = normalization(size_hidden) - self.head = nn.Linear(size_hidden, d_out) + self.head = nn.Linear(size_hidden, dim_out) + self.size_hidden = size_hidden + self.dim_out = dim_out self.last_act = activation() @@ -65,8 +68,11 @@ def forward(self, input): x = x.squeeze(-1) return x + def reset_head(self): + self.head = nn.Linear(self.size_hidden, self.dim_out) + -class MLPLayer(nn.Module): +class MlpLayer(nn.Module): def __init__( self, size_hidden=64, @@ -75,7 +81,7 @@ def __init__( dropout=0.0, bias=True, ): - super(MLPLayer, self).__init__() + super(MlpLayer, self).__init__() self.norm = normalization(size_hidden) self.activation = activation() self.linear = nn.Linear(size_hidden, size_hidden, bias=bias) diff --git a/inst/python/ResNet.py b/inst/python/ResNet.py index 9df35d8..7d60410 100644 --- a/inst/python/ResNet.py +++ b/inst/python/ResNet.py @@ -19,9 +19,10 @@ def __init__( residual_dropout=0, dim_out: int = 1, concat_num=True, + model_type="ResNet" ): super(ResNet, self).__init__() - self.name = "ResNet" + self.name = model_type cat_features = int(cat_features) num_features = int(num_features) size_embedding = int(size_embedding) @@ -58,6 +59,8 @@ def __init__( self.last_norm = normalization(size_hidden) self.head = nn.Linear(size_hidden, dim_out) + self.size_hidden = size_hidden + self.dim_out = dim_out self.last_act = activation() @@ -87,6 +90,9 @@ def forward(self, x): x = x.squeeze(-1) return x + def reset_head(self): + self.head = nn.Linear(self.size_hidden, self.dim_out) + class ResLayer(nn.Module): def __init__( diff --git a/inst/python/Transformer.py b/inst/python/Transformer.py index ddbe929..ec3707a 100644 --- a/inst/python/Transformer.py +++ b/inst/python/Transformer.py @@ -35,9 +35,10 @@ def __init__( ffn_norm=nn.LayerNorm, head_norm=nn.LayerNorm, att_norm=nn.LayerNorm, + model_type="Transformer" ): super(Transformer, self).__init__() - self.name = "Transformer" + self.name = model_type cat_features = int(cat_features) num_features = int(num_features) num_blocks = int(num_blocks) @@ -88,6 +89,10 @@ def __init__( normalization=head_norm, dim_out=dim_out, ) + self.dim_token = dim_token + self.head_activation = head_activation + self.head_normalization = head_norm + self.dim_out = dim_out def forward(self, x): mask = torch.where(x["cat"] == 0, True, False) @@ -141,6 +146,15 @@ def forward(self, x): x = self.head(x)[:, 0] return x + def reset_head(self): + self.head = Head( + self.dim_token, + bias=True, + activation=self.head_activation, + normalization=self.head_normalization, + dim_out=self.dim_out + ) + @staticmethod def start_residual(layer, stage, x): norm = f"{stage}_norm" diff --git a/inst/python/gpu_memory_cleanup.py b/inst/python/gpu_memory_cleanup.py new file mode 100644 index 0000000..21f3944 --- /dev/null +++ b/inst/python/gpu_memory_cleanup.py @@ -0,0 +1,67 @@ +import gc +import contextlib + +import torch +import pynvml + + +def memory_cleanup(): + try: + gc.collect() + torch.cuda.empty_cache() + return + finally: + gc.collect() + torch.cuda.empty_cache() + if (mem := gpu_memory_use()) > 3.0: + print("GPU memory usage still high") + cnt = 0 + with contextlib.redirect_stderr(None): + for obj in get_tensors(): + obj.detach() + obj.grad = None + 
obj.storage().resize_(0) + cnt += 1 + gc.collect() + torch.cuda.empty_cache() + usage, cache, misc = gpu_memory_usage_all() + print( + f" forcibly cleared {cnt} tensors: {mem:.03f}GB -> {usage:.03f}GB (+{cache:.03f}GB cache, +{misc:.03f}GB misc)" + ) + + +def get_tensors(gpu_only=True): + for obj in gc.get_objects(): + try: + if torch.is_tensor(obj): + tensor = obj + elif hasattr(obj, "data") and torch.is_tensor(obj.data): + tensor = obj.data + else: + continue + if tensor.is_cuda or not gpu_only: + yield tensor + except Exception: + continue + + +def gpu_memory_use(device=0): + return torch.cuda.memory_allocated(device) / 1024.0 ** 3 + + +def gpu_memory_usage_all(device=0): + usage = torch.cuda.memory_allocated() / 1024.0 ** 3 + reserved = torch.cuda.memory_reserved() / 1024.0 ** 3 + smi = gpu_memory_usage_smi(device) + return usage, reserved - usage, max(0, smi - reserved) + + +def gpu_memory_usage_smi(device=0): + if isinstance(device, torch.device): + device = device.index + if isinstance(device, str) and device.startswith("cuda"): + device = int(device[5:]) + pynvml.nvmlInit() + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + info = pynvml.nvmlDeviceGetMemoryInfo(handle) + return info.used / 1024.0 ** 3 diff --git a/man/DeepPatientLevelPrediction.Rd b/man/DeepPatientLevelPrediction.Rd index 9eb3b36..c1ad0b9 100644 --- a/man/DeepPatientLevelPrediction.Rd +++ b/man/DeepPatientLevelPrediction.Rd @@ -2,9 +2,30 @@ % Please edit documentation in R/DeepPatientLevelPrediction.R \docType{package} \name{DeepPatientLevelPrediction} +\alias{DeepPatientLevelPrediction-package} \alias{DeepPatientLevelPrediction} \title{DeepPatientLevelPrediction} \description{ A package containing deep learning extensions for developing prediction models using data in the OMOP CDM } +\seealso{ +Useful links: +\itemize{ + \item \url{https://github.com/OHDSI/DeepPatientLevelPrediction} + \item Report bugs at \url{https://github.com/OHDSI/DeepPatientLevelPrediction/issues} +} + +} +\author{ +\strong{Maintainer}: Egill Fridgeirsson \email{e.fridgeirsson@erasmusmc.nl} + +Authors: +\itemize{ + \item Jenna Reps \email{jreps@its.jnj.com} + \item Seng Chan You + \item Chungsoo Kim + \item Henrik John +} + +} diff --git a/man/setEstimator.Rd b/man/setEstimator.Rd index b8424a3..d9dd3d8 100644 --- a/man/setEstimator.Rd +++ b/man/setEstimator.Rd @@ -16,6 +16,7 @@ setEstimator( criterion = torch$nn$BCEWithLogitsLoss, earlyStopping = list(useEarlyStopping = TRUE, params = list(patience = 4)), metric = "auc", + accumulationSteps = NULL, seed = NULL ) } @@ -29,8 +30,7 @@ setEstimator( \item{epochs}{how many epochs to train for} \item{device}{what device to train on, can be a string or a function to -that evaluates -to the device during runtime} +that evaluates to the device during runtime} \item{optimizer}{which optimizer to use} @@ -48,6 +48,9 @@ Needs to be a list with function `fun`, mode either `min` or `max` and a `fun` needs to be a function that takes in prediction and labels and outputs a score.} +\item{accumulationSteps}{how many steps to accumulate gradients before +updating weights, can also be a function that is evaluated during runtime} + \item{seed}{seed to initialize weights of model with} } \description{ diff --git a/man/setFinetuner.Rd b/man/setFinetuner.Rd new file mode 100644 index 0000000..f439350 --- /dev/null +++ b/man/setFinetuner.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/TransferLearning.R +\name{setFinetuner} +\alias{setFinetuner} 
+\title{setFinetuner} +\usage{ +setFinetuner(modelPath, estimatorSettings = setEstimator()) +} +\arguments{ +\item{modelPath}{path to existing plpModel directory} + +\item{estimatorSettings}{settings created with `setEstimator`} +} +\description{ +creates settings for using transfer learning to finetune a model +} diff --git a/man/setTransformer.Rd b/man/setTransformer.Rd index 9afcbd5..4a68b1e 100644 --- a/man/setTransformer.Rd +++ b/man/setTransformer.Rd @@ -6,13 +6,13 @@ \usage{ setTransformer( numBlocks = 3, - dimToken = 96, + dimToken = 192, dimOut = 1, numHeads = 8, - attDropout = 0.25, - ffnDropout = 0.25, + attDropout = 0.2, + ffnDropout = 0.1, resDropout = 0, - dimHidden = 512, + dimHidden = 256, dimHiddenRatio = NULL, estimatorSettings = setEstimator(weightDecay = 1e-06, batchSize = 1024, epochs = 10, seed = NULL), diff --git a/man/snakeCaseToCamelCase.Rd b/man/snakeCaseToCamelCase.Rd new file mode 100644 index 0000000..1be21a8 --- /dev/null +++ b/man/snakeCaseToCamelCase.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/HelperFunctions.R +\name{snakeCaseToCamelCase} +\alias{snakeCaseToCamelCase} +\title{Convert a camel case string to snake case} +\usage{ +snakeCaseToCamelCase(string) +} +\arguments{ +\item{string}{The string to be converted} +} +\value{ +A string +} +\description{ +Convert a camel case string to snake case +} diff --git a/man/snakeCaseToCamelCaseNames.Rd b/man/snakeCaseToCamelCaseNames.Rd new file mode 100644 index 0000000..111dab1 --- /dev/null +++ b/man/snakeCaseToCamelCaseNames.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/HelperFunctions.R +\name{snakeCaseToCamelCaseNames} +\alias{snakeCaseToCamelCaseNames} +\title{Convert the names of an object from snake case to camel case} +\usage{ +snakeCaseToCamelCaseNames(object) +} +\arguments{ +\item{object}{The object of which the names should be converted} +} +\value{ +The same object, but with converted names. +} +\description{ +Convert the names of an object from snake case to camel case +} diff --git a/man/torch.Rd b/man/torch.Rd new file mode 100644 index 0000000..79a3c29 --- /dev/null +++ b/man/torch.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/DeepPatientLevelPrediction.R +\docType{data} +\name{torch} +\alias{torch} +\title{Pytorch module} +\format{ +An object of class `python.builtin.module` +} +\value{ +the torch Python module +} +\description{ +The `torch` module object is the equivalent of +`reticulate::import("torch")` and provided mainly as a convenience. +} +\keyword{datasets} diff --git a/man/trainingCache.Rd b/man/trainingCache.Rd index 77d2efc..8be59fb 100644 --- a/man/trainingCache.Rd +++ b/man/trainingCache.Rd @@ -26,6 +26,7 @@ Parameter caching for training persistence and continuity \item \href{#method-TrainingCache-isFull}{\code{trainingCache$isFull()}} \item \href{#method-TrainingCache-getLastGridSearchIndex}{\code{trainingCache$getLastGridSearchIndex()}} \item \href{#method-TrainingCache-dropCache}{\code{trainingCache$dropCache()}} +\item \href{#method-TrainingCache-trimPerformance}{\code{trainingCache$trimPerformance()}} \item \href{#method-TrainingCache-clone}{\code{trainingCache$clone()}} } } @@ -137,6 +138,24 @@ Remove the training cache from the analysis path \if{html}{\out{
}}\preformatted{trainingCache$dropCache()}\if{html}{\out{
}} } +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-TrainingCache-trimPerformance}{}}} +\subsection{Method \code{trimPerformance()}}{ +Trims the performance of the hyperparameter results by removing +the predictions from all but the best performing hyperparameter +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{trainingCache$trimPerformance(hyperparameterResults)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{hyperparameterResults}}{List of hyperparameter results} +} +\if{html}{\out{
}} +} } \if{html}{\out{
}} \if{html}{\out{}} diff --git a/tests/testthat/setup.R b/tests/testthat/setup.R index 3e95d1a..c0dfe5a 100644 --- a/tests/testthat/setup.R +++ b/tests/testthat/setup.R @@ -1,6 +1,6 @@ library(PatientLevelPrediction) -testLoc <- tempdir() +testLoc <- normalizePath(tempdir()) torch <- reticulate::import("torch") # get connection and data from Eunomia connectionDetails <- Eunomia::getEunomiaConnectionDetails() @@ -110,4 +110,5 @@ if (!dir.exists(fitEstimatorPath)) { fitEstimatorResults <- fitEstimator(trainData$Train, modelSettings = modelSettings, analysisId = 1, - analysisPath = fitEstimatorPath) \ No newline at end of file + analysisPath = fitEstimatorPath) +PatientLevelPrediction::savePlpModel(fitEstimatorResults, file.path(fitEstimatorPath, "plpModel")) diff --git a/tests/testthat/test-Dataset.R b/tests/testthat/test-Dataset.R index fd3ce92..78c281a 100644 --- a/tests/testthat/test-Dataset.R +++ b/tests/testthat/test-Dataset.R @@ -122,4 +122,4 @@ test_that("Column order is preserved when features are missing", { expect_equal(dataset$get_cat_features()$max(), reducedDataset$get_cat_features()$max()) -}) +}) \ No newline at end of file diff --git a/tests/testthat/test-Estimator.R b/tests/testthat/test-Estimator.R index f277d0c..36cc79b 100644 --- a/tests/testthat/test-Estimator.R +++ b/tests/testthat/test-Estimator.R @@ -2,12 +2,13 @@ catFeatures <- smallDataset$dataset$get_cat_features()$max() numFeatures <- smallDataset$dataset$get_numerical_features()$len() modelParameters <- list( - cat_features = catFeatures, - num_features = numFeatures, - size_embedding = 16, - size_hidden = 16, - num_layers = 2, - hidden_factor = 2 + catFeatures = catFeatures, + numFeatures = numFeatures, + sizeEmbedding = 16, + sizeHidden = 16, + numLayers = 2, + hiddenFactor = 2, + modelType = "ResNet" ) estimatorSettings <- @@ -25,27 +26,27 @@ estimatorSettings <- params = list(patience = 1)), earlyStopping = NULL) -modelType <- "ResNet" -estimator <- createEstimator(modelType = modelType, - modelParameters = modelParameters, +estimator <- createEstimator(modelParameters = modelParameters, estimatorSettings = estimatorSettings) test_that("Estimator initialization works", { # count parameters in both instances path <- system.file("python", package = "DeepPatientLevelPrediction") - resNet <- reticulate::import_from_path(modelType, path = path)[[modelType]] + resNet <- + reticulate::import_from_path(modelParameters$modelType, + path = path)[[modelParameters$modelType]] - testthat::expect_equal( + testthat::expect_equal( sum(reticulate::iterate(estimator$model$parameters(), function(x) x$numel())), - sum(reticulate::iterate(do.call(resNet, modelParameters)$parameters(), + sum(reticulate::iterate(do.call(resNet, camelCaseToSnakeCaseNames(modelParameters))$parameters(), function(x) x$numel())) ) testthat::expect_equal( estimator$model_parameters, - modelParameters + camelCaseToSnakeCaseNames(modelParameters) ) }) @@ -114,9 +115,7 @@ test_that("estimator fitting works", { epochs = 5, device = "cpu", metric = "loss") - - estimator <- createEstimator(modelType = modelType, - modelParameters = modelParameters, + estimator <- createEstimator(modelParameters = modelParameters, estimatorSettings = estimatorSettings) sink(nullfile()) @@ -217,9 +216,7 @@ test_that("Estimator without earlyStopping works", { epochs = 1, device = "cpu", earlyStopping = NULL) - - estimator2 <- createEstimator(modelType = modelType, - modelParameters = modelParameters, + estimator2 <- createEstimator(modelParameters = modelParameters, 
estimatorSettings = estimatorSettings) sink(nullfile()) estimator2$fit(smallDataset, smallDataset) @@ -243,8 +240,7 @@ test_that("Early stopper can use loss and stops early", { metric = "loss", seed = 42) - estimator <- createEstimator(modelType = modelType, - modelParameters = modelParameters, + estimator <- createEstimator(modelParameters = modelParameters, estimatorSettings = estimatorSettings) sink(nullfile()) estimator$fit(smallDataset, smallDataset) @@ -271,8 +267,7 @@ test_that("Custom metric in estimator works", { metric = list(fun = metricFun, name = "auprc", mode = "max")) - estimator <- createEstimator(modelType = modelType, - modelParameters = modelParameters, + estimator <- createEstimator(modelParameters = modelParameters, estimatorSettings = estimatorSettings) expect_true(is.function(estimator$metric$fun)) expect_true(is.character(estimator$metric$name)) @@ -338,9 +333,8 @@ test_that("device as a function argument works", { model <- setDefaultResNet(estimatorSettings = estimatorSettings) model$param[[1]]$catFeatures <- 10 - - estimator <- createEstimator(modelType = modelType, - modelParameters = model$param[[1]], + model$param[[1]]$modelType <- "ResNet" + estimator <- createEstimator(modelParameters = model$param[[1]], estimatorSettings = estimatorSettings) expect_equal(estimator$device, "cpu") @@ -352,9 +346,9 @@ test_that("device as a function argument works", { model <- setDefaultResNet(estimatorSettings = estimatorSettings) model$param[[1]]$catFeatures <- 10 + model$param[[1]]$modelType <- "ResNet" - estimator <- createEstimator(modelType = modelType, - modelParameters = model$param[[1]], + estimator <- createEstimator(modelParameters = model$param[[1]], estimatorSettings = estimatorSettings) expect_equal(estimator$device, "meta") @@ -385,9 +379,52 @@ test_that("evaluation works on predictDeepEstimator output", { cohort = trainData$Test$labels) prediction$evaluationType <- 'Validation' - evaluation <- evaluatePlp(prediction, "evaluationType") + evaluation <- + PatientLevelPrediction::evaluatePlp(prediction, "evaluationType") expect_length(evaluation, 5) expect_s3_class(evaluation, "plpEvaluation") - }) \ No newline at end of file + }) + + +test_that("accumulationSteps as a function argument works", { + getSteps <- function() { + steps <- Sys.getenv("testAccSteps") + if (steps == "") { + steps <- "1" + } else { + steps + } + } + + estimatorSettings <- setEstimator(accumulationSteps = getSteps, + learningRate = 3e-4, + batchSize = 128) + + model <- setDefaultResNet(estimatorSettings = estimatorSettings) + model$param[[1]]$catFeatures <- 10 + model$param[[1]]$modelType <- "ResNet" + estimator <- createEstimator(modelParameters = model$param[[1]], + estimatorSettings = estimatorSettings) + + expect_equal(estimator$accumulation_steps, 1) + + Sys.setenv("testAccSteps" = "4") + + estimatorSettings <- setEstimator(accumulationSteps = getSteps, + learningRate = 3e-4, + batchSize = 128) + + model <- setDefaultResNet(estimatorSettings = estimatorSettings) + model$param[[1]]$catFeatures <- 10 + model$param[[1]]$modelType <- "ResNet" + + estimator <- createEstimator(modelParameters = model$param[[1]], + estimatorSettings = estimatorSettings) + + expect_equal(estimator$accumulation_steps, 4) + + Sys.unsetenv("testAccSteps") + +}) \ No newline at end of file diff --git a/tests/testthat/test-Finetuner.R b/tests/testthat/test-Finetuner.R new file mode 100644 index 0000000..a8f7cb3 --- /dev/null +++ b/tests/testthat/test-Finetuner.R @@ -0,0 +1,53 @@ +fineTunerSettings <- setFinetuner( + 
modelPath = file.path(fitEstimatorPath, "plpModel"), + estimatorSettings = setEstimator(device = "cpu", + learningRate = 3e-4, + batchSize = 128, + epochs = 1) +) + +test_that("Finetuner settings work", { + expect_equal(fineTunerSettings$param[[1]]$modelPath, + file.path(fitEstimatorPath, "plpModel")) + expect_equal(fineTunerSettings$estimatorSettings$device, "cpu") + expect_equal(fineTunerSettings$estimatorSettings$batchSize, 128) + expect_equal(fineTunerSettings$estimatorSettings$epochs, 1) + expect_equal(fineTunerSettings$fitFunction, "fitEstimator") + expect_equal(fineTunerSettings$saveType, "file") + expect_equal(fineTunerSettings$modelType, "Finetuner") + expect_equal(fineTunerSettings$modelParamNames, "modelPath") + expect_equal(class(fineTunerSettings), "modelSettings") + expect_equal(attr(fineTunerSettings$param, "settings")$modelType, "Finetuner") + expect_error(setFinetuner(modelPath = "notAPath", estimatorSettings = setEstimator())) + expect_error(setFinetuner(modelPath = fitEstimatorPath, estimatorSettings = setEstimator())) + fakeDir <- file.path(fitEstimatorPath, "fakeDir") + fakeSubdir <- file.path(fakeDir, "model") + dir.create(fakeSubdir, recursive = TRUE) + expect_error(setFinetuner(modelPath = fakeDir, estimatorSettings = setEstimator())) + }) + +test_that("Finetuner fitEstimator works", { + fineTunerPath <- file.path(testLoc, "fineTuner") + dir.create(fineTunerPath) + fineTunerResults <- fitEstimator(trainData$Train, + modelSettings = fineTunerSettings, + analysisId = 1, + analysisPath = fineTunerPath) + expect_equal(which(fineTunerResults$covariateImportance$isNumeric), + which(fitEstimatorResults$covariateImportance$isNumeric)) + expect_equal(nrow(fineTunerResults$covariateImportance), + nrow(fitEstimatorResults$covariateImportance)) + expect_equal(fineTunerResults$covariateImportance$columnId, + fitEstimatorResults$covariateImportance$columnId) + expect_equal(fineTunerResults$covariateImportance$covariateId, + fitEstimatorResults$covariateImportance$covariateId) + + fineTunedModel <- torch$load(file.path(fineTunerResults$model, + "DeepEstimatorModel.pt")) + expect_true(fineTunedModel$estimator_settings$finetune) + expect_equal(fineTunedModel$estimator_settings$finetune_model_path, + normalizePath(file.path(fitEstimatorPath, "plpModel", "model", + "DeepEstimatorModel.pt"))) + expect_equal(fineTunedModel$model_parameters$model_type, + fitEstimatorResults$modelDesign$modelSettings$modelType) +}) diff --git a/tests/testthat/test-GradientAccumulation.R b/tests/testthat/test-GradientAccumulation.R new file mode 100644 index 0000000..c2a0b7e --- /dev/null +++ b/tests/testthat/test-GradientAccumulation.R @@ -0,0 +1,171 @@ + + +generateData <- function(observations, features, totalFeatures = 6, + numCovs = FALSE) { + rowId <- rep(1:observations, each = features) + withr::with_seed(42, { + columnId <- sample(1:totalFeatures, observations * features, replace = TRUE) + }) + covariateValue <- rep(1, observations * features) + covariates <- data.frame(rowId = rowId, columnId = columnId, covariateValue = covariateValue) + if (numCovs) { + numRow <- 1:observations + numCol <- rep(totalFeatures + 1, observations) + withr::with_seed(42, { + numVal <- runif(observations) + }) + numCovariates <- data.frame(rowId = as.integer(numRow), + columnId = as.integer(numCol), + covariateValue = numVal) + covariates <- rbind(covariates, numCovariates) + } + + labels <- as.numeric(sample(0:1, observations, replace = TRUE)) + data <- Andromeda::andromeda(covariates = covariates) + data <- 
attr(data, "dbname") + Data <- reticulate::import_from_path("Dataset", path = path)$Data + dataset <- Data(data, labels) + return(dataset) +} + +dataset <- generateData(observations = 5, features = 3, totalFeatures = 6) + +model <- reticulate::import_from_path("ResNet", path = path)$ResNet +modelParams <- list("num_layers" = 1L, + "size_hidden" = 32, + "size_embedding" = 32, + "cat_features" = dataset$get_cat_features()$max(), + "normalization" = torch$nn$LayerNorm) +Estimator <- reticulate::import_from_path("Estimator", path = path)$Estimator +estimatorSettings <- list( + learningRate = 3e-4, + weightDecay = 1e-6, + device = "cpu", + batch_size = 4, + epochs = 1, + seed = 32, + find_l_r = FALSE, + accumulation_steps = NULL, + optimizer = torch$optim$AdamW, + criterion = torch$nn$BCEWithLogitsLoss +) + +estimator <- Estimator( + model = model, + model_parameters = modelParams, + estimator_settings = estimatorSettings +) + + +test_that("Gradient accumulation provides same results as without, ", { + predsWithout <- estimator$predict_proba(dataset) + + estimatorSettings$accumulation_steps <- 2 + estimatorWith <- Estimator( + model = model, + model_parameters = modelParams, + estimator_settings = estimatorSettings + ) + predsWith <- estimatorWith$predict_proba(dataset) + + expect_equal(predsWithout, predsWith, tolerance = 1e-4) + + dataset <- generateData(observations = 5, features = 3, totalFeatures = 6, + numCovs = TRUE) + modelParams$num_features <- 1L + estimatorWith <- Estimator( + model = model, + model_parameters = modelParams, + estimator_settings = estimatorSettings + ) + estimatorSettings$accumulation_steps <- 1 + estimator <- Estimator( + model = model, + model_parameters = modelParams, + estimator_settings = estimatorSettings + ) + predsWith <- estimator$predict_proba(dataset) + predsWithout <- estimatorWith$predict_proba(dataset) + + expect_equal(predsWithout, predsWith, tolerance = 1e-4) +}) + +test_that("Loss is equal without dropout and layernorm", { + dataloader <- torch$utils$data$DataLoader(dataset, batch_size = NULL, + sampler = torch$utils$data$BatchSampler( + torch$utils$data$RandomSampler(dataset), + batch_size = 4L, + drop_last = FALSE + )) + loss <- estimator$fit_epoch(dataloader) + + estimatorSettings$accumulation_steps <- 2 + estimatorWith <- Estimator( + model = model, + model_parameters = modelParams, + estimator_settings = estimatorSettings + ) + lossWith <- estimatorWith$fit_epoch(dataloader) + + expect_equal(loss, lossWith, tolerance = 1e-2) + +}) + +test_that("LR finder works same with and without accumulation", { + modelParams$modelType <- "ResNet" + + lrSettings <- list(minLr = 1e-8, + maxLr = 0.01, + numLr = 10L, + divergenceThreshold = 1.1) + + lr <- getLR(estimatorSettings = estimatorSettings, + modelParameters = modelParams, + dataset = dataset, + lrSettings = lrSettings) + + estimatorSettings$accumulation_steps <- 2 + + lrWith <- getLR(estimatorSettings = estimatorSettings, + modelParameters = modelParams, + dataset = dataset, + lrSettings = lrSettings) + + expect_equal(lr, lrWith, tolerance = 1e-10) +}) + +test_that("Gradient accumulation works when batch is smaller than batch_size * steps", { + dataset <- generateData(observations = 50, features = 3, totalFeatures = 6) + + model <- reticulate::import_from_path("ResNet", path = path)$ResNet + modelParams <- list("num_layers" = 1L, + "size_hidden" = 32, + "size_embedding" = 32, + "cat_features" = 6, + "normalization" = torch$nn$LayerNorm) + Estimator <- reticulate::import_from_path("Estimator", path = 
path)$Estimator + estimatorSettings <- list( + learningRate = 3e-4, + weightDecay = 1e-6, + device = "cpu", + batch_size = 20, + epochs = 1, + seed = 32, + find_l_r = FALSE, + accumulation_steps = 4, + optimizer = torch$optim$AdamW, + criterion = torch$nn$BCEWithLogitsLoss + ) + + estimator <- Estimator( + model = model, + model_parameters = modelParams, + estimator_settings = estimatorSettings + ) + + preds <- estimator$predict_proba(dataset) + + expect_equal(length(preds), 50) + expect_true(all(preds >= 0)) + expect_true(all(preds <= 1)) +}) diff --git a/tests/testthat/test-LRFinder.R b/tests/testthat/test-LRFinder.R index 5c46af8..4495612 100644 --- a/tests/testthat/test-LRFinder.R +++ b/tests/testthat/test-LRFinder.R @@ -30,28 +30,29 @@ test_that("LR scheduler that changes per batch works", { test_that("LR finder works", { - lrFinder <- - createLRFinder(modelType = "ResNet", - modelParameters = - list(cat_features = - dataset$get_cat_features()$max(), - num_features = - dataset$get_numerical_features()$len(), - size_embedding = 32L, - size_hidden = 64L, - num_layers = 1L, - hidden_factor = 1L), - estimatorSettings = setEstimator(batchSize = 32L, - seed = 42), - lrSettings = list(minLr = 3e-4, - maxLr = 10.0, - numLr = 20L, - divergenceThreshold = 1.1)) - - lr <- lrFinder$get_lr(dataset) - - expect_true(lr <= 10.0) - expect_true(lr >= 3e-4) + estimatorSettings <- setEstimator(batchSize = 32L, + seed = 42) + modelParameters <- list(cat_features = dataset$get_cat_features()$max(), + num_features = dataset$get_numerical_features()$len(), + size_embedding = 32L, + size_hidden = 64L, + num_layers = 1L, + hidden_factor = 1L, + modelType = "ResNet") + estimatorSettings <- estimatorSettings + lrSettings <- list(minLr = 1e-8, + maxLr = 0.01, + numLr = 20L, + divergenceThreshold = 1.1) + # initial LR should be the minLR + + lr <- getLR(modelParameters = modelParameters, + estimatorSettings = estimatorSettings, + lrSettings = lrSettings, + dataset = dataset) + tol <- 1e-10 + expect_lte(lr, 0.01 + tol) + expect_gte(lr, 1e-08 - tol) }) test_that("LR finder works with device specified by a function", { @@ -60,25 +61,27 @@ test_that("LR finder works with device specified by a function", { dev <- "cpu" dev } - lrFinder <- createLRFinder( - model = "ResNet", - modelParameters = + modelParameters <- list(cat_features = dataset$get_cat_features()$max(), num_features = dataset$get_numerical_features()$len(), size_embedding = 8L, size_hidden = 16L, num_layers = 1L, - hidden_factor = 1L), - estimatorSettings = setEstimator(batchSize = 32L, - seed = 42, - device = deviceFun), - lrSettings = list(minLr = 3e-4, - maxLr = 10.0, - numLr = 20L, - divergenceThreshold = 1.1) - ) - lr <- lrFinder$get_lr(dataset) + hidden_factor = 1L, + modelType = "ResNet") + estimatorSettings <- setEstimator(batchSize = 32L, + seed = 42, + device = deviceFun) + lrSettings <- list(minLr = 1e-6, + maxLr = 0.03, + numLr = 20L, + divergenceThreshold = 1.1) + lr <- getLR(modelParameters = modelParameters, + estimatorSettings = estimatorSettings, + lrSettings = lrSettings, + dataset = dataset) - expect_true(lr <= 10.0) - expect_true(lr >= 3e-4) + tol <- 1e-8 + expect_lte(lr, 0.03 + tol) + expect_gte(lr, 1e-6 - tol) }) diff --git a/tests/testthat/test-MLP.R b/tests/testthat/test-MultiLayerPerceptron.R similarity index 96% rename from tests/testthat/test-MLP.R rename to tests/testthat/test-MultiLayerPerceptron.R index a470664..3470eba 100644 --- a/tests/testthat/test-MLP.R +++ b/tests/testthat/test-MultiLayerPerceptron.R @@ -18,7 +18,7 @@ 
modelSettings <- setMultiLayerPerceptron( test_that("setMultiLayerPerceptron works", { testthat::expect_s3_class(object = modelSettings, class = "modelSettings") - testthat::expect_equal(modelSettings$fitFunction, "fitEstimator") + testthat::expect_equal(modelSettings$fitFunction, "DeepPatientLevelPrediction::fitEstimator") testthat::expect_true(length(modelSettings$param) > 0) @@ -86,7 +86,7 @@ test_that("MLP with runPlp working checks", { test_that("MLP nn-module works ", { - mlp <- reticulate::import_from_path("MLP", path = path)$MLP + mlp <- reticulate::import_from_path("MultiLayerPerceptron", path = path)$MultiLayerPerceptron model <- mlp( cat_features = 5, num_features = 1, @@ -123,7 +123,7 @@ test_that("MLP nn-module works ", { activation = torch$nn$ReLU, normalization = torch$nn$BatchNorm1d, dropout = 0.3, - d_out = 1L + dim_out = 1L ) output <- model(input) # model works without numeric variables diff --git a/tests/testthat/test-ResNet.R b/tests/testthat/test-ResNet.R index 76e3d9f..aa88973 100644 --- a/tests/testthat/test-ResNet.R +++ b/tests/testthat/test-ResNet.R @@ -18,7 +18,7 @@ resSet <- setResNet( test_that("setResNet works", { testthat::expect_s3_class(object = resSet, class = "modelSettings") - testthat::expect_equal(resSet$fitFunction, "fitEstimator") + testthat::expect_equal(resSet$fitFunction, "DeepPatientLevelPrediction::fitEstimator") testthat::expect_true(length(resSet$param) > 0) diff --git a/tests/testthat/test-Transformer.R b/tests/testthat/test-Transformer.R index 62fdf12..8526e61 100644 --- a/tests/testthat/test-Transformer.R +++ b/tests/testthat/test-Transformer.R @@ -15,7 +15,7 @@ settings <- setTransformer( test_that("Transformer settings work", { testthat::expect_s3_class(object = settings, class = "modelSettings") - testthat::expect_equal(settings$fitFunction, "fitEstimator") + testthat::expect_equal(settings$fitFunction, "DeepPatientLevelPrediction::fitEstimator") testthat::expect_true(length(settings$param) > 0) testthat::expect_error(setTransformer( numBlocks = 1, dimToken = 50, diff --git a/vignettes/BuildingDeepModels.Rmd b/vignettes/BuildingDeepModels.Rmd index e95e052..87815ad 100644 --- a/vignettes/BuildingDeepModels.Rmd +++ b/vignettes/BuildingDeepModels.Rmd @@ -12,13 +12,6 @@ header-includes: - \fancyfoot[CO,CE]{PatientLevelPrediction Package Version `r utils::packageVersion("PatientLevelPrediction")`} - \fancyfoot[CO,CE]{DeepPatientLevelPrediction Package Version `r utils::packageVersion("DeepPatientLevelPrediction")`} output: - pdf_document: - includes: - in_header: preamble.tex - number_sections: yes - toc: yes - word_document: - toc: yes html_document: number_sections: yes toc: yes @@ -29,7 +22,7 @@ editor_options: ```{=html} ``` @@ -83,9 +76,7 @@ current implementation allows us to perform research at scale on the value and limitations of Deep Learning using observational healthcare data. -In the package we have used -[torch](https://cran.r-project.org/web/packages/torch/index.html) but we -invite the community to add other backends. +In the package we use `pytorch` through the `reticulate` package. Many network architectures have recently been proposed and we have implemented a number of them, however, this list will grow in the near @@ -98,7 +89,7 @@ examples below. Note that training Deep Learning models is computationally intensive, our implementation therefore supports both GPU and CPU. A GPU -is highly recommended for Deep Learning! +is highly recommended and necessary for most Deep Learning models!
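Since the backend is PyTorch, whether a GPU is actually usable can be checked from the Python side before choosing the `device` setting. The snippet below is a minimal sketch using standard PyTorch calls only; it is not part of the package's API, and in an R session the same calls are reachable through `reticulate` with `$` syntax.

```python
import torch

def choose_device() -> str:
    # minimal sketch: prefer the GPU when one is visible, otherwise fall back to the CPU
    return "cuda" if torch.cuda.is_available() else "cpu"

print(choose_device())
```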
## Requirements @@ -110,7 +101,7 @@ installing the package can be found The `DeepPatientLevelPrediction` package provides additional model settings that can be used within the `PatientLevelPrediction` package -`runPlp()` function. To use both packages you first need to pick the +`runPlp()` and `runMultiplePlp()` functions. To use both packages you first need to pick the deep learning architecture you wish to fit (see below) and then you specify this as the modelSettings inside `runPlp()`. @@ -120,7 +111,7 @@ specify this as the modelSettings inside `runPlp()`. plpData <- PatientLevelPrediction::loadPlpData('locationOfData') # pick the set from DeepPatientLevelPrediction -deepLearningModel <- DeepPatientLevelPrediction::setResNet() +deepLearningModel <- DeepPatientLevelPrediction::setDefaultResNet() # use PatientLevelPrediction to fit model deepLearningResult <- PatientLevelPrediction::runPlp( @@ -138,7 +129,7 @@ deepLearningResult <- PatientLevelPrediction::runPlp( We implemented the following non-temporal (2D data matrix) architectures: -## Simple MLP +## Simple MultiLayerPerceptron ### Overall concept @@ -147,7 +138,7 @@ input layer, one or more hidden layers and an output layer. The model takes in the input feature values and feeds these forward through the graph to determine the output class. A process known as 'backpropagation' is used to train the model. Backpropagation requires -labelled data and involves automatically calculating the derivative of +some ground truth and involves automatically calculating the derivative of the model parameters with respect to the error between the model's predictions and ground truth. Then the model learns how to adjust the model's parameters to reduce the error. @@ -170,8 +161,8 @@ value of `0.2` means that 20% of the layer's inputs will be set to 0. This is used to reduce overfitting. The `sizeEmbedding` input specifies the size of the embedding used. The first -layer is an embedding layer which converts each sparse feature to a dense vector -which it learns. An embedding is a lower dimensional projection of the features +layer is an embedding layer which converts each sparse feature to a dense learned +vector. An embedding is a lower dimensional projection of the features where distance between points is a measure of similarity. The `weightDecay` input corresponds to the weight decay in the objective @@ -269,11 +260,11 @@ mlpResult <- PatientLevelPrediction::runPlp( ### Overall concept Deep learning models are often trained via a process known as gradient -descent during backpropogation. During this process the network weights +descent. During this process the network weights are updated based on the gradient of the error function for the current weights. However, as the number of layers in the network increases, there -is a greater chance of experiencing an issue known as the vanishing or -exploding gradient during this process. The vanishing or exploding +is a greater chance of experiencing an issue known as vanishing or +exploding gradients. The vanishing or exploding gradient is when the gradient goes to 0 or infinity, which negatively impacts the model fitting. @@ -284,7 +275,7 @@ non-adjacent layers, termed a 'skip connection'. The ResNet calculates embeddings for every feature and then averages them to compute an embedding per patient. -This implementation of a ResNet for tabular data is based on [this +Our implementation of a ResNet for tabular data is based on [this paper](https://arxiv.org/abs/2106.11959).
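To make the skip connection concrete, the sketch below shows a schematic pre-norm residual block in plain PyTorch. It illustrates the general pattern described above and in the referenced paper rather than the package's exact `ResLayer`; the layer size and dropout value are arbitrary.

```python
import torch
from torch import nn

class ResidualBlock(nn.Module):
    # schematic residual block: normalize, transform, then add the input back
    def __init__(self, size_hidden: int, dropout: float = 0.1):
        super().__init__()
        self.norm = nn.LayerNorm(size_hidden)
        self.linear = nn.Linear(size_hidden, size_hidden)
        self.activation = nn.ReLU()
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # the "x +" term is the skip connection: gradients flow through it unchanged
        return x + self.dropout(self.activation(self.linear(self.norm(x))))

block = ResidualBlock(size_hidden=64)
out = block(torch.randn(8, 64))  # output shape is preserved: (8, 64)
```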
### Example @@ -302,7 +293,7 @@ function to specify the hyperparameter settings for the network. `sizeHidden`: How many neurons in each hidden layer -`hiddenFactor`: How much to increase number of neurons in each layer +`hiddenFactor`: How much to increase the number of neurons in each layer (see paper) `residualDropout` and `hiddenDropout` : How much dropout to apply in hidden layer or residual connection @@ -338,11 +329,11 @@ random samples to use For example, the following code will fit a two layer ResNet where each layer has 32 neurons which increases by a factor of two before -decreasing againg (hiddenFactor). 10% of inputs to each layer and -residual connection within the layer are randomly zeroed. The embedding -layer has 32 neurons. Learning rate of 3e-4 with weight decay of 1e-6 is -used for the optimizer. No hyperparameter search is done since each -input only includes one option. +decreasing again (hiddenFactor). 10% of inputs to each layer and +residual connection within the layer are randomly zeroed during training but +not during testing. The embedding layer has 32 neurons. A learning rate of 3e-4 with +weight decay of 1e-6 is used for the optimizer. No hyperparameter search is done +since each input only includes one option. ```{r, eval=FALSE} @@ -422,14 +413,15 @@ ResNet. `numBlocks` : How many Transformer blocks to use, each block includes a self-attention layer and a feedforward block with two linear layers. -`dimToken` : Dimension of the embedding for each feature's embedding +`dimToken` : Dimension of the embedding for each feature. `dimOut` : Dimension of output, for binary problems this is 1. -`numHeads` : Number of attention heads for the self-attention +`numHeads` : Number of attention heads for the self-attention, `dimToken` needs to be divisible by `numHeads`. -`attDropout` , `ffnDropout` and `resDropout` : How much dropout to apply -on attentions, in feedforward block or in residual connections +`attDropout`, `ffnDropout` and `resDropout` : How much dropout to apply +on the attention, the feedforward block or the residual connections `dimHidden` : How many neurons in linear layers inside the feedforward block diff --git a/vignettes/FirstModel.Rmd b/vignettes/FirstModel.Rmd index 789ec26..b162220 100644 --- a/vignettes/FirstModel.Rmd +++ b/vignettes/FirstModel.Rmd @@ -12,13 +12,6 @@ header-includes: - \renewcommand{\headrulewidth}{0.4pt} - \renewcommand{\footrulewidth}{0.4pt} output: - pdf_document: - includes: - in_header: preamble.tex - number_sections: yes - toc: yes - word_document: - toc: yes html_document: number_sections: yes toc: yes @@ -26,7 +19,7 @@ output: ```{=html} ``` @@ -73,7 +66,7 @@ covariateSettings <- FeatureExtraction::createCovariateSettings( ) ``` -This means we are extracting gender as a binary variable, age as a continuous variable and conditions occurring in the long term window, which is by default 365 days prior. +This means we are extracting gender as a binary variable, age as a continuous variable and conditions occurring in the long term window, which is by default 365 days prior to index. If you want to know more about these terms we recommend checking out the [book of OHDSI](https://ohdsi.github.io/TheBookOfOhdsi/). Next we need to define our database details, which defines from which database we are getting which cohorts. Since we don't have a database we are using Eunomia. @@ -126,11 +119,11 @@ modelSettings <- setDefaultResNet( ``` -We still need to define a few parameters. Device defines on which device to train the model.
Usually deep learning models are slow to train so they need a GPU. However this example is small enough that we can use a CPU If you have access to a GPU you can try changing the device to `'cuda'` and see how much faster it goes. +We still need to define a few parameters. Device defines on which device to train the model. Usually deep learning models are slow to train so they need a GPU. However this example is small enough that we can use a CPU. If you have access to a GPU you can try changing the device to `'cuda'` and see how much faster it goes. We also need to define our batch size. Usually in deep learning the model sees only a small chunk of the data at a time, in this case 256 patients. After that the model is updated before seeing the next batch. The batch order is random. This is called stochastic gradient descent. -Finally we define our epochs. This is how long we will train the model. One epoch means the model has seen all the data once. +Finally we define our epochs. This is how long we will train the model. One epoch means the model has seen all the data once. In this case we will train the model for 3 epochs. Now all that is left is using the PLP to train our first deep learning model. If you have used the PLP this should look familiar to you. @@ -143,6 +136,7 @@ plpResults <- PatientLevelPrediction::runPlp(plpData = plpData, populationSettings = populationSettings ) ``` + On my computer this takes about 20 seconds per epoch. While you probably won't see any kind of good performance using this model and this data, at least the training loss should be decreasing in the printed output. Congratulations you have just developed your first deep learning model! diff --git a/vignettes/Installing.Rmd b/vignettes/Installing.Rmd index 1aded4d..ed2e897 100644 --- a/vignettes/Installing.Rmd +++ b/vignettes/Installing.Rmd @@ -12,13 +12,6 @@ header-includes: - \renewcommand{\headrulewidth}{0.4pt} - \renewcommand{\footrulewidth}{0.4pt} output: - pdf_document: - includes: - in_header: preamble.tex - number_sections: yes - toc: yes - word_document: - toc: yes html_document: number_sections: yes toc: yes @@ -26,7 +19,7 @@ output: ```{=html} ``` @@ -106,7 +99,7 @@ This should install the required python packages. If that doesn't happen it can ``` library(DeepPatientLevelPrediction) -torch$trandn(10L) +torch$randn(10L) ``` This should print out a tensor with ten different values. diff --git a/vignettes/TransferLearning.Rmd b/vignettes/TransferLearning.Rmd new file mode 100644 index 0000000..7fddb11 --- /dev/null +++ b/vignettes/TransferLearning.Rmd @@ -0,0 +1,193 @@ +--- +title: "How to use DeepPatientLevelPrediction for Transfer Learning" +author: "Egill Fridgeirsson" +date: '`r Sys.Date()`' +header-includes: + - \usepackage{fancyhdr} + - \pagestyle{fancy} + - \fancyhead{} + - \fancyfoot[CO,CE]{PatientLevelPrediction Package Version `r utils::packageVersion("PatientLevelPrediction")`} + - \fancyfoot[CO,CE]{DeepPatientLevelPrediction Package Version `r utils::packageVersion("DeepPatientLevelPrediction")`} + - \fancyfoot[LE,RO]{\thepage} + - \renewcommand{\headrulewidth}{0.4pt} + - \renewcommand{\footrulewidth}{0.4pt} +output: + html_document: + number_sections: yes + toc: yes +--- + +```{=html} + +``` +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +``` + +# Introduction + +This vignette describes how to use the DeepPatientLevelPrediction package for transfer learning. 
Transfer learning is a machine learning technique where a model trained on one task is used as a starting point for training a model on a different task. This can be useful when you have a small dataset for the new task, but a large dataset for a related task. In this vignette, we will show how to use the DeepPatientLevelPrediction package to perform transfer learning on a patient-level prediction task. + +# Training initial model + +The first step in transfer learning is to train an initial model. In this example, we will train a model to predict the risk of a patient developing a certain condition based on their electronic health record data. We will use the `Eunomia` package to access a dataset to train the model. The following code shows how to train the initial model: + +```{r, message=FALSE, eval=FALSE} +library(DeepPatientLevelPrediction) + +# Get connection details for the Eunomia dataset and create the cohorts +connectionDetails <- Eunomia::getEunomiaConnectionDetails() +Eunomia::createCohorts(connectionDetails) +``` + +The default Eunomia package includes four cohorts: gastrointestinal bleeding (`GiBleed`) and use of three different drugs, `diclofenac`, `NSAIDS` and `celecoxib`. Usually we would then use one of the three drug cohorts as our target cohort and predict the risk of gastrointestinal bleeding. The `cohort_definition_ids` of these are: `celecoxib: 1`, `diclofenac: 2`, `GiBleed: 3` and `NSAIDS: 4`. + +After creating the cohorts we can see that the `NSAIDS` cohort has the most patients, so we will use it as the target cohort for the initial model. The `diclofenac` cohort has the fewest patients (excluding `GiBleed`), so we will use it as the target cohort for the transfer learning model. + +```{r, message=FALSE, eval=FALSE} +# create some simple covariate settings using Sex, Age and Long-term conditions and drug use in the last year. +covariateSettings <- FeatureExtraction::createCovariateSettings( + useDemographicsGender = TRUE, + useDemographicsAge = TRUE, + useConditionOccurrenceLongTerm = TRUE, + useDrugEraLongTerm = TRUE, + endDays = 0 +) + +# Information about the database. In the Eunomia sqlite database there is only one schema, main, and the cohorts are in a table named `cohort`, which is the default. +databaseDetails <- PatientLevelPrediction::createDatabaseDetails( + connectionDetails = connectionDetails, + cdmDatabaseId = "2", # Eunomia version used + cdmDatabaseSchema = "main", + targetId = 4, + outcomeIds = 3, + cdmDatabaseName = "eunomia" +) + +# Let's now extract the plpData object from the database +plpData <- PatientLevelPrediction::getPlpData( + databaseDetails = databaseDetails, + covariateSettings = covariateSettings, + restrictPlpDataSettings = PatientLevelPrediction::createRestrictPlpDataSettings() +) + +``` + +Now we can set up our initial model development. We will use a simple `ResNet`.
+ +```{r, message=FALSE, eval=FALSE} +modelSettings <- setResNet(numLayers = c(2), +                        sizeHidden = 128, +                        hiddenFactor = 1, +                        residualDropout = 0.1, +                        hiddenDropout = 0.1, +                        sizeEmbedding = 128, +                        estimatorSettings = setEstimator( +                          learningRate = 3e-4, +                          weightDecay = 0, +                          device = "cpu", # use cuda here if you have a gpu +                          batchSize = 256, +                          epochs = 5, +                          seed = 42 +                        ), +                        hyperParamSearch = "random", +                        randomSample = 1) + +plpResults <- PatientLevelPrediction::runPlp( +  plpData = plpData, +  outcomeId = 3, # 3 is the id of GiBleed +  modelSettings = modelSettings, +  analysisName = "Nsaids_GiBleed", +  analysisId = "1", +  # Let's predict the risk of GiBleed in the year following start of NSAIDs use +  populationSettings = PatientLevelPrediction::createStudyPopulationSettings( +    requireTimeAtRisk = FALSE, +    firstExposureOnly = TRUE, +    riskWindowStart = 1, +    riskWindowEnd = 365 +  ), +  splitSettings = PatientLevelPrediction::createDefaultSplitSetting(splitSeed = 42), +  saveDirectory = "./output" # save in a folder in the current directory +) + +``` + +This should take a few minutes on a CPU. Now that we have a model developed we can further finetune it on the `diclofenac` cohort. First we need to extract the data for that cohort. + +```{r, message=FALSE, eval=FALSE} +databaseDetails <- PatientLevelPrediction::createDatabaseDetails( +  connectionDetails = connectionDetails, +  cdmDatabaseId = "2", # Eunomia version used +  cdmDatabaseSchema = "main", +  targetId = 2, # diclofenac cohort +  outcomeIds = 3, +  cdmDatabaseName = "eunomia" +) + +plpDataTransfer <- PatientLevelPrediction::getPlpData( +  databaseDetails = databaseDetails, +  covariateSettings = covariateSettings, # same as for the developed model +  restrictPlpDataSettings = PatientLevelPrediction::createRestrictPlpDataSettings() +) + +``` + +Now we can set up our transfer learning model development. For this we need to use a different modelSettings function: `setFinetuner`. We also need to know the path to the previously developed model. This should be of the form `outputDir/analysisId/plpResult/model` where outputDir is the directory specified when we developed our model and analysisId is the id we gave the analysis. In this case the id is `1` and the path to the model is: `./output/1/plpResult/model`. + +```{r, message=FALSE, eval=FALSE} +modelSettingsTransfer <- setFinetuner(modelPath = './output/1/plpResult/model', +                                      estimatorSettings = setEstimator( +                                        learningRate = 3e-4, +                                        weightDecay = 0, +                                        device = "cpu", # use cuda here if you have a gpu +                                        batchSize = 256, +                                        epochs = 5, +                                        seed = 42 +                                      )) + +``` + +Currently the basic transfer learning works by loading the previously trained model and resetting its last layer, often called the prediction head. Then it will train only the parameters in this last layer. The hope is that the other layers have learned some generalizable representations of our data and that by modifying the last layer we can mix those representations to suit the new task.
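This mirrors the finetuning branch added to `Estimator.py` earlier in this change: the saved checkpoint is loaded, the trained parameters are frozen, and the prediction head is re-initialised so that only the new head receives gradient updates. The snippet below is a simplified sketch of that idea in plain PyTorch; `build_finetune_model` is an illustrative helper and not a function in the package, while the checkpoint keys and `reset_head()` follow the `Estimator.py` and model hunks shown above.

```python
import torch

def build_finetune_model(model_class, checkpoint_path):
    # simplified sketch of the finetuning setup (compare the Estimator.py hunk above);
    # model_class is e.g. the ResNet class, checkpoint_path points to a saved DeepEstimatorModel.pt
    checkpoint = torch.load(checkpoint_path, map_location="cpu")
    model = model_class(**checkpoint["model_parameters"])  # rebuild the original architecture
    model.load_state_dict(checkpoint["model_state_dict"])  # restore the trained weights
    for param in model.parameters():
        param.requires_grad = False                        # freeze the pretrained layers
    model.reset_head()                                     # fresh, trainable prediction head
    trainable = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.AdamW(trainable, lr=3e-4)      # only the new head is updated
    return model, optimizer
```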
+ +```{r, message=FALSE, eval=FALSE} +plpResultsTransfer <- PatientLevelPrediction::runPlp( +  plpData = plpDataTransfer, +  outcomeId = 3, +  modelSettings = modelSettingsTransfer, +  analysisName = "Diclofenac_GiBleed", +  analysisId = "2", +  populationSettings = PatientLevelPrediction::createStudyPopulationSettings( +    requireTimeAtRisk = FALSE, +    firstExposureOnly = TRUE, +    riskWindowStart = 1, +    riskWindowEnd = 365 +  ), +  splitSettings = PatientLevelPrediction::createDefaultSplitSetting(splitSeed = 42), +  saveDirectory = "./outputTransfer" # save in a folder in the current directory +) +``` + +This should be much faster since it is only training the last layer. The results will not be good here, but this is a toy example on synthetic data; the process on large observational data is exactly the same. + +# Conclusion +Now you have finetuned a model on a new cohort using transfer learning. This can be useful when you have a small dataset for the new task, but a large dataset for a related task or from a different database. The DeepPatientLevelPrediction package makes it easy to perform transfer learning on patient-level prediction tasks. + +# Acknowledgments + +Considerable work has been dedicated to providing the +`DeepPatientLevelPrediction` package. + +```{r tidy=TRUE,eval=TRUE} +citation("DeepPatientLevelPrediction") +``` + +**Please reference this paper if you use the PLP Package in your work:** + +[Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek PR. Design and +implementation of a standardized framework to generate and evaluate +patient-level prediction models using observational healthcare data. J +Am Med Inform Assoc. +2018;25(8):969-975.](http://dx.doi.org/10.1093/jamia/ocy032)