Skip to content

Commit

Permalink
v2.1 (#118)
Browse files Browse the repository at this point in the history
- Add transfer learning
- Add gradient accumulation
- Build docker container on release action
- GPU memory cleanup on memory errors
  • Loading branch information
egillax authored Jul 9, 2024
1 parent 03fdb3d commit bb7c453
Show file tree
Hide file tree
Showing 47 changed files with 1,793 additions and 594 deletions.
6 changes: 5 additions & 1 deletion .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,8 @@
^extras$
^deploy.sh$
^compare_versions$
^.mypy_cache$
^.mypy_cache$
^inst/python/__pycache__
^.*\.pt$
^doc$
^Meta$
28 changes: 14 additions & 14 deletions .github/workflows/R_CDM_check_hades.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -67,22 +67,22 @@ jobs:
while read -r cmd
do
eval sudo $cmd
done < <(Rscript -e 'writeLines(remotes::system_requirements("ubuntu", "20.04"))')
done < <(Rscript -e 'writeLines(remotes::system_requirements("ubuntu", "22.04"))')
- uses: r-lib/actions/setup-r-dependencies@v2
with:
cache: always
extra-packages: any::rcmdcheck
needs: check

- name: setup r-reticulate venv
shell: Rscript {0}
run: |
python_packages <-
c("polars", "tqdm", "connectorx", "pyarrow", "scikit-learn")
c("polars", "tqdm", "connectorx", "pyarrow", "pynvml", "numpy==1.26.4")
library(reticulate)
virtualenv_create("r-reticulate", Sys.which("python"))
virtualenv_install("r-reticulate", python_packages)
virtualenv_create("r-reticulate", Sys.which("python"), packages=python_packages)
virtualenv_install("r-reticulate", "torch", pip_options = c("--index-url https://download.pytorch.org/whl/cpu"))
path_to_python <- virtualenv_python("r-reticulate")
Expand All @@ -95,24 +95,24 @@ jobs:
error-on: '"warning"'
check-dir: '"check"'

- name: Upload source package
if: success() && runner.os == 'macOS' && github.event_name != 'pull_request' && github.ref == 'refs/heads/main'
uses: actions/upload-artifact@v2
with:
name: package_tarball
path: check/*.tar.gz

- name: Install covr
if: runner.os == 'Windows'
if: runner.os == 'ubuntu-22.04'
run: |
remotes::install_cran("covr")
shell: Rscript {0}

- name: Test coverage
if: runner.os == 'Windows'
run: covr::codecov()
if: runner.os == 'ubuntu-22.04'
run: covr::codecov(token = "${{ secrets.CODECOV_TOKEN }}")
shell: Rscript {0}

- name: Upload source package
if: success() && runner.os == 'macOS' && github.event_name != 'pull_request' && github.ref == 'refs/heads/main'
uses: actions/upload-artifact@v2
with:
name: package_tarball
path: check/*.tar.gz

Release:
needs: R-CMD-Check

Expand Down
84 changes: 84 additions & 0 deletions .github/workflows/release_docker.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
# When a new release is published (or main is pushed),
# build the image and upload it to Dockerhub.
#
# Requires the following repository secrets:
# - DOCKER_HUB_USERNAME
# - DOCKER_HUB_ACCESS_TOKEN
# - GH_TOKEN - GitHub token passed to the image build for downloading
#   GitHub-hosted dependencies. Needed to avoid "API rate limit exceeded"
#   from github.
# The image name is set in the DOCKER_IMAGE env var below; edit it per fork.
name: Release Docker

on:
  push:
    branches:
      - 'main'
    tags:
      - 'v*'
  workflow_dispatch:

jobs:
  docker:
    runs-on: ubuntu-latest
    env:
      DOCKER_IMAGE: 'ohdsi/deep_plp'
    steps:
      - uses: actions/checkout@v4

      # ------------------------------------
      # The pattern for the following steps is specified
      # in OHDSI/WebAPI.

      # Add Docker labels and tags
      - name: Docker meta
        id: docker_meta
        uses: docker/metadata-action@v5
        with:
          images: ${{ env.DOCKER_IMAGE }}
          tags: |
            type=semver,pattern={{version}}

      # Setup docker build environment
      - name: Set up QEMU
        uses: docker/setup-qemu-action@v3

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Login to DockerHub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKER_HUB_USERNAME }}
          password: ${{ secrets.DOCKER_HUB_ACCESS_TOKEN }}

      - name: Set build parameters
        id: build_params
        run: |
          echo "SHA8=${GITHUB_SHA::8}" >> $GITHUB_ENV

      - name: Build and push
        id: docker_build
        uses: docker/build-push-action@v6
        with:
          context: ./
          cache-from: type=gha
          # NOTE: no space after the comma — buildx parses this value as CSV
          # without trimming, so " mode" would be rejected as an unknown key.
          cache-to: type=gha,mode=max
          file: Dockerfile
          platforms: linux/amd64
          push: true
          secrets: |
            build_github_pat=${{ secrets.GH_TOKEN }}
          build-args: |
            GIT_BRANCH=${{ steps.docker_meta.outputs.version }}
            GIT_COMMIT_ID_ABBREV=${{ env.SHA8 }}
          tags: ${{ steps.docker_meta.outputs.tags }}
          # Use runtime labels from docker_meta as well as fixed labels
          labels: |
            ${{ steps.docker_meta.outputs.labels }}
            maintainer=Egill A. Fridgeirsson <[email protected]>
            org.opencontainers.image.authors=Egill A. Fridgeirsson <[email protected]>, Henrik John <[email protected]>
            org.opencontainers.image.vendor=OHDSI
            org.opencontainers.image.licenses=Apache-2.0

      - name: Inspect image
        run: |
          docker pull ${{ env.DOCKER_IMAGE }}:${{ steps.docker_meta.outputs.version }}
          docker image inspect ${{ env.DOCKER_IMAGE }}:${{ steps.docker_meta.outputs.version }}
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,6 @@ extras/
.Renviron
inst/python/__pycache__
.mypy_cache
/doc/
/Meta/
*.pt
18 changes: 8 additions & 10 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
Package: DeepPatientLevelPrediction
Type: Package
Title: Deep Learning For Patient Level Prediction Using Data In The OMOP Common Data Model
Version: 2.0.3
Date: 22-12-2023
Version: 2.1.0
Date: 08-07-2024
Authors@R: c(
person("Egill", "Fridgeirsson", email = "[email protected]", role = c("aut", "cre")),
person("Jenna", "Reps", email = "[email protected]", role = c("aut")),
Expand All @@ -20,38 +20,36 @@ Depends:
R (>= 4.0.0)
Imports:
dplyr,
FeatureExtraction (>= 3.0.0),
ParallelLogger (>= 2.0.0),
PatientLevelPrediction (>= 6.3.2),
rlang,
withr,
reticulate (>= 1.31)
Suggests:
devtools,
Eunomia,
knitr,
markdown,
plyr,
rmarkdown,
testthat,
PRROC,
FeatureExtraction (>= 3.0.0),
ResultModelManager (>= 0.2.0),
DatabaseConnector (>= 6.0.0),
Andromeda
Remotes:
ohdsi/PatientLevelPrediction,
ohdsi/FeatureExtraction,
ohdsi/Eunomia,
ohdsi/ResultModelManager
RoxygenNote: 7.2.3
RoxygenNote: 7.3.1
Encoding: UTF-8
Config/testthat/edition: 3
Config/testthat/parallel: TRUE
Config/reticulate:
list(
packages = list(
list(package = "torch"),
list(package = "polars"),
list(package = "tqdm"),
list(package = "connectorx"),
list(package = "pyarrow")
list(package = "pyarrow"),
list(package = "pynvml")
)
)
64 changes: 64 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# ---- Build stage: install R/Python dependencies and the package ----
FROM docker.io/rocker/r-ver:4.4.1 AS build

ARG GIT_BRANCH='main'
ARG GIT_COMMIT_ID_ABBREV

# System dependencies: Java for DatabaseConnector, Python toolchain for reticulate
RUN apt-get -y update && apt-get install -y \
    default-jre \
    default-jdk \
    libssl-dev \
    python3-pip \
    python3-dev \
    --no-install-recommends \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*
RUN R CMD javareconf

# The GitHub PAT must be exported inside each RUN that hits the GitHub API:
# an `export` in a RUN of its own does not persist to later layers, so a
# standalone export line would leave GITHUB_PAT unset where it is needed.
# Mounting the secret per-RUN also keeps the token out of the image layers.
RUN --mount=type=secret,id=build_github_pat \
    export GITHUB_PAT=$(cat /run/secrets/build_github_pat) \
    && install2.r -n -1 \
        remotes \
        CirceR \
        Eunomia \
        duckdb \
    && installGithub.r \
        OHDSI/CohortGenerator \
        OHDSI/ROhdsiWebApi \
        OHDSI/ResultModelManager

RUN Rscript -e "DatabaseConnector::downloadJdbcDrivers(dbms='all', pathToDriver='/database_drivers/')"
ENV DATABASECONNECTOR_JAR_FOLDER=/database_drivers/

# install Python packages (uv for faster resolution/installation)
RUN pip3 install uv \
    && uv pip install --system --no-cache-dir \
        connectorx \
        polars \
        pyarrow \
        torch \
        tqdm \
        pynvml \
    && rm -rf /root/.cache/pip

# Install the package itself; prefer the exact commit when provided, else the branch
RUN --mount=type=secret,id=build_github_pat \
    export GITHUB_PAT=$(cat /run/secrets/build_github_pat) \
    && Rscript -e "ref <- Sys.getenv('GIT_COMMIT_ID_ABBREV', unset = Sys.getenv('GIT_BRANCH')); remotes::install_github('ohdsi/DeepPatientLevelPrediction', ref=ref)"


# ---- Runtime stage ----
FROM docker.io/rocker/rstudio:4.4.1
#
COPY --from=build /usr/local/lib/python3.10/dist-packages /usr/local/lib/python3.10/dist-packages
COPY --from=build /database_drivers /database_drivers
COPY --from=build /usr/local/lib/R/site-library /usr/local/lib/R/site-library
COPY --from=build /usr/local/lib/R/library /usr/local/lib/R/library

ENV RETICULATE_PYTHON=/usr/bin/python3
# runtime dependencies
RUN apt-get -y update && apt-get install -y \
    default-jre \
    default-jdk \
    libssl3 \
    python3-dev \
    --no-install-recommends \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/* \
    && R CMD javareconf

2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,11 @@ export(predictDeepEstimator)
export(setDefaultResNet)
export(setDefaultTransformer)
export(setEstimator)
export(setFinetuner)
export(setMultiLayerPerceptron)
export(setResNet)
export(setTransformer)
export(torch)
export(trainingCache)
importFrom(dplyr,"%>%")
importFrom(reticulate,py_to_r)
Expand Down
12 changes: 12 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,15 @@
DeepPatientLevelPrediction 2.1.0
======================
- Added basic transfer learning functionality. See vignette("TransferLearning")
- Added a GPU memory cleaner that frees cached memory after an out-of-memory error
- The python module torch is now accessed through an exported function instead of loading the module at package load
- Added gradient accumulation. Studies running at different sites on different hardware can now use the same effective batch size by accumulating gradients.
- Refactored out the cross validation from the hyperparameter tuning
- Remove predictions from non-optimal hyperparameter combinations to save space
- Only use html vignettes
- Rename MLP to MultiLayerPerceptron


DeepPatientLevelPrediction 2.0.3
======================
- Hotfix: Fix count for polars v0.20.x
Expand Down
11 changes: 9 additions & 2 deletions R/Dataset.R
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,21 @@ createDataset <- function(data, labels, plpModel = NULL) {
# sqlite object
attributes(data)$path <- attributes(data)$dbname
}
if (is.null(plpModel)) {
if (is.null(plpModel) && is.null(data$numericalIndex)) {
data <- dataset(r_to_py(normalizePath(attributes(data)$path)),
r_to_py(labels$outcomeCount))
} else if (!is.null(data$numericalIndex)) {
numericalIndex <-
r_to_py(as.array(data$numericalIndex %>% dplyr::pull()))
data <- dataset(r_to_py(normalizePath(attributes(data)$path)),
r_to_py(labels$outcomeCount),
numericalIndex)
} else {
numericalFeatures <-
r_to_py(as.array(which(plpModel$covariateImportance$isNumeric)))
data <- dataset(r_to_py(normalizePath(attributes(data)$path)),
numerical_features = numericalFeatures)
numerical_features = numericalFeatures
)
}

return(data)
Expand Down
17 changes: 15 additions & 2 deletions R/DeepPatientLevelPrediction.R
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,25 @@
#' @description A package containing deep learning extensions for developing
#' prediction models using data in the OMOP CDM
#'
#' @docType package
#' @name DeepPatientLevelPrediction
#' @importFrom dplyr %>%
#' @importFrom reticulate r_to_py py_to_r
#' @importFrom rlang .data
NULL
"_PACKAGE"

# package level global state
.globals <- new.env(parent = emptyenv())

#' Pytorch module
#'
#' The `torch` module object is the equivalent of
#' `reticulate::import("torch")` and provided mainly as a convenience.
#'
#' @returns the torch Python module
#' @export
#' @usage NULL
#' @format An object of class `python.builtin.module`
torch <- NULL

.onLoad <- function(libname, pkgname) {
# use superassignment to update global reference
Expand Down
Loading

0 comments on commit bb7c453

Please sign in to comment.