Merge pull request #3 from Boehringer-Ingelheim/rc

dv.loader release v2.0.0
Boehringer-Ingelheim · Jul 8, 2024 · f724639 · f724639
2 parents c095304 + 3827f0c
commit f724639
Show file tree

Hide file tree

Showing 18 changed files with 723 additions and 170 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -7,3 +7,4 @@
 ^\.github
 ^\.lintr$
 ^NEWS\.md$
+^inst/validation/results/\.gitempty$
diff --git a/.lintr b/.lintr
@@ -0,0 +1,6 @@
+linters: linters_with_defaults(
+  line_length_linter(120),
+  object_usage_linter = NULL,
+  indentation_linter = NULL,
+  trailing_whitespace_linter = NULL
+  )
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: dv.loader
 Type: Package
 Title: Data loading module 
-Version: 1.1.1
+Version: 2.0.0
 Authors@R: c(
         person( "Boehringer-Ingelheim Pharma GmbH & Co.KG", role = c("cph", "fnd")),
         person( given = "Ming", family = "Yang", role = c("aut", "cre"), email = "[email protected]"),

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,8 @@
+# dv.loader 2.0.0
+
+- GitHub release with QC report
+- Update package documentation
+
 # dv.loader 1.1.1
 
 - General package maintenance.

diff --git a/R/dvloader.R b/R/dvloader.R
@@ -18,12 +18,12 @@ get_nfs_path <- function() {
 get_cre_path <- get_nfs_path
 
 #' Loads data into memory based on study directory and one or more file_names.
-#' @param sub_dir Study directory, which will be appended to its internal base_path.
-#' If left as NULL, it will use the working directory as the sub_dir.
+#' @param sub_dir A relative directory/folder that will be appended to a base path defined by `Sys.getenv("RXD_DATA")`.
+#' If the argument is left as NULL, the function will load data from the working directory `getwd()`.
 #' @param file_names Study file or file_names name(s) - can be a vector of strings.
 #' This is the only required argument.
 #' @param use_wd for "use working directory" - a flag used when importing local files
-#' not on NFS - default value is False
+#' not on NFS - default value is FALSE
 #' @param prefer_sas if set to TRUE, imports sas7bdat files first before looking for
 #' RDS files (the opposite of default behavior)
 #' @return a list of dataframes

diff --git a/README.md b/README.md
@@ -1,28 +1,30 @@
-# Data Loading Module
-
-[![Checks 🧩](https://github.com/Boehringer-Ingelheim/dv.loader/actions/workflows/ci.yml/badge.svg?branch=main)](https://github.com/Boehringer-Ingelheim/dv.loader/actions/workflows/ci.yml)
+# Data Loading
+
+The {dv.loader} package provides a simple interface for loading data from a network file storage folder or 
+locally. It is designed to be used with `.RDS` and `.sas7bdat` file formats.
+The package provides a simple function, `load_data()`, which loads R and SAS data files into memory. 
+Loading data from SQL databases is not yet supported. The function returns a list named by the file names passed, 
+and containing data frames, along with metadata for that table. By default, the function will look for files in a
+sub-directory `sub_dir` of the base path defined by a environment variable "RXD_DATA". You can check if the base path
+is set by running `Sys.getenv("RXD_DATA")`. A single file or multiple files can be loaded at once. 
+To make the loading process faster for large datasets, it is suggested that '.sas7bdat' files are converted to 
+'.RDS' files. The function will prefer '.RDS' files over '.sas7bdat' files by default.
 
 ## Installation
 
 ```r
-install.packages("devtools") # if you have not installed "devtools" package
-devtools::install_github("Boehringer-Ingelheim/dv.loader")
+if (!require("remotes")) install.packages("remotes")
+remotes::install_github("Boehringer-Ingelheim/dv.loader")
 ```
 
 ## Basic usage
 
-**Note**: `dv.loader` is only designed to be used with `.RDS` and `.sas7bdat` file formats.
-
 ```r
 # getting data from a network file storage folder
 dv.loader::load_data(sub_dir = "subdir1/subdir2", file_names = c("adsl", "adae"))
 ```
 
 ```r
 # getting data locally (e.g., if you have file `./data/adsl.RDS`)
-dv.loader::load_data(sub_dir = "data", file_names = c("adsl"), use_wd = T)
+dv.loader::load_data(sub_dir = "data", file_names = c("adsl"), use_wd = TRUE)
 ```
-
-## Contact
-
-If you have edits or suggestions, please open a PR.
diff --git a/_pkgdown.yml b/_pkgdown.yml
@@ -1,10 +1,16 @@
----
-url: https://boehringer-ingelheim.github.io/dv.loader
-
 template:
   bootstrap: 5
 
 navbar:
-  right:
-    - icon: fa-github
-      href: https://github.com/boehringer-ingelheim/dv.loader
+  type: inverse
+  structure:
+    left: [intro, reference, articles, tutorials, news, qc]
+  components:
+    qc:
+      text: Quality Control
+      href: articles/qc.html
+home:
+  title: dv.loader
+  links:
+  - text: Browse source code
+    href: https://github.com/Boehringer-Ingelheim/dv.loader
diff --git a/dv.loader.Rproj b/dv.loader.Rproj
@@ -15,4 +15,4 @@ LaTeX: pdfLaTeX
 BuildType: Package
 PackageUseDevtools: Yes
 PackageInstallArgs: --no-multiarch --with-keep.source
-PackageRoxygenize: rd,collate,namespace
+PackageRoxygenize: rd,collate,namespace
diff --git a/inst/validation/results/.gitempty b/inst/validation/results/.gitempty
diff --git a/inst/validation/run_validation.R b/inst/validation/run_validation.R
@@ -0,0 +1,44 @@
+pkg_name <- read.dcf("DESCRIPTION")[, "Package"]
+pkg_version <- read.dcf("DESCRIPTION")[, "Version"]
+test_results <- tibble::as_tibble(devtools::test())
+
+local({
+  # This is evaluated inside a local because, otherwise, all the variables created in the chunks of the rendered
+  # document leak into the environment
+
+  validation_root <- "./inst/validation"
+  validation_report_rmd <- file.path(validation_root, "val_report.Rmd")
+  validation_report_html <- "val_report.html"
+  validation_results <- file.path(validation_root, "results")
+  val_param_rds <- file.path(validation_results, "val_param.rds")
+
+  stopifnot(dir.exists(validation_root))
+  stopifnot(file.exists(validation_report_rmd))
+
+  stopifnot(dir.exists(validation_results))
+  unlink(list.files(validation_results))
+
+  saveRDS(
+    list(
+      package = pkg_name,
+      tests = test_results,
+      version = pkg_version
+    ),
+    val_param_rds
+  )
+
+  rmarkdown::render(
+    input = validation_report_rmd,
+    params = list(
+      package = pkg_name,
+      tests = test_results,
+      version = pkg_version
+    ),
+    output_dir = validation_results,
+    output_file = validation_report_html
+  )
+
+  # We use one of the leaked variables, created inside the validation report to asses if the validation is
+  # succesful or not
+  VALIDATION_PASSED
+})
diff --git a/inst/validation/specs.R b/inst/validation/specs.R
@@ -0,0 +1,13 @@
+# Use a list to declare the specs
+
+specs_list <- list
+
+specs <- specs_list(
+  "default_dir" = "loads data from the working directory by default",
+  "file_names" = "returns an error if file_names is missing",
+  "file_type" = "returns an error if the file type is not supported",
+  "file_extensions" = "checks for valid file extensions",
+  "data_integrity" = "ensures data is loaded correctly",
+  "meta_data" = "reads metadata from the items of file.info()",
+  "prefer_sas" = "loads a SAS or RDS file based on the prefer_sas flag"
+)
diff --git a/inst/validation/utils-validation.R b/inst/validation/utils-validation.R
@@ -0,0 +1,155 @@
+#' Setting up the validation
+
+if (!exists("package_name")) stop("package name must be in the environment when this script is sourced")
+
+#' How to link tests and specs
+
+if (FALSE) {
+  test_that(
+    vdoc[["add_spec"]]("my test description", specs$a_spec),
+    {
+      expect_true(TRUE)
+    }
+  )
+}
+#' The specs variable on the call references the one declared in specs.R
+
+#' 3. For those tests covering more than one spec.
+#' NOTE: It must be c() and not list()
+#'
+
+if (FALSE) {
+  test_that(
+    vdoc[["add_spec"]]("my test_description", c(specs$my$hier$spec, vdoc_specs$my$hier$other_spec)),
+    {
+      expect_true(TRUE)
+    }
+  )
+}
+
+#' Considerations:
+#' - parse_spec uses deparse(substitute()). These spec_ids are later used to check if all requirements
+#'  are covered or not, therefore those calls cannot by substituted for:
+
+if (FALSE) {
+  my_spec <- specs$my$hier$spec
+  test_that(vdoc[["add_spec"]]("my test_description", my_spec), {
+    ...
+  })
+
+  test_that(vdoc[["add_spec"]]("my test_description", specs[["my"]][["hier"]][["spec"]]), {
+    ...
+  })
+}
+
+# In this case the substitute captures my_spec and cannot be used later.
+# If you want to do this you must use the spec_id parameter where you pass a
+# character vector with the ids.
+# Notice that the ids in character form do no longer have the specs particle
+# at the beginning, only the pathing of the spec is needed.
+
+if (FALSE) {
+  my_spec <- specs$my$hier$spec
+  test_that(vdoc$parse_spec(my_spec, "my test_description", spec_id = c("my$hier$spec")), {
+    ...
+  })
+}
+
+# Validation code
+# nolint start cyclocomp_linter
+local({
+  specs <- source(
+    system.file("validation", "specs.R", package = package_name, mustWork = TRUE),
+    local = TRUE
+  )[["value"]]
+  recursive_ids <- function(x, parent = character(0)) {
+    if (!is.list(x)) {
+      return(parent)
+    }
+    unlist(mapply(recursive_ids,
+                  x,
+                  paste(parent, names(x),
+                        sep = if (identical(parent, character(0))) "" else "$"
+                  ),
+                  SIMPLIFY = FALSE, USE.NAMES = FALSE
+    ))
+  }
+
+  recursive_ids <- function(x, parent = character(0)) {
+    if (!is.list(x)) {
+      return(parent)
+    }
+    unlist(mapply(recursive_ids, x,
+                  paste(parent, names(x),
+                        sep = if (identical(parent, character(0))) "" else "$"
+                  ),
+                  SIMPLIFY = FALSE, USE.NAMES = FALSE
+    ))
+  }
+
+
+  spec_id_list <- recursive_ids(specs)
+
+  list(
+    specs = specs,
+    spec_id_list = spec_id_list,
+    add_spec = function(desc, spec, spec_id) {
+      if (missing(spec_id)) {
+        if (!is.character(spec) || length(spec) == 0) stop("spec must be a non-empty character vector")
+        s_spec <- substitute(spec)
+        if (s_spec[[1]] == "c") {
+          spec_id <- sapply(s_spec[2:length(s_spec)], identity)
+        } else {
+          spec_id <- list(s_spec) # Otherwise the posterior vapply iterates over the expression
+        }
+
+        spec_id_chr <- vapply(spec_id, function(x) {
+          sub("^[^$]*\\$", "", deparse(x))
+        }, FUN.VALUE = character(1))
+
+        if (!all(spec_id_chr %in% spec_id_list)) {
+          stop("At least one spec is not declared in the spec list")
+        } # This should be covered by pack of constants but just in case
+      } else {
+        spec_id_chr <- spec_id
+      }      
+      paste0(desc, "__spec_ids{", paste0(spec_id_chr, collapse = ";"), "}")
+    },
+    get_spec = function(test, specs) {
+      spec_ids <- utils::strcapture(
+        pattern = "__spec_ids\\{(.*)\\}",
+        x = test,
+        proto = list(spec = character())
+      )[["spec"]]
+
+      spec_ids <- strsplit(spec_ids, split = ";")
+
+      specs_and_id <- list()
+
+      for (idx in seq_along(spec_ids)){        
+        ids <- spec_ids[[idx]]        
+        if (all(!is.na(ids))) {
+          this_specs <- list()
+          for (sub_idx in seq_along(ids)) {
+            id <- ids[[sub_idx]]       
+            this_specs[[sub_idx]] <- eval(str2expression(paste0("specs$", id)))
+          }
+          specs_and_id[[idx]] <- list(
+            spec_id = ids,
+            spec = this_specs
+          )
+        } else {
+          specs_and_id[[idx]] <- list(
+            spec_id = NULL,
+            spec = NULL
+          )          
+        }        
+      }      
+      specs_and_id
+    }
+
+
+  )
+})
+
+# nolint end cyclocomp_linter
diff --git a/inst/validation/val_report.Rmd b/inst/validation/val_report.Rmd
@@ -0,0 +1,17 @@
+---
+title: "Quality Control"
+output:
+  html_document:
+    toc: true
+    toc_depth: 2
+    code_folding: hide
+toc-title: "----\nIndex"
+
+params:
+  package: NULL
+  tests: NULL
+  version: NULL
+---
+
+```{r, child = "val_report_child.Rmd"}
+```