Merge pull request #410 from ldecicco-USGS/response_to_review

Response to review
DOI-USGS · Oct 2, 2024 · 0096aa5 · 0096aa5
2 parents 9faef98 + 3e59c14
commit 0096aa5
Show file tree

Hide file tree

Showing 52 changed files with 643 additions and 527 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -97,7 +97,5 @@ vignettes/group.png
 vignettes/sites.png
 vignettes/thres.png
 ^docker$
-vignettes/update_data.Rmd
-
-
+^review$
 
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
@@ -1,3 +1,5 @@
+image: ${CI_REGISTRY_IMAGE}:latest
+
 workflow:
   rules:
     - if: $CI_COMMIT_TAG
@@ -29,6 +31,7 @@ variables:
 
 build-image:
   stage: build
+  cache: []
   image: ${DEVOPS_REGISTRY}usgs/docker:20
   services:
   - name: ${DEVOPS_REGISTRY}usgs/docker:20-dind
@@ -53,7 +56,7 @@ build-image:
 
 buildcheck:
   stage: check
-  image: ${CI_REGISTRY_IMAGE}:latest
+  cache: []
   dependencies:
     - build-image
   script:
@@ -66,7 +69,7 @@ buildcheck:
 
 unittests:
   stage: test
-  image: ${CI_REGISTRY_IMAGE}:latest
+  cache: []
   dependencies:
     - build-image
     - buildcheck
@@ -82,7 +85,6 @@ unittests:
 
 covertests:
   stage: test
-  image: ${CI_REGISTRY_IMAGE}:latest
   dependencies:
     - build-image
     - buildcheck
@@ -98,7 +100,6 @@ covertests:
 
 pages:
   stage: end
-  image: ${CI_REGISTRY_IMAGE}:latest
   only: 
     - main
   script:
@@ -112,6 +113,8 @@ pages:
 Validate Inventory:
     stage: end
     image: ${INTERNAL_REGISTRY}software/software-management:latest
+    rules:
+      - if:  $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
     script:
       - software-management review
         --project "${CI_PROJECT_PATH}"

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: toxEval
 Type: Package
 Title: Exploring Biological Relevance of Environmental Chemistry Observations
-Version: 1.3.2
+Version: 1.4.0
 Authors@R: c(person("Laura", "DeCicco", 
                      role = c("aut","cre"),
                      email = "[email protected]",
@@ -33,7 +33,7 @@ Copyright: This software is in the public domain because it contains materials
     official USGS copyright policy at
     https://www.usgs.gov/visual-id/credit_usgs.html#copyright
 Depends:
-    R (>= 3.5.0)
+    R (>= 4.1.0)
 Imports:
     dplyr,
     tidyr,
@@ -60,5 +60,4 @@ BugReports: https://github.com/DOI-USGS/toxEval/issues
 VignetteBuilder: knitr
 BuildVignettes: true
 LazyLoad: yes
-RoxygenNote: 7.3.1
-
+RoxygenNote: 7.3.2
diff --git a/NAMESPACE b/NAMESPACE
@@ -11,6 +11,7 @@ export(endpoint_hits)
 export(endpoint_hits_DT)
 export(explore_endpoints)
 export(filter_groups)
+export(flags)
 export(get_ACC)
 export(get_chemical_summary)
 export(get_concentration_summary)

diff --git a/NEWS b/NEWS
@@ -1,3 +1,9 @@
+toxEval 1.4.0
+===========
+* Switched to version 4.1 of the ToxCast database
+* Added a default flag to remove in remove_flags
+
+
 toxEval 1.3.1
 ===========
 * Made "Chemical" a required column in the Chemical tab. Now all plot names will key off that column instead of the names listed in tox_chemicals.

diff --git a/R/clean_endPoint_info.R b/R/clean_endPoint_info.R
@@ -1,13 +1,7 @@
 #' clean_endPoint_info
 #'
-#' Define a subset of the ToxCast database for relevance to toxEval analyses.
-#' Subsetting is done based upon methods defined by Blackwell et al., 2017 (
-#' \doi{10.1021/acs.est.7b01613}).
-#' Specifically, this function removes endPoints that are ATG sources with
-#' signal loss, and NVS with signal gain (basically: some assay/signal combinations
-#' are removed because they target non-specific endpoints). Also, this function adds additional
-#' categories to intended_target_family and intended_target_family_sub as
-#' described in the paper linked above.
+#' As of ToxCast 4.1, this function only helps clean up abbreviations
+#' found in the end_point_info data frame.
 #'
 #' @param end_point_info Data frame Endpoint information from ToxCast.
 #' @export
@@ -21,9 +15,7 @@
 #' cleaned_ep <- clean_endPoint_info(end_point_info)
 #' nrow(cleaned_ep)
 clean_endPoint_info <- function(end_point_info) {
-  end_point_info <- end_point_info[!(end_point_info$assay_source_name == "ATG" & end_point_info$signal_direction == "loss"), ]
-  end_point_info <- end_point_info[!(end_point_info$assay_source_name == "NVS" & end_point_info$signal_direction == "gain"), ]
-
+
   end_point_info$intended_target_family[end_point_info$assay_component_endpoint_name %in%
     c(
       "CLD_CYP1A1_24hr", "CLD_CYP1A1_48hr", "CLD_CYP1A1_6hr",

diff --git a/R/create_toxEval.R b/R/create_toxEval.R
@@ -234,17 +234,17 @@ summary.toxEval <- function(object, ...) {
 
   if (is.null(object[["benchmarks"]])) {
     ACC <- ToxCast_ACC %>%
-      dplyr::filter(CAS %in% unique(object$chem_info$CAS))
+      dplyr::filter(casn %in% unique(object$chem_info$CAS))
     bench_word <- "ToxCast"
   } else {
     ACC <- object[["benchmarks"]]
     bench_word <- "benchmark"
   }
 
   CAS_w_data <- ACC %>%
-    dplyr::select(CAS) %>%
+    dplyr::select(casn) %>%
     dplyr::distinct() %>%
-    dplyr::pull(CAS)
+    dplyr::pull(casn)
 
   message(length(CAS_w_data), " chemicals have ", bench_word, " information")
   message("Chemicals returned from this function do NOT have ", bench_word, " information:")

diff --git a/R/filter_endPoint_info.R b/R/filter_endPoint_info.R
@@ -4,7 +4,7 @@
 #' supplied data frame \code{\link{end_point_info}} to be used in subsequent analysis steps.
 #' First, the user specifies the ToxCast assay annotation using the 'groupCol'
 #' argument, which is a column header in 'end_point_info'. Second, the user
-#' specifies the families of assays to use. Finally, the user can choose to
+#' specifies the families of assays to exclude. Finally, the user can choose to
 #' remove specific group(s) from the category. The default is to remove
 #' 'Background Measurement' and 'Undefined'. Choices for this should be
 #' reconsidered based on individual study objectives.
@@ -17,11 +17,8 @@
 #'
 #' @param ep Data frame containing Endpoint information from ToxCast
 #' @param groupCol Character name of ToxCast annotation column to use as a group category
-#' @param assays Vector of assays to use in the data analysis. Possible values are "ACEA", "APR", "ATG",
-#' "NVS", "OT", "TOX21", "CEETOX", "LTEA", "CLD", "TANGUAY", "CCTE_PADILLA", "BSK" ,
-#' "CCTE", "STM", "ARUNA", "CCTE_SHAFER", "CPHEA_STOKER", "CCTE_GLTED", "UPITT", "UKN",
-#' "ERF", "TAMU", "IUF", "CCTE_MUNDY", "UTOR", "VALA". By default, the
-#' "BSK" (BioSeek) assay is removed.
+#' @param remove_assays Vector of assays to EXCLUDE from the data analysis.
+#' By default, the "BSK" (BioSeek) assay is removed.
 #' @param remove_groups Vector of groups within the selected 'groupCol' to remove.
 #' @export
 #' @examples
@@ -31,18 +28,11 @@
 #' head(filtered_ep)
 filter_groups <- function(ep,
                           groupCol = "intended_target_family",
-                          assays = c(
-                            "ACEA", "APR", "ATG",
-                            "NVS", "OT", "TOX21", "CEETOX",
-                            "LTEA", "CLD", "TANGUAY", "CCTE_PADILLA",
-                            "CCTE", "STM", "ARUNA", "CCTE_SHAFER",
-                            "CPHEA_STOKER", "CCTE_GLTED", "UPITT", "UKN",
-                            "ERF", "TAMU", "IUF", "CCTE_MUNDY",
-                            "UTOR", "VALA"
-                          ),
+                          remove_assays = c("BSK"),
                           remove_groups = c("Background Measurement", "Undefined")) {
+
   possible_assays <- unique(end_point_info$assay_source_name)
-  match.arg(assays, possible_assays, several.ok = TRUE)
+  match.arg(remove_assays, possible_assays, several.ok = TRUE)
 
   # Getting rid of NSE warnings:
   assay_source_name <- assay_component_endpoint_name <- ".dplyr"
@@ -54,7 +44,7 @@ filter_groups <- function(ep,
     )
   names(ep)[names(ep) == groupCol] <- "groupCol"
 
-  ep <- ep[(ep$assaysFull %in% assays), ]
+  ep <- ep[!(ep$assaysFull %in% remove_assays), ]
   ep <- ep[!is.na(ep$groupCol), ]
   if (any(!is.na(remove_groups))) {
     if (any(remove_groups != "")) {

diff --git a/R/get_ACC.R b/R/get_ACC.R
@@ -3,10 +3,8 @@
 #' The \code{get_ACC} function retrieves the activity concentration at cutoff
 #' (ACC) values for specified chemicals.
 #'
-#' The data used in toxEval were combined from files in the
-#' "INVITRODB_V3_LEVEL5" directory that were included in the October 2018
-#' release of the ToxCast database. The function \code{get_ACC} will
-#' convert the ACC values in the ToxCast database from units of (log \eqn{\mu}M)
+#' The function \code{get_ACC} will
+#' convert the ACC values in the ToxCast database from units of (\eqn{\mu}M)
 #' to units of \eqn{\mu}g/L, and reformat the data as input to toxEval.
 #'
 #' @param CAS Vector of CAS.
@@ -19,27 +17,25 @@
 #' head(ACC)
 get_ACC <- function(CAS) {
 
-  chem_list <- dplyr::select(tox_chemicals,
-    casrn = Substance_CASRN,
-    MlWt = Structure_MolWt
-  )
-  chem_list <- dplyr::filter(chem_list, casrn %in% CAS)
+  chem_list <- tox_chemicals |> 
+    dplyr::select(casrn = casn,
+                  MlWt = Structure_MolWt) |> 
+    dplyr::filter(casrn %in% CAS)
 
-  ACC <- ToxCast_ACC
-  ACC <- dplyr::filter(ACC, CAS %in% CAS)
-  ACC <- dplyr::right_join(ACC, chem_list,
-    by = c("CAS" = "casrn")
-  )
-
-  ACC <- dplyr::mutate(ACC,
-    ACC_value = 10^ACC,
-    ACC_value = ACC_value * MlWt
-  )
-  ACC <- dplyr::filter(ACC, !is.na(ACC_value))
-  ACC <- dplyr::left_join(ACC, dplyr::select(tox_chemicals,
-    CAS = Substance_CASRN,
-    chnm = Substance_Name
-  ), by = "CAS")
+  ACC <- ToxCast_ACC |> 
+    dplyr::filter(casn %in% CAS) |> 
+    dplyr::right_join(chem_list, by = c("casn" = "casrn")) |> 
+    dplyr::rename(CAS = casn) |> 
+    dplyr::mutate(ACC_value = hit_val * MlWt) |> 
+    dplyr::filter(!is.na(ACC_value)) |> 
+    dplyr::left_join(dplyr::select(tox_chemicals,
+                                  CAS = casn,
+                                  chnm = chnm),
+                     by = "CAS") |> 
+    dplyr::left_join(end_point_info |> 
+                       dplyr::select(aeid,
+                                     endPoint = assay_component_endpoint_name),
+                     by = "aeid")
 
   if (any(is.na(ACC$MlWt))) {
     warning("Some chemicals are missing molecular weights")