diff --git a/.Rbuildignore b/.Rbuildignore index ba431889..5be24097 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -97,7 +97,5 @@ vignettes/group.png vignettes/sites.png vignettes/thres.png ^docker$ -vignettes/update_data.Rmd - - +^review$ diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 4bc247c3..9f98cb53 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,3 +1,5 @@ +image: ${CI_REGISTRY_IMAGE}:latest + workflow: rules: - if: $CI_COMMIT_TAG @@ -29,6 +31,7 @@ variables: build-image: stage: build + cache: [] image: ${DEVOPS_REGISTRY}usgs/docker:20 services: - name: ${DEVOPS_REGISTRY}usgs/docker:20-dind @@ -53,7 +56,7 @@ build-image: buildcheck: stage: check - image: ${CI_REGISTRY_IMAGE}:latest + cache: [] dependencies: - build-image script: @@ -66,7 +69,7 @@ buildcheck: unittests: stage: test - image: ${CI_REGISTRY_IMAGE}:latest + cache: [] dependencies: - build-image - buildcheck @@ -82,7 +85,6 @@ unittests: covertests: stage: test - image: ${CI_REGISTRY_IMAGE}:latest dependencies: - build-image - buildcheck @@ -98,7 +100,6 @@ covertests: pages: stage: end - image: ${CI_REGISTRY_IMAGE}:latest only: - main script: @@ -112,6 +113,8 @@ pages: Validate Inventory: stage: end image: ${INTERNAL_REGISTRY}software/software-management:latest + rules: + - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH script: - software-management review --project "${CI_PROJECT_PATH}" diff --git a/DESCRIPTION b/DESCRIPTION index 15037537..0439b9b3 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: toxEval Type: Package Title: Exploring Biological Relevance of Environmental Chemistry Observations -Version: 1.3.2 +Version: 1.4.0 Authors@R: c(person("Laura", "DeCicco", role = c("aut","cre"), email = "ldecicco@usgs.gov", @@ -33,7 +33,7 @@ Copyright: This software is in the public domain because it contains materials official USGS copyright policy at https://www.usgs.gov/visual-id/credit_usgs.html#copyright Depends: - R (>= 3.5.0) + R (>= 4.1.0) Imports: dplyr, tidyr, @@ 
-60,5 +60,4 @@ BugReports: https://github.com/DOI-USGS/toxEval/issues VignetteBuilder: knitr BuildVignettes: true LazyLoad: yes -RoxygenNote: 7.3.1 - +RoxygenNote: 7.3.2 diff --git a/NAMESPACE b/NAMESPACE index 6b9dece1..4dc72f32 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -11,6 +11,7 @@ export(endpoint_hits) export(endpoint_hits_DT) export(explore_endpoints) export(filter_groups) +export(flags) export(get_ACC) export(get_chemical_summary) export(get_concentration_summary) diff --git a/NEWS b/NEWS index d12f0f25..6f88c569 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,9 @@ +toxEval 1.4.0 +=========== +* Switched to version 4.1 of the ToxCast database +* Added a default flag to remove in remove_flags + + toxEval 1.3.1 =========== * Made "Chemical" a required column in the Chemical tab. Now all plot names will key off that column instead of the names listed in tox_chemicals. diff --git a/R/clean_endPoint_info.R b/R/clean_endPoint_info.R index dfff07e9..b89749e4 100644 --- a/R/clean_endPoint_info.R +++ b/R/clean_endPoint_info.R @@ -1,13 +1,7 @@ #' clean_endPoint_info #' -#' Define a subset of the ToxCast database for relevance to toxEval analyses. -#' Subsetting is done based upon methods defined by Blackwell et al., 2017 ( -#' \doi{10.1021/acs.est.7b01613}). -#' Specifically, this function removes endPoints that are ATG sources with -#' signal loss, and NVS with signal gain (basically: some assay/signal combinations -#' are removed because they target non-specific endpoints). Also, this function adds additional -#' categories to intended_target_family and intended_target_family_sub as -#' described in the paper linked above. +#' As of ToxCast 4.1, this function only helps clean up abbreviations +#' found in the end_point_info data frame. #' #' @param end_point_info Data frame Endpoint information from ToxCast. 
#' @export @@ -21,9 +15,7 @@ #' cleaned_ep <- clean_endPoint_info(end_point_info) #' nrow(cleaned_ep) clean_endPoint_info <- function(end_point_info) { - end_point_info <- end_point_info[!(end_point_info$assay_source_name == "ATG" & end_point_info$signal_direction == "loss"), ] - end_point_info <- end_point_info[!(end_point_info$assay_source_name == "NVS" & end_point_info$signal_direction == "gain"), ] - + end_point_info$intended_target_family[end_point_info$assay_component_endpoint_name %in% c( "CLD_CYP1A1_24hr", "CLD_CYP1A1_48hr", "CLD_CYP1A1_6hr", diff --git a/R/create_toxEval.R b/R/create_toxEval.R index b03be1d0..f50c128c 100644 --- a/R/create_toxEval.R +++ b/R/create_toxEval.R @@ -234,7 +234,7 @@ summary.toxEval <- function(object, ...) { if (is.null(object[["benchmarks"]])) { ACC <- ToxCast_ACC %>% - dplyr::filter(CAS %in% unique(object$chem_info$CAS)) + dplyr::filter(casn %in% unique(object$chem_info$CAS)) bench_word <- "ToxCast" } else { ACC <- object[["benchmarks"]] @@ -242,9 +242,9 @@ summary.toxEval <- function(object, ...) { } CAS_w_data <- ACC %>% - dplyr::select(CAS) %>% + dplyr::select(casn) %>% dplyr::distinct() %>% - dplyr::pull(CAS) + dplyr::pull(casn) message(length(CAS_w_data), " chemicals have ", bench_word, " information") message("Chemicals returned from this function do NOT have ", bench_word, " information:") diff --git a/R/filter_endPoint_info.R b/R/filter_endPoint_info.R index f78db242..f4b92e50 100644 --- a/R/filter_endPoint_info.R +++ b/R/filter_endPoint_info.R @@ -4,7 +4,7 @@ #' supplied data frame \code{\link{end_point_info}} to be used in subsequent analysis steps. #' First, the user specifies the ToxCast assay annotation using the 'groupCol' #' argument, which is a column header in 'end_point_info'. Second, the user -#' specifies the families of assays to use. Finally, the user can choose to +#' specifies the families of assays to exclude. Finally, the user can choose to #' remove specific group(s) from the category. 
The default is to remove #' 'Background Measurement' and 'Undefined'. Choices for this should be #' reconsidered based on individual study objectives. @@ -17,11 +17,8 @@ #' #' @param ep Data frame containing Endpoint information from ToxCast #' @param groupCol Character name of ToxCast annotation column to use as a group category -#' @param assays Vector of assays to use in the data analysis. Possible values are "ACEA", "APR", "ATG", -#' "NVS", "OT", "TOX21", "CEETOX", "LTEA", "CLD", "TANGUAY", "CCTE_PADILLA", "BSK" , -#' "CCTE", "STM", "ARUNA", "CCTE_SHAFER", "CPHEA_STOKER", "CCTE_GLTED", "UPITT", "UKN", -#' "ERF", "TAMU", "IUF", "CCTE_MUNDY", "UTOR", "VALA". By default, the -#' "BSK" (BioSeek) assay is removed. +#' @param remove_assays Vector of assays to EXCLUDE in the data analysis. +#' By default, the "BSK" (BioSeek) assay is removed. #' @param remove_groups Vector of groups within the selected 'groupCol' to remove. #' @export #' @examples @@ -31,18 +28,11 @@ #' head(filtered_ep) filter_groups <- function(ep, groupCol = "intended_target_family", - assays = c( - "ACEA", "APR", "ATG", - "NVS", "OT", "TOX21", "CEETOX", - "LTEA", "CLD", "TANGUAY", "CCTE_PADILLA", - "CCTE", "STM", "ARUNA", "CCTE_SHAFER", - "CPHEA_STOKER", "CCTE_GLTED", "UPITT", "UKN", - "ERF", "TAMU", "IUF", "CCTE_MUNDY", - "UTOR", "VALA" - ), + remove_assays = c("BSK"), remove_groups = c("Background Measurement", "Undefined")) { + possible_assays <- unique(end_point_info$assay_source_name) - match.arg(assays, possible_assays, several.ok = TRUE) + match.arg(remove_assays, possible_assays, several.ok = TRUE) # Getting rid of NSE warnings: assay_source_name <- assay_component_endpoint_name <- ".dplyr" @@ -54,7 +44,7 @@ filter_groups <- function(ep, ) names(ep)[names(ep) == groupCol] <- "groupCol" - ep <- ep[(ep$assaysFull %in% assays), ] + ep <- ep[!(ep$assaysFull %in% remove_assays), ] ep <- ep[!is.na(ep$groupCol), ] if (any(!is.na(remove_groups))) { if (any(remove_groups != "")) { diff --git 
a/R/get_ACC.R b/R/get_ACC.R index c5cb7e1d..e623deb6 100644 --- a/R/get_ACC.R +++ b/R/get_ACC.R @@ -3,10 +3,8 @@ #' The \code{get_ACC} function retrieves the activity concentration at cutoff #' (ACC) values for specified chemicals. #' -#' The data used in toxEval were combined from files in the -#' "INVITRODB_V3_LEVEL5" directory that were included in the October 2018 -#' release of the ToxCast database. The function \code{get_ACC} will -#' convert the ACC values in the ToxCast database from units of (log \eqn{\mu}M) +#' The function \code{get_ACC} will +#' convert the ACC values in the ToxCast database from units of (\eqn{\mu}M) #' to units of \eqn{\mu}g/L, and reformat the data as input to toxEval. #' #' @param CAS Vector of CAS. @@ -19,27 +17,25 @@ #' head(ACC) get_ACC <- function(CAS) { - chem_list <- dplyr::select(tox_chemicals, - casrn = Substance_CASRN, - MlWt = Structure_MolWt - ) - chem_list <- dplyr::filter(chem_list, casrn %in% CAS) + chem_list <- tox_chemicals |> + dplyr::select(casrn = casn, + MlWt = Structure_MolWt) |> + dplyr::filter(casrn %in% CAS) - ACC <- ToxCast_ACC - ACC <- dplyr::filter(ACC, CAS %in% CAS) - ACC <- dplyr::right_join(ACC, chem_list, - by = c("CAS" = "casrn") - ) - - ACC <- dplyr::mutate(ACC, - ACC_value = 10^ACC, - ACC_value = ACC_value * MlWt - ) - ACC <- dplyr::filter(ACC, !is.na(ACC_value)) - ACC <- dplyr::left_join(ACC, dplyr::select(tox_chemicals, - CAS = Substance_CASRN, - chnm = Substance_Name - ), by = "CAS") + ACC <- ToxCast_ACC |> + dplyr::filter(casn %in% CAS) |> + dplyr::right_join(chem_list, by = c("casn" = "casrn")) |> + dplyr::rename(CAS = casn) |> + dplyr::mutate(ACC_value = hit_val * MlWt) |> + dplyr::filter(!is.na(ACC_value)) |> + dplyr::left_join(dplyr::select(tox_chemicals, + CAS = casn, + chnm = chnm), + by = "CAS") |> + dplyr::left_join(end_point_info |> + dplyr::select(aeid, + endPoint = assay_component_endpoint_name), + by = "aeid") if (any(is.na(ACC$MlWt))) { warning("Some chemicals are missing molecular 
weights") diff --git a/R/get_chemical_summary.R b/R/get_chemical_summary.R index 05bceb32..2521dfb0 100644 --- a/R/get_chemical_summary.R +++ b/R/get_chemical_summary.R @@ -90,10 +90,12 @@ get_chemical_summary <- function(tox_list, ACC = NULL, filtered_ep = "All", chem_data$Value <- as.numeric(chem_data$Value) } - chemical_summary <- dplyr::full_join(ACC, + chemical_summary <- dplyr::full_join(dplyr::distinct(ACC), dplyr::select(chem_data, - CAS, SiteID, Value, `Sample Date`), - by = "CAS") %>% + CAS, SiteID, Value, `Sample Date`) %>% + dplyr::filter(!is.na(CAS)), + by = "CAS", + relationship = "many-to-many") %>% dplyr::filter( !is.na(ACC_value), !is.na(Value)) %>% @@ -117,7 +119,7 @@ get_chemical_summary <- function(tox_list, ACC = NULL, filtered_ep = "All", dplyr::left_join(dplyr::distinct(dplyr::select(chem_site, site = SiteID, `Short Name`)), by = "site" ) %>% - dplyr::left_join(dplyr::select(chem_info, CAS, Class), by = "CAS") %>% + dplyr::left_join(dplyr::distinct(dplyr::select(chem_info, CAS, Class)), by = "CAS") %>% dplyr::rename( Bio_category = groupCol, shortName = `Short Name` @@ -208,85 +210,52 @@ orderEP <- function(graphData) { #' \code{remove_flags} function. 
The flags included in ToxCast, and the associated #' flagsShort value (used in the remove_flags function) are as follows: #' \tabular{ll}{ -#' \strong{Flag} \tab \strong{flagsShort}\cr -#' Borderline active* \tab Borderline* \cr -#' Only highest conc above baseline, active* \tab OnlyHighest* \cr -#' Only one conc above baseline, active \tab OneAbove \cr -#' Noisy data \tab Noisy \cr -#' Hit-call potentially confounded by overfitting \tab HitCall \cr -#' Gain AC50 < lowest conc & loss AC50 < mean conc* \tab GainAC50* \cr -#' Biochemical assay with < 50\% efficacy* \tab Biochemical* \cr -#' Less than 50\% efficacy \tab LessThan50 \cr -#' AC50 less than lowest concentration tested* \tab ACCLessThan* \cr -#' GNLSmodel \tab GNLSmodel \cr +#' \strong{flag_id} \tab \strong{Full Name}\cr +#'5* \tab Model directionality questionable \cr +#'6* \tab Only highest conc above baseline, active \cr +#'7 \tab Only one conc above baseline, active \cr +#'8 \tab Multiple points above baseline, inactive \cr +#'9 \tab Bmd > ac50, indication of high baseline variability \cr +#'10 \tab Noisy data \cr +#'11* \tab Borderline \cr +#'15* \tab Gain AC50 < lowest conc & loss AC50 < mean conc \cr +#'17 \tab Less than 50\% efficacy \cr +#'18* \tab AC50 less than lowest concentration tested \cr +#'13 \tab Average number of replicates per conc is less than 2 \cr +#'14 \tab Number of concentrations tested is less than 4 \cr +#'19 \tab Cell viability assay fit with gnls winning model \cr #' } #' Asterisks indicate flags removed in the function as default. #' #' #' @param ACC data frame with columns: casn, chnm, endPoint, and ACC_value -#' @param flagsShort vector of flags to to trigger REMOVAL of chemical:endPoint -#' combination. Possible values are "Borderline", "OnlyHighest", "OneAbove", -#' "Noisy", "HitCall", "GainAC50", "Biochemical","LessThan50","ACCLessThan","GNLSmodel". 
+#' @param flag_id vector of flag ids (see \code{flags}) that trigger REMOVAL of a row #' @export #' @examples #' CAS <- c("121-00-6", "136-85-6", "80-05-7", "84-65-1", "5436-43-1", "126-73-8") #' ACC <- get_ACC(CAS) #' nrow(ACC) +#' +#' # See available flags and associated ids: +#' +#' flags +#' #' ACC <- remove_flags(ACC) #' nrow(ACC) -remove_flags <- function(ACC, flagsShort = c( - "Borderline", - "OnlyHighest", - "GainAC50", - "Biochemical", - "ACCLessThan" - )) { - match.arg(flagsShort, - c( - "Borderline", - "OnlyHighest", - "OneAbove", - "Noisy", - "HitCall", - "GainAC50", - "Biochemical", - "LessThan50", - "ACCLessThan", - "GNLSmodel" - ), +remove_flags <- function(ACC, + flag_id = c(5, 6, 11, 15, 18)) { + match.arg(as.character(flag_id), + as.character(unique(flags$flag_id)), several.ok = TRUE ) - flag_hits <- dplyr::select(ACC, flags) %>% - dplyr::mutate( - Borderline = grepl("Borderline active", flags), - Noisy = grepl("Noisy data", flags), - OneAbove = grepl("Only one conc above baseline", flags), - OnlyHighest = grepl("Only highest conc above baseline", flags), - Biochemical = grepl("Biochemical assay with", flags), - GainAC50 = grepl("Gain AC50", flags), - HitCall = grepl("potentially confounded by overfitting", flags), - LessThan50 = grepl("Less than 50% efficacy", flags), - ACCLessThan = grepl("AC50 less than lowest concentration tested", flags), - GNLSmodel = grepl("Cell viability assay fit with gnls winning model", flags) - ) %>% - dplyr::select(-flags) - - ACC <- ACC[rowSums(flag_hits[flagsShort]) == 0, ] - - return(ACC) - - # So, with the defaults, we are taking out: - # c("Borderline active", - # "Only highest conc above baseline, active", - # "Gain AC50 < lowest conc & loss AC50 < mean conc", - # "Biochemical assay with < 50% efficacy") - # We are leaving in with the defaults: - # c("Hit-call potentially confounded by overfitting", - # "Only one conc above baseline, active", - # "AC50 less than lowest concentration tested", - # "Less than 50% efficacy", - # "Noisy 
data","Cell viability assay fit with gnls winning model") + # TRUE where a row carries any requested flag; vapply also handles a single flag_id or zero rows + flagged <- vapply(ACC$flags, function(f) any(flag_id %in% f), logical(1)) + # Logical subsetting keeps all rows when none match (ACC[-integer(0), ] returned zero rows) + ACC_filter <- ACC[!flagged, ] + + return(ACC_filter) + } diff --git a/R/plot_tox_stacks.R b/R/plot_tox_stacks.R index b0dfe769..ca4886a5 100644 --- a/R/plot_tox_stacks.R +++ b/R/plot_tox_stacks.R @@ -55,7 +55,8 @@ #' filtered_ep <- filter_groups(cleaned_ep) #' chemical_summary <- get_chemical_summary(tox_list, ACC, filtered_ep) #' -#' plot_tox_stacks(chemical_summary, tox_list$chem_site, "Biological") +#' plot_tox_stacks(chemical_summary, tox_list$chem_site, +#' "Biological", top_num = 5) #' #' \donttest{ #' plot_tox_stacks(chemical_summary, tox_list$chem_site, "Chemical Class") diff --git a/R/sysdata.rda b/R/sysdata.rda index 120024e1..ddd18a2d 100644 Binary files a/R/sysdata.rda and b/R/sysdata.rda differ diff --git a/R/toxEval.R b/R/toxEval.R index 2900c51d..b2ec0f5d 100644 --- a/R/toxEval.R +++ b/R/toxEval.R @@ -10,7 +10,7 @@ ToxCast database: version", dbVersion()), width = 40), } dbVersion <- function() { - "3.5" + "4.1" } #' Analyze ToxCast data in relation to measured concentrations. @@ -45,19 +45,15 @@ dbVersion <- function() { #' @keywords internal "_PACKAGE" -#' ACC values included with toxEval. +#' ACC values included with toxEval. See \code{vignette("Setting up toxEval package data", package = "toxEval")} +#' for more information on how the data was aggregated. #' -#' Downloaded on October 2022 from ToxCast. The data were -#' combined from files in the "INVITRODB_V3_5_LEVEL5" folder. -#' At the time of toxEval package release, this information was found: -#' \url{https://www.epa.gov/comptox-tools/exploring-toxcast-data} -#' in the "ToxCast & Tox21 Data Spreadsheet" data set. 
-#' ACC values are the in the "ACC" column (winning model) and units are -#' log micro-Molarity (log \eqn{\mu}M). +#' Downloaded on September 2023 from ToxCast. 
See also +#' \url{https://www.frontiersin.org/articles/10.3389/ftox.2023.1275980/full}. #' -#' @references Toxicology, EPA's National Center for Computational (2020): -#' ToxCast and Tox21 Data Spreadsheet. figshare. Dataset. -#' \doi{10.23645/epacomptox.6062479.v3}. +#' @references U.S. EPA. 2023. ToxCast & Tox21 Summary Files. +#' Retrieved from \url{https://www.epa.gov/chemical-research/toxicity-forecaster-toxcasttm-data} +#' on September 2023. #' #' @source \url{https://www.epa.gov/comptox-tools/exploring-toxcast-data} #' @@ -73,13 +69,8 @@ NULL #' Endpoint information from ToxCast #' -#' Downloaded on October 2022 from ToxCast. The file name of the -#' raw data was "assay_annotation_information_invitrodb_v3_5.xlsx" from the zip file -#' "INVITRODB_V3_5_SUMMARY" folder. At the time -#' of the toxEval package release, these data were found at: -#' \url{https://www.epa.gov/comptox-tools/exploring-toxcast-data} -#' in the section marked "Download Assay Information", in the -#' ToxCast & Tox21 high-throughput assay information data set. +#' See \code{vignette("Setting up toxEval package data", package = "toxEval")} +#' for more information on how the data was aggregated. #' #' #' @name end_point_info @@ -99,45 +90,15 @@ NULL #' head(end_point_info[, 1:5]) NULL -#Due to size constraints for CRAN, some columns needed to be removed: -# -# end_point_info <- end_point_info |> -# dplyr::select(-reagent_reagent_name_value_type, -# -reagent_reagent_name_value, -# -citations_citation, -# -citations_title, -# -citations_author, -# -assay_source_desc, -# -assay_component_endpoint_desc) -# save(end_point_info, tox_chemicals, ToxCast_ACC, file = "sysdata.rda", compress = "xz") #' ToxCast Chemical Information #' -#' Downloaded from the CompTox database on October 2022. -#' \url{https://comptox.epa.gov/dashboard/}. Additional columns were -#' added based on the information from the "INVITRODB_V3_5_LEVEL5" data. 
+#' See \code{vignette("Setting up toxEval package data", package = "toxEval")} +#' for more information on how the data was aggregated. #' -#' @return data frame with the following columns: -#' \tabular{ll}{ -#' Column \tab Description \cr -#' DSSTox_Substance_Id \tab DSSTox_Substance_Id\cr -#' Substance_Name \tab Commen chemical name \cr -#' Structure_MolWt \tab Molecular weight \cr -#' DTXCID \tab DTXCID\cr -#' Substance_CASRN \tab CASRN \cr -#' INCHIKEY \tab INCHIKEY\cr -#' SMILES \tab SMILES\cr -#' Total_tested \tab Total number of ToxCast assays tested\cr -#' Active \tab Number of ToxCast assays flagged as active \cr -#' min_concentration \tab Minimum concentration tested in ToxCast (ug/L) \cr -#' max_concentration \tab Maximum concentration tested in ToxCast (ug/L) \cr -#' } -#' #' @aliases tox_chemicals #' @name tox_chemicals -#' @return data frame with columns: -#' "Substance_Name","Substance_CASRN", -#' "Structure_MolWt" +#' @return data frame #' @docType data #' @keywords datasets #' @export tox_chemicals @@ -145,7 +106,20 @@ NULL #' head(tox_chemicals) NULL - +#' ToxCast Flag Information +#' +#' See \code{vignette("Setting up toxEval package data", package = "toxEval")} +#' for more information on how the data was aggregated. 
+#' +#' @aliases flags +#' @name flags +#' @return data frame +#' @docType data +#' @keywords datasets +#' @export flags +#' @examples +#' head(flags) +NULL utils::globalVariables(c("CAS", "endPoint", "chnm", "flags", "site", "Bio_category", "Class", "EAR", @@ -159,5 +133,6 @@ utils::globalVariables(c("CAS", "endPoint", "chnm", "flags", "site", "dec_lat", "dec_lon", "nSites", "name", "nonZero", "maxEAR", "count", "site_grouping", "index", "n", "x", "y", "max_med", "ymin", "label", - "ymax", "hit_label", "percentDet", "lab")) + "ymax", "hit_label", "percentDet", "lab", + "aeid", "assay_component_endpoint_name", "casn", "hit_val")) diff --git a/README.Rmd b/README.Rmd index 692c82fe..8a1fab90 100644 --- a/README.Rmd +++ b/README.Rmd @@ -40,21 +40,26 @@ install_gitlab("water/toxEval", host = "code.usgs.gov", build_vignettes = TRUE, build_opts = c("--no-resave-data", - "--no-manual")) + "--no-manual"), + dependencies = TRUE) ``` ## Quickstart -

- app_demo -

- Installation instructions are below. To quickly get going in `toxEval`, run: -```{r eval=FALSE} +```{r} library(toxEval) +``` + +```{r eval=FALSE} explore_endpoints() ``` + +

+ app_demo +

+ Then click on the "Load Example Data" in the upper right corner. This loads the example data that is found here: ```{r eval=FALSE} @@ -125,7 +130,7 @@ The Water and Environmental Health Mission Areas of the USGS, as well as the Gr ### Sunset date -Funding for `toxEval` is secured through summer 2024, after which bug fixes & new features will be minimal. +Funding for `toxEval` is secured through summer 2025, after which bug fixes & new features will be minimal. ## Run toxEval diff --git a/README.md b/README.md index 7a4110c9..92d6d564 100644 --- a/README.md +++ b/README.md @@ -45,23 +45,30 @@ install_gitlab("water/toxEval", host = "code.usgs.gov", build_vignettes = TRUE, build_opts = c("--no-resave-data", - "--no-manual")) + "--no-manual"), + dependencies = TRUE) ``` ## Quickstart -

-app_demo -

- Installation instructions are below. To quickly get going in `toxEval`, run: ``` r library(toxEval) +#> For more information: +#> https://doi-usgs.github.io/toxEval/ +#> ToxCast database: version 4.1 +``` + +``` r explore_endpoints() ``` +

+app_demo +

+ Then click on the “Load Example Data” in the upper right corner. This loads the example data that is found here: @@ -78,9 +85,6 @@ data provided in the package): ``` r library(toxEval) -#> For more information: -#> https://doi-usgs.github.io/toxEval/ -#> ToxCast database: version 3.5 path_to_file <- file.path(system.file("extdata", package="toxEval"), "OWC_data_fromSup.xlsx") tox_list <- create_toxEval(path_to_file) ACClong <- get_ACC(tox_list$chem_info$CAS) @@ -100,7 +104,7 @@ chem_class_plot <- plot_tox_boxplots(chemicalSummary, chem_class_plot ``` -![](man/figures/README-unnamed-chunk-6-1.png) +![](man/figures/README-unnamed-chunk-7-1.png) ``` r @@ -111,7 +115,7 @@ plot_stacks <- plot_tox_stacks(chemicalSummary, plot_stacks ``` -![](man/figures/README-unnamed-chunk-6-2.png) +![](man/figures/README-unnamed-chunk-7-2.png) ``` r ###################################### @@ -122,7 +126,7 @@ plot_heat <- plot_tox_heatmap(chemicalSummary, plot_heat ``` -![](man/figures/README-unnamed-chunk-6-3.png) +![](man/figures/README-unnamed-chunk-7-3.png) This code opens up the example file, loads it into a `toxEval` object, grabs the pertinent ToxCast information, and creates a “chemicalSummary” @@ -131,12 +135,12 @@ data frame that is used in many of the plot and table functions. 
There are 4 vignettes to help introduce and navigate the `toxEval` package: -| Name | R command | Description | +| Name | R command | Description | |------------|--------------|----------------------------------------------| -| [Introduction](https://rconnect.usgs.gov/toxEval_docs/articles/Introduction.html) | `vignette("Introduction", package="toxEval")` | Introduction to the toxEval | +| [Introduction](https://rconnect.usgs.gov/toxEval_docs/articles/Introduction.html) | `vignette("Introduction", package="toxEval")` | Introduction to the toxEval | | [Basic Workflow](https://rconnect.usgs.gov/toxEval_docs/articles/basicWorkflow.html) | `vignette("basicWorkflow", package="toxEval")` | Quickstart guide to get overview of available functions | -| [Prepare Data](https://rconnect.usgs.gov/toxEval_docs/articles/PrepareData.html) | `vignette("PrepareData", package="toxEval")` | Guide to preparing your data for toxEval analysis | -| [Shiny App Guide](https://rconnect.usgs.gov/toxEval_docs/articles/shinyApp.html) | `vignette("shinyApp", package="toxEval")` | Guide to the toxEval shiny application | +| [Prepare Data](https://rconnect.usgs.gov/toxEval_docs/articles/PrepareData.html) | `vignette("PrepareData", package="toxEval")` | Guide to preparing your data for toxEval analysis | +| [Shiny App Guide](https://rconnect.usgs.gov/toxEval_docs/articles/shinyApp.html) | `vignette("shinyApp", package="toxEval")` | Guide to the toxEval shiny application | ### Reporting bugs @@ -162,7 +166,7 @@ team. ### Sunset date -Funding for `toxEval` is secured through summer 2024, after which bug +Funding for `toxEval` is secured through summer 2025, after which bug fixes & new features will be minimal. 
## Run toxEval @@ -181,25 +185,22 @@ explore_endpoints() ``` r citation(package = "toxEval") -#> To cite toxEval in publications, please use: +#> To cite package 'toxEval' in publications use: #> -#> De Cicco, L.A., Corsi, S.R., Villeneuve D.L, Blackwell, and B.R, -#> Ankley, G.T., 2023, toxEval: Evaluation of measured concentration -#> data using the ToxCast high-throughput screening database or a -#> user-defined set of concentration benchmarks. R package version -#> 1.3.0., https://code.usgs.gov/water/toxEval, doi:10.5066/P906UQ5I +#> DeCicco L, Corsi S, Villeneuve D, Blackwell B, Ankley G (2024). +#> _toxEval: Exploring Biological Relevance of Environmental Chemistry +#> Observations_. R package version 1.4.0, commit +#> a37e823cef5c31903dad50537e1a517953ef505a, +#> . #> #> A BibTeX entry for LaTeX users is #> #> @Manual{, -#> author = {Laura A. {De Cicco} and Steven R. Corsi and Daniel L. Villeneuve and Brett R. Blackwell and Gerald T. Ankley}, -#> title = {toxEval: Evaluation of measured concentration data using the ToxCast high-throughput screening database or a user-defined set of concentration benchmarks.}, -#> publisher = {U.S. Geological Survey}, -#> version = {1.3.0}, -#> address = {Reston, VA}, -#> institution = {U.S. 
Geological Survey}, -#> year = {2023}, -#> doi = {10.5066/P906UQ5I}, +#> title = {toxEval: Exploring Biological Relevance of Environmental Chemistry +#> Observations}, +#> author = {Laura DeCicco and Steven Corsi and Daniel Villeneuve and Brett Blackwell and Gerald Ankley}, +#> year = {2024}, +#> note = {R package version 1.4.0, commit a37e823cef5c31903dad50537e1a517953ef505a}, #> url = {https://code.usgs.gov/water/toxEval}, #> } ``` diff --git a/_pkgdown.yml b/_pkgdown.yml index 31a5127c..84757431 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -55,6 +55,7 @@ reference: - end_point_info - ToxCast_ACC - tox_chemicals + - flags - title: Clean Data desc: Functions to prepare data for analysis contents: diff --git a/deploy_simple.R b/deploy_simple.R index 857f3a71..92075f07 100644 --- a/deploy_simple.R +++ b/deploy_simple.R @@ -2,12 +2,11 @@ library(connectapi) client <- connect(server = Sys.getenv("CONNECT_SERVER"), api_key = Sys.getenv("CONNECT_API_KEY")) -file.copy(from = "./public/articles/logo.png", - to = "./public/reference/logo.png") +install.packages("toxEval", repos = "https://rpkg.chs.usgs.gov/prod-cran/latest") -rsconnect::writeManifest(appDir = "./public") -bundle <- bundle_dir("./public") +rsconnect::writeManifest(appDir = "./inst/shiny") +bundle <- bundle_dir("./inst/shiny") content <- client %>% - deploy(bundle, name = "toxEval_docs") %>% + deploy(bundle, name = "toxEval") %>% poll_task() diff --git a/inst/CITATION b/inst/CITATION index eeedf66c..ac7e05db 100644 --- a/inst/CITATION +++ b/inst/CITATION @@ -9,11 +9,11 @@ bibentry(bibtype = "Manual", as.person("Gerald T. Ankley")), title = "toxEval: Evaluation of measured concentration data using the ToxCast high-throughput screening database or a user-defined set of concentration benchmarks.", publisher = "U.S. Geological Survey", - version = "1.3.2", + version = "1.4.0", address="Reston, VA", institution = "U.S. 
Geological Survey", year = 2024, - doi = "10.5066/P906UQ5I", + doi = "10.5066/P1CQJHJV", url = "https://code.usgs.gov/water/toxEval", - textVersion = "De Cicco, L.A., Corsi, S.R., Villeneuve D.L, Blackwell, and B.R, Ankley, G.T., 2024, toxEval: Evaluation of measured concentration data using the ToxCast high-throughput screening database or a user-defined set of concentration benchmarks. R package version 1.3.1., https://code.usgs.gov/water/toxEval, doi:10.5066/P906UQ5I" + textVersion = "De Cicco, L.A., Corsi, S.R., Villeneuve D.L, Blackwell, and B.R, Ankley, G.T., 2024, toxEval: Evaluation of measured concentration data using the ToxCast high-throughput screening database or a user-defined set of concentration benchmarks. R package version 1.4.0., U.S. Geological Survey software release. Reston, VA., doi:10.5066/P1CQJHJV" ) diff --git a/inst/shiny/benchmarks.R b/inst/shiny/benchmarks.R index 19f467f4..8aa852c6 100644 --- a/inst/shiny/benchmarks.R +++ b/inst/shiny/benchmarks.R @@ -21,20 +21,24 @@ get_benchmarks <- reactive({ if(all(is.null(rawData$benchmarks))) { ACC <- get_ACC(rawData$chem_info$CAS) - ACC <- remove_flags(ACC, flagsShort = removeFlags) + ACC <- remove_flags(ACC, flag_id = removeFlags) remove_groups <- unique(cleaned_ep[[groupCol]])[which(!unique(cleaned_ep[[groupCol]]) %in% groups)] remove_groups <- remove_groups[!is.na(remove_groups)] filtered_ep <- filter_groups(cleaned_ep, - groupCol = groupCol, assays = assays, + groupCol = groupCol, + remove_assays = assays, remove_groups = remove_groups) bench <- ACC %>% dplyr::filter(endPoint %in% filtered_ep$endPoint) %>% dplyr::rename(Value = ACC_value, Chemical = chnm) %>% - dplyr::left_join(filtered_ep, by = "endPoint") + dplyr::left_join(filtered_ep, by = "endPoint") |> + dplyr::group_by(dplyr::across(c(-flags))) |> + dplyr::summarise(flags = paste0(unlist(flags), collapse = "|")) |> + dplyr::ungroup() } else { bench <- rawData$benchmarks diff --git a/inst/shiny/boxPlot.R b/inst/shiny/boxPlot.R index 
6f200968..2aaaf6e7 100644 --- a/inst/shiny/boxPlot.R +++ b/inst/shiny/boxPlot.R @@ -86,6 +86,7 @@ output$downloadBoxPlot <- downloadHandler( content = function(file) { ggplot2::ggsave(file, plot = boxPlot_prints(), device = "png", width = 11, + bg = "white", height = PlotHeight() / 200) } ) diff --git a/inst/shiny/endpointGraph.R b/inst/shiny/endpointGraph.R index 3f2163a6..2a62d25a 100644 --- a/inst/shiny/endpointGraph.R +++ b/inst/shiny/endpointGraph.R @@ -14,6 +14,13 @@ endpointGraph_create <- reactive({ need(top_num <= 50 , "Shiny app cannot display more than 50 endpoints") ) + height <- PlotHeight_ep() + if(height > 750) { + text_size <- 10 + } else { + text_size <- NA + } + endpointGraph <- plot_tox_endpoints(chemical_summary, category = category, mean_logic = mean_logic, @@ -21,7 +28,7 @@ endpointGraph_create <- reactive({ hit_threshold = hitThres, filterBy = filterBy, title = genericTitle(), - font_size = 18, + font_size = text_size, top_num = top_num) shinyAce::updateAceEditor(session, editorId = "epGraph_out", value = epGraphCode() ) @@ -60,15 +67,15 @@ PlotHeight_ep = reactive({ } if(is.na(top_num)){ - n <- 35*length(unique(chemical_summary$endPoint)) + n_eps <- 35*length(unique(chemical_summary$endPoint)) } else { - n <- 35*top_num + n_eps <- 35*top_num } - if(n < 500){ - return(500) + if(n_eps < 750){ + return(750) } else { - return(n) + return(n_eps) } }) @@ -80,7 +87,8 @@ output$downloadEndpoint <- downloadHandler( content = function(file) { ggplot2::ggsave(file, plot = endpointGraph_create(), device = "png", width = 11, - height = PlotHeight_ep()/300) + bg = "white", + height = PlotHeight_ep()/ 200) } ) diff --git a/inst/shiny/heatMap.R b/inst/shiny/heatMap.R index aaf015ef..853fdc9a 100644 --- a/inst/shiny/heatMap.R +++ b/inst/shiny/heatMap.R @@ -63,6 +63,7 @@ output$downloadHeatPlot <- downloadHandler( content = function(file) { ggplot2::ggsave(file, plot = heatMap_create(), device = "png", width = 11, + bg = "white", height = PlotHeight()/200) } 
) diff --git a/inst/shiny/server.R b/inst/shiny/server.R index 0db6a718..8d3df389 100644 --- a/inst/shiny/server.R +++ b/inst/shiny/server.R @@ -12,37 +12,15 @@ choicesPerGroup <- choicesPerGroup[which(as.numeric(choicesPerGroup) > 6)] choicesPerGroup <- apply(cleaned_ep[,-1], 2, function(x) length(unique(x))) groupChoices <- paste0(names(choicesPerGroup)," (",choicesPerGroup,")") -initAssay <- c("ACEA", "APR", "ATG", - "NVS", "OT", - "TOX21", "CEETOX", "CLD", "TANGUAY", "NHEERL_PADILLA", "NCCT", - "NHEERL_HUNTER", "NHEERL_NIS", "NHEERL_MED", "UPITT") +initAssay <- c("BSK") init_Groups <- unique(cleaned_ep$intended_target_family) init_Groups <- init_Groups[!is.na(init_Groups)] init_Groups <- init_Groups[!(init_Groups %in% c("Background Measurement","Undefined"))] -all_flags <- c("Borderline", - "OnlyHighest", - "OneAbove", - "Noisy", - "HitCall", - "GainAC50", - "Biochemical", - "LessThan50", - "ACCLessThan", - "GNLSmodel") - -initFlags <- c("Borderline", - "OnlyHighest", - #"OneAbove", - #"Noisy", - #"HitCall", - "GainAC50", - "Biochemical", - #"LessThan50", - "ACCLessThan" - #"GNLSmodel" - ) +all_flags <- flags$flag_id + +initFlags <- c(5, 6, 11, 15, 18) sitesOrdered <- c("StLouis","Pigeon","Nemadji","WhiteWI","Bad","Montreal","PresqueIsle", "Ontonagon","Sturgeon","Tahquamenon", @@ -102,12 +80,12 @@ tox_list <- create_toxEval(path_to_file)") setupCode <- paste0(setupCode," ACC <- get_ACC(tox_list$chem_info$CAS) ACC <- remove_flags(ACC = ACC, - flagsShort = ",removeFlags,") + flag_id = ",removeFlags,") cleaned_ep <- clean_endPoint_info(end_point_info) filtered_ep <- filter_groups(cleaned_ep, groupCol = '",groupCol,"', - assays = ",assays,", + remove_assays = ",assays,", remove_groups = ",remove_groups,") chemical_summary <- get_chemical_summary(tox_list, ACC, filtered_ep)") @@ -152,16 +130,19 @@ chemical_summary <- chemical_summary[chemical_summary$shortName == site,]") } - if(all(is.null(rawData$benchmarks)) || nrow(rawData$benchmarks) == 0){ + 
if(all(is.null(rawData$benchmarks)) || + nrow(rawData$benchmarks) == 0 || + as.logical(input$useToxCast)){ ACC <- get_ACC(rawData$chem_info$CAS) - ACC <- remove_flags(ACC, flagsShort = removeFlags) + ACC <- remove_flags(ACC, flag_id = removeFlags) remove_groups <- unique(cleaned_ep[[groupCol]])[which(!unique(cleaned_ep[[groupCol]]) %in% groups)] remove_groups <- remove_groups[!is.na(remove_groups)] filtered_ep <- filter_groups(cleaned_ep, - groupCol = groupCol, assays = assays, + groupCol = groupCol, + remove_assays = assays, remove_groups = remove_groups) chemical_summary <- get_chemical_summary(rawData, ACC, @@ -196,14 +177,26 @@ chemical_summary <- chemical_summary[chemical_summary$shortName == site,]") toxCast <- reactive({ rawData <- rawData() - - toxCast_val <- all(is.null(rawData$benchmarks)) + + if(all(is.null(rawData$benchmarks))){ + toxCast_val <- TRUE + } else { + toxCast_val <- as.logical(input$useToxCast) + } + return(toxCast_val) }) + hasBenchmarks <- reactive({ + rawData <- rawData() + return(!all(is.null(rawData$benchmarks))) + }) + output$isTox <- reactive(toxCast()) + output$hasBenchmarks <- reactive(hasBenchmarks()) outputOptions(output, "isTox", suspendWhenHidden = FALSE) + outputOptions(output, "hasBenchmarks", suspendWhenHidden = FALSE) output$title_text <- renderText({ @@ -311,23 +304,23 @@ chemical_summary <- chemical_summary[chemical_summary$shortName == site,]") } else { pretty_cat <- switch(category, "Chemical" = "", - "Biological" = "for chemicals within a grouping ", - "Chemical Class" = "for chemicals within a class " + "Biological" = "for chemicals within a grouping", + "Chemical Class" = "for chemicals within a class" ) } if(site == "All"){ if(sum_logic){ - title <- paste0("Summing EARs ",pretty_cat, "of a sample,") + title <- paste("Summing EARs",pretty_cat, "of a sample,") } else { title <- paste("Max EARs",pretty_cat, "of a sample,") } if(mean_logic){ - title <- paste(title,"taking the mean of each site") + title <- 
paste(title,"\ntaking the mean of each site") } else { - title <- paste(title,"taking the max of each site") + title <- paste(title,"\ntaking the max of each site") } } else { @@ -352,8 +345,7 @@ chemical_summary <- chemical_summary[chemical_summary$shortName == site,]") title <- paste(title, "for individual samples") } - title <- paste(title," - ", siteTable[["Fullname"]][which(siteTable$`Short Name` == site)]) + title <- paste(title,"\n", siteTable[["Fullname"]][which(siteTable$`Short Name` == site)]) } return(title) diff --git a/inst/shiny/stackPlot.R b/inst/shiny/stackPlot.R index 602141f0..0a006506 100644 --- a/inst/shiny/stackPlot.R +++ b/inst/shiny/stackPlot.R @@ -61,6 +61,7 @@ output$downloadStackPlot <- downloadHandler( content = function(file) { ggplot2::ggsave(file, plot = stackBarGroup_create(), device = "png", width = 11, + bg = "white", height = 9) } ) diff --git a/inst/shiny/ui.R b/inst/shiny/ui.R index 0a57d901..93dfb005 100644 --- a/inst/shiny/ui.R +++ b/inst/shiny/ui.R @@ -31,27 +31,9 @@ dropDownHeader <- c(paste0(df$orderNames," (",df$nEndPoints,")")) selChoices <- df$orderNames -flagsALL <- c("Borderline active", - "Only highest conc above baseline, active" , - "Only one conc above baseline, active", - "Noisy data", - "Hit-call potentially confounded by overfitting", - "Gain AC50 < lowest conc & loss AC50 < mean conc", - "Biochemical assay with < 50% efficacy", - "Less than 50% efficacy", - "AC50 less than lowest concentration tested", - "Cell viability assay fit with gnls winning model") +flagsALL <- flags$flag_full -shortFlags <- c("Borderline", - "OnlyHighest", - "OneAbove", - "Noisy", - "HitCall", - "GainAC50", - "Biochemical", - "LessThan50", - "ACCLessThan", - "GNLSmodel") +shortFlags <- flags$flag_id assay_names <- c("Apredica" = "APR", "Attagene" = "ATG", @@ -92,14 +74,16 @@ sidebar <- dashboardSidebar( radioButtons("sumEAR", choices = list("Sum"=TRUE, "No Sum" = FALSE), inline = TRUE, label = NULL, selected = TRUE) ), + 
conditionalPanel( + condition = "output.hasBenchmarks == true", + radioButtons("useToxCast", choices = list("ToxCast"=TRUE, "Benchmarks" = FALSE), + inline = TRUE, label = NULL, selected = FALSE) + ), downloadButton('downloadBenchmarks', 'Download Benchmarks', style='margin-left:13px; color: #444'), menuItem("Assay", icon = icon("th"), tabName = "assay", - checkboxGroupInput("assay", "Assays:", + checkboxGroupInput("assay", "Remove Assays:", assay_names, - selected= c("ACEA", "APR", "ATG", - "NVS", "OT", - "TOX21", "CEETOX", "CLD", "TANGUAY", "NHEERL_PADILLA", "NCCT", - "NHEERL_HUNTER", "NHEERL_NIS", "NHEERL_MED", "UPITT")), + selected= c("BSK")), actionButton("pickAssay", label="Switch Assays")), menuItem("Annotation", icon = icon("th"), tabName = "annotation", selectInput("groupCol", label = "Annotation (# Groups)", @@ -119,11 +103,8 @@ sidebar <- dashboardSidebar( selected = "All", multiple = FALSE) ), menuItem("Flags", icon = icon("th"), tabName = "flagMenu", - checkboxGroupInput("flags", "Include Flags",choices = shortFlags, selected = c("Borderline", - "OnlyHighest", - "GainAC50", - "Biochemical", - "ACCLessThan")), + checkboxGroupInput("flags", "Remove Flags",choices = shortFlags, + selected = c(5, 6, 11, 15, 18)), actionButton("pickFlags", label="Switch flags")), menuItem("Hit Threshold",icon = icon("th"), tabName = "hitThresTab", numericInput("hitThres",label = "Hit Threshold",value = 0.1,min = 0.0000001), diff --git a/man/ToxCast_ACC.Rd b/man/ToxCast_ACC.Rd index b971126b..d5fa4841 100644 --- a/man/ToxCast_ACC.Rd +++ b/man/ToxCast_ACC.Rd @@ -3,7 +3,8 @@ \docType{data} \name{ToxCast_ACC} \alias{ToxCast_ACC} -\title{ACC values included with toxEval.} +\title{ACC values included with toxEval. 
See \code{vignette("Setting up toxEval package data", package = "toxEval")} +for more information on how the data was aggregated.} \source{ \url{https://www.epa.gov/comptox-tools/exploring-toxcast-data} } @@ -11,20 +12,15 @@ data frame with columns CAS, chnm (chemical name), flags, endPoint, and ACC (value). } \description{ -Downloaded on October 2022 from ToxCast. The data were -combined from files in the "INVITRODB_V3_5_LEVEL5" folder. -At the time of toxEval package release, this information was found: -\url{https://www.epa.gov/comptox-tools/exploring-toxcast-data} -in the "ToxCast & Tox21 Data Spreadsheet" data set. -ACC values are the in the "ACC" column (winning model) and units are -log micro-Molarity (log \eqn{\mu}M). +Downloaded on September 2023 from ToxCast. See also +\url{https://www.frontiersin.org/articles/10.3389/ftox.2023.1275980/full}. } \examples{ head(ToxCast_ACC) } \references{ -Toxicology, EPA's National Center for Computational (2020): -ToxCast and Tox21 Data Spreadsheet. figshare. Dataset. - \doi{10.23645/epacomptox.6062479.v3}. +U.S. EPA. 2023. ToxCast & Tox21 Summary Files. +Retrieved from \url{https://www.epa.gov/chemical-research/toxicity-forecaster-toxcasttm-data} +on September 2023. } \keyword{datasets} diff --git a/man/clean_endPoint_info.Rd b/man/clean_endPoint_info.Rd index eacdf7a6..fed25be8 100644 --- a/man/clean_endPoint_info.Rd +++ b/man/clean_endPoint_info.Rd @@ -16,14 +16,8 @@ intended_target_family_sub. The names in intended_target_family are revised to look more appealing in graphs and tables. } \description{ -Define a subset of the ToxCast database for relevance to toxEval analyses. -Subsetting is done based upon methods defined by Blackwell et al., 2017 ( -\doi{10.1021/acs.est.7b01613}). -Specifically, this function removes endPoints that are ATG sources with -signal loss, and NVS with signal gain (basically: some assay/signal combinations -are removed because they target non-specific endpoints). 
Also, this function adds additional -categories to intended_target_family and intended_target_family_sub as -described in the paper linked above. +As of ToxCast 4.1, this function only helps clean up abbreviations +found in the end_point_info data frame. } \examples{ end_point_info <- end_point_info diff --git a/man/end_point_info.Rd b/man/end_point_info.Rd index b55ae291..62604efd 100644 --- a/man/end_point_info.Rd +++ b/man/end_point_info.Rd @@ -14,13 +14,8 @@ The column "Relevance Category" was included for consideration of grouping/filtering endpoints based on user goals. } \description{ -Downloaded on October 2022 from ToxCast. The file name of the -raw data was "assay_annotation_information_invitrodb_v3_5.xlsx" from the zip file -"INVITRODB_V3_5_SUMMARY" folder. At the time -of the toxEval package release, these data were found at: -\url{https://www.epa.gov/comptox-tools/exploring-toxcast-data} -in the section marked "Download Assay Information", in the -ToxCast & Tox21 high-throughput assay information data set. +See \code{vignette("Setting up toxEval package data", package = "toxEval")} +for more information on how the data was aggregated.
} \examples{ end_point_info <- end_point_info diff --git a/man/figures/README-unnamed-chunk-6-1.png b/man/figures/README-unnamed-chunk-6-1.png deleted file mode 100644 index 2ddb71e4..00000000 Binary files a/man/figures/README-unnamed-chunk-6-1.png and /dev/null differ diff --git a/man/figures/README-unnamed-chunk-6-2.png b/man/figures/README-unnamed-chunk-6-2.png deleted file mode 100644 index d43ab1ac..00000000 Binary files a/man/figures/README-unnamed-chunk-6-2.png and /dev/null differ diff --git a/man/figures/README-unnamed-chunk-6-3.png b/man/figures/README-unnamed-chunk-6-3.png deleted file mode 100644 index 22d5a6a7..00000000 Binary files a/man/figures/README-unnamed-chunk-6-3.png and /dev/null differ diff --git a/man/figures/README-unnamed-chunk-7-1.png b/man/figures/README-unnamed-chunk-7-1.png new file mode 100644 index 00000000..cf6bc624 Binary files /dev/null and b/man/figures/README-unnamed-chunk-7-1.png differ diff --git a/man/figures/README-unnamed-chunk-7-2.png b/man/figures/README-unnamed-chunk-7-2.png new file mode 100644 index 00000000..a718ae7b Binary files /dev/null and b/man/figures/README-unnamed-chunk-7-2.png differ diff --git a/man/figures/README-unnamed-chunk-7-3.png b/man/figures/README-unnamed-chunk-7-3.png new file mode 100644 index 00000000..aaf5eb06 Binary files /dev/null and b/man/figures/README-unnamed-chunk-7-3.png differ diff --git a/man/filter_groups.Rd b/man/filter_groups.Rd index 6eddf4f4..30c65b69 100644 --- a/man/filter_groups.Rd +++ b/man/filter_groups.Rd @@ -7,9 +7,7 @@ filter_groups( ep, groupCol = "intended_target_family", - assays = c("ACEA", "APR", "ATG", "NVS", "OT", "TOX21", "CEETOX", "LTEA", "CLD", - "TANGUAY", "CCTE_PADILLA", "CCTE", "STM", "ARUNA", "CCTE_SHAFER", "CPHEA_STOKER", - "CCTE_GLTED", "UPITT", "UKN", "ERF", "TAMU", "IUF", "CCTE_MUNDY", "UTOR", "VALA"), + remove_assays = c("BSK"), remove_groups = c("Background Measurement", "Undefined") ) } @@ -18,11 +16,8 @@ filter_groups( \item{groupCol}{Character name 
of ToxCast annotation column to use as a group category} -\item{assays}{Vector of assays to use in the data analysis. Possible values are "ACEA", "APR", "ATG", -"NVS", "OT", "TOX21", "CEETOX", "LTEA", "CLD", "TANGUAY", "CCTE_PADILLA", "BSK" , -"CCTE", "STM", "ARUNA", "CCTE_SHAFER", "CPHEA_STOKER", "CCTE_GLTED", "UPITT", "UKN", -"ERF", "TAMU", "IUF", "CCTE_MUNDY", "UTOR", "VALA". By default, the -"BSK" (BioSeek) assay is removed.} +\item{remove_assays}{Vector of assays to EXCLUDE from the data analysis. +By default, the "BSK" (BioSeek) assay is removed.} \item{remove_groups}{Vector of groups within the selected 'groupCol' to remove.} } @@ -31,7 +26,7 @@ This function provides a mechanism to specify 3 levels of information in the supplied data frame \code{\link{end_point_info}} to be used in subsequent analysis steps. First, the user specifies the ToxCast assay annotation using the 'groupCol' argument, which is a column header in 'end_point_info'. Second, the user -specifies the families of assays to use. Finally, the user can choose to +specifies the families of assays to exclude. Finally, the user can choose to remove specific group(s) from the category. The default is to remove 'Background Measurement' and 'Undefined'. Choices for this should be reconsidered based on individual study objectives. diff --git a/man/flags.Rd b/man/flags.Rd new file mode 100644 index 00000000..08888eba --- /dev/null +++ b/man/flags.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/toxEval.R +\docType{data} +\name{flags} +\alias{flags} +\title{ToxCast Data Quality Flags} +\value{ +data frame +} +\description{ +See \code{vignette("Setting up toxEval package data", package = "toxEval")} +for more information on how the data was aggregated.
+} +\examples{ +head(flags) +} +\keyword{datasets} diff --git a/man/get_ACC.Rd b/man/get_ACC.Rd index 1bbf3d72..1551adfa 100644 --- a/man/get_ACC.Rd +++ b/man/get_ACC.Rd @@ -17,10 +17,8 @@ The \code{get_ACC} function retrieves the activity concentration at cutoff (ACC) values for specified chemicals. } \details{ -The data used in toxEval were combined from files in the -"INVITRODB_V3_LEVEL5" directory that were included in the October 2018 -release of the ToxCast database. The function \code{get_ACC} will -convert the ACC values in the ToxCast database from units of (log \eqn{\mu}M) +The function \code{get_ACC} will +convert the ACC values in the ToxCast database from units of (\eqn{\mu}M) to units of \eqn{\mu}g/L, and reformat the data as input to toxEval. } \examples{ diff --git a/man/plot_tox_stacks.Rd b/man/plot_tox_stacks.Rd index fb668aec..145b5b9f 100644 --- a/man/plot_tox_stacks.Rd +++ b/man/plot_tox_stacks.Rd @@ -84,7 +84,8 @@ cleaned_ep <- clean_endPoint_info(end_point_info) filtered_ep <- filter_groups(cleaned_ep) chemical_summary <- get_chemical_summary(tox_list, ACC, filtered_ep) -plot_tox_stacks(chemical_summary, tox_list$chem_site, "Biological") +plot_tox_stacks(chemical_summary, tox_list$chem_site, + "Biological", top_num = 5) \donttest{ plot_tox_stacks(chemical_summary, tox_list$chem_site, "Chemical Class") diff --git a/man/remove_flags.Rd b/man/remove_flags.Rd index 5941dba6..2027213c 100644 --- a/man/remove_flags.Rd +++ b/man/remove_flags.Rd @@ -4,17 +4,12 @@ \alias{remove_flags} \title{Remove endpoints with specific data quality flags from data} \usage{ -remove_flags( - ACC, - flagsShort = c("Borderline", "OnlyHighest", "GainAC50", "Biochemical", "ACCLessThan") -) +remove_flags(ACC, flag_id = c(5, 6, 11, 15, 18)) } \arguments{ \item{ACC}{data frame with columns: casn, chnm, endPoint, and ACC_value} -\item{flagsShort}{vector of flags to to trigger REMOVAL of chemical:endPoint -combination. 
Possible values are "Borderline", "OnlyHighest", "OneAbove", -"Noisy", "HitCall", "GainAC50", "Biochemical","LessThan50","ACCLessThan","GNLSmodel".} +\item{flag_id}{vector of flags to trigger REMOVAL} } \description{ Through the ToxCast program quality assurance procedures, information @@ -26,17 +21,20 @@ to them can be removed based on their designated flag with the \code{remove_flags} function. The flags included in ToxCast, and the associated flagsShort value (used in the remove_flags function) are as follows: \tabular{ll}{ -\strong{Flag} \tab \strong{flagsShort}\cr -Borderline active* \tab Borderline* \cr -Only highest conc above baseline, active* \tab OnlyHighest* \cr -Only one conc above baseline, active \tab OneAbove \cr -Noisy data \tab Noisy \cr -Hit-call potentially confounded by overfitting \tab HitCall \cr -Gain AC50 < lowest conc & loss AC50 < mean conc* \tab GainAC50* \cr -Biochemical assay with < 50\% efficacy* \tab Biochemical* \cr -Less than 50\% efficacy \tab LessThan50 \cr -AC50 less than lowest concentration tested* \tab ACCLessThan* \cr -GNLSmodel \tab GNLSmodel \cr +\strong{flag_id} \tab \strong{Full Name}\cr +5* \tab Model directionality questionable \cr +6* \tab Only highest conc above baseline, active \cr +7 \tab Only one conc above baseline, active \cr +8 \tab Multiple points above baseline, inactive \cr +9 \tab Bmd > ac50, indication of high baseline variability \cr +10 \tab Noisy data \cr +11* \tab Borderline \cr +15* \tab Gain AC50 < lowest conc & loss AC50 < mean conc \cr +17 \tab Less than 50\% efficacy \cr +18* \tab AC50 less than lowest concentration tested \cr +13 \tab Average number of replicates per conc is less than 2 \cr +14 \tab Number of concentrations tested is less than 4 \cr +19 \tab Cell viability assay fit with gnls winning model \cr } Asterisks indicate flags removed in the function as default. } @@ -44,6 +42,11 @@ Asterisks indicate flags removed in the function as default.
CAS <- c("121-00-6", "136-85-6", "80-05-7", "84-65-1", "5436-43-1", "126-73-8") ACC <- get_ACC(CAS) nrow(ACC) + +# See available flags and associated ids: + +flags + ACC <- remove_flags(ACC) nrow(ACC) } diff --git a/man/tox_chemicals.Rd b/man/tox_chemicals.Rd index 759d0a3c..4065fe7a 100644 --- a/man/tox_chemicals.Rd +++ b/man/tox_chemicals.Rd @@ -5,30 +5,11 @@ \alias{tox_chemicals} \title{ToxCast Chemical Information} \value{ -data frame with the following columns: -\tabular{ll}{ -Column \tab Description \cr -DSSTox_Substance_Id \tab DSSTox_Substance_Id\cr -Substance_Name \tab Commen chemical name \cr -Structure_MolWt \tab Molecular weight \cr -DTXCID \tab DTXCID\cr -Substance_CASRN \tab CASRN \cr -INCHIKEY \tab INCHIKEY\cr -SMILES \tab SMILES\cr -Total_tested \tab Total number of ToxCast assays tested\cr -Active \tab Number of ToxCast assays flagged as active \cr -min_concentration \tab Minimum concentration tested in ToxCast (ug/L) \cr -max_concentration \tab Maximum concentration tested in ToxCast (ug/L) \cr -} - -data frame with columns: -"Substance_Name","Substance_CASRN", -"Structure_MolWt" +data frame } \description{ -Downloaded from the CompTox database on October 2022. -\url{https://comptox.epa.gov/dashboard/}. Additional columns were -added based on the information from the "INVITRODB_V3_5_LEVEL5" data. +See \code{vignette("Setting up toxEval package data", package = "toxEval")} +for more information on how the data was aggregated. 
} \examples{ head(tox_chemicals) diff --git a/tests/testthat/test_data.R b/tests/testthat/test_data.R index 2cf0f63a..081832e2 100644 --- a/tests/testthat/test_data.R +++ b/tests/testthat/test_data.R @@ -4,33 +4,19 @@ context("Data") test_that("Check included data", { ToxCast_ACC <- ToxCast_ACC - expect_true(all(names(ToxCast_ACC) %in% c("CAS", + expect_true(all(names(ToxCast_ACC) %in% c("casn", "flags", - "endPoint", - "ACC"))) + "aeid", + "hit_val"))) - expect_true(is.numeric(ToxCast_ACC$ACC)) - expect_true(is.character(ToxCast_ACC$CAS)) - expect_true(is.character(ToxCast_ACC$flags)) - expect_true(is.character(ToxCast_ACC$endPoint)) + expect_true(is.numeric(ToxCast_ACC$hit_val)) + expect_true(is.character(ToxCast_ACC$casn)) + expect_true(is.numeric(ToxCast_ACC$aeid)) - CAS <- unique(ToxCast_ACC$CAS) + CAS <- unique(ToxCast_ACC$casn) ACC <- get_ACC(CAS) - all_flags <- c( - "Borderline", - "OnlyHighest", - "OneAbove", - "Noisy", - "HitCall", - "GainAC50", - "Biochemical", - "LessThan50", - "ACCLessThan", - "GNLSmodel" - ) - - all_gone <- remove_flags(ACC, all_flags) + all_gone <- remove_flags(ACC, unique(flags$flag_id)) expect_true(all(is.na(all_gone$flags))) end_point_info <- end_point_info @@ -44,11 +30,11 @@ test_that("Check included data", { "assay_name", "assay_component_name", "assay_component_endpoint_name", - "intended_target_gene_id", - "intended_target_gene_name", - "intended_target_gene_symbol", - "signal_direction", - "analysis_direction", + # "intended_target_gene_id", + # "intended_target_gene_name", + # "intended_target_gene_symbol", + # "signal_direction", + # "analysis_direction", "biological_process_target") expect_true(all(tm_cols %in% names(end_point_info))) @@ -64,12 +50,12 @@ test_that("Check included data", { c(default_eps, "BSK"))) tox_chemicals <- tox_chemicals - expect_true(all(c("Substance_CASRN", + + expect_true(all(c("casn", + "dsstox_substance_id", "Structure_MolWt", - "Substance_Name", + "chnm", "Total_tested", - "Active", - 
"min_concentration", - "max_concentration") %in% names(tox_chemicals))) + "Active") %in% names(tox_chemicals))) }) \ No newline at end of file diff --git a/tests/testthat/test_utils.R b/tests/testthat/test_utils.R index 8534324a..be80e5b6 100644 --- a/tests/testthat/test_utils.R +++ b/tests/testthat/test_utils.R @@ -39,9 +39,3 @@ test_that("Labels", { }) -test_that("Complete", { - testthat::skip_on_cran() -}) - -# get_complete_set_category -# get_complete_set diff --git a/tests/testthat/tests_endpoint.R b/tests/testthat/tests_endpoint.R index b9908855..c1eae284 100644 --- a/tests/testthat/tests_endpoint.R +++ b/tests/testthat/tests_endpoint.R @@ -21,7 +21,9 @@ test_that("Getting ACC values", { testthat::skip_on_cran() ACC <- get_ACC(CAS) - expect_true(all(names(ACC) %in% c("CAS", "chnm", "flags", "endPoint", "ACC", "ACC_value", "MlWt"))) + expect_true(all(names(ACC) %in% + c("CAS", "chnm", "flags", "hit_val", "aeid", + "endPoint", "ACC", "ACC_value", "MlWt"))) expect_type(ACC$ACC_value, "double") expect_type(ACC$endPoint, "character") @@ -35,19 +37,8 @@ test_that("Removing flags", { testthat::skip_on_cran() ACC <- get_ACC(CAS) - ACC_noFlags <- remove_flags(ACC, - flagsShort = c( - "Borderline", - "OnlyHighest", - "OneAbove", - "Noisy", - "HitCall", - "GainAC50", - "Biochemical", - "ACCLessThan", - "LessThan50", - "GNLSmodel" - ) + ACC_noFlags <- remove_flags(ACC, + flag_id = flags$flag_id ) expect_lt(nrow(ACC_noFlags), nrow(ACC)) expect_true(all(is.na(ACC_noFlags$flags))) @@ -59,7 +50,6 @@ test_that("Cleaning up endpoints", { # based on first paper: cleaned_ep <- clean_endPoint_info(end_point_info) expect_equal(names(cleaned_ep), names(end_point_info)) - expect_lt(nrow(cleaned_ep), nrow(end_point_info)) cleanedNames <- c( "Nuclear Receptor", "Cell Cycle", "Cell Morphology", @@ -90,15 +80,15 @@ test_that("Filtering endpoints", { cleaned_ep <- clean_endPoint_info(end_point_info) - assays <- c("ATG", "NVS", "OT") + assays <- c("ATG", "NVS", "OT", "BSK") groups <- 
c("Background Measurement", "Undefined", "Cell Cycle") filtered_ep <- filter_groups(cleaned_ep, groupCol = "intended_target_family", - assays = assays, + remove_assays = assays, remove_groups = groups ) - expect_true(all(unique(filtered_ep$assaysFull) %in% assays)) + expect_true(all(!assays %in% unique(filtered_ep$assaysFull))) expect_true(!(any(unique(filtered_ep$groupCol) %in% groups))) }) diff --git a/tests/testthat/tests_summary.R b/tests/testthat/tests_summary.R index db3f19c6..7851d1c6 100644 --- a/tests/testthat/tests_summary.R +++ b/tests/testthat/tests_summary.R @@ -177,7 +177,7 @@ test_that("Table functions", { "Cell Cycle freq", "Cell Cycle maxEAR" ) %in% names(statStuff))) - expect_equal(round(statStuff[["DNA Binding maxEAR"]][which(statStuff[["site"]] == "Raisin")], 4), 0.0278) + expect_equal(round(statStuff[["DNA Binding maxEAR"]][which(statStuff[["site"]] == "Raisin")], 4), 0.0301) expect_equal(round(statStuff[["DNA Binding freq"]][which(statStuff[["site"]] == "Raisin")], 4), 0) @@ -185,7 +185,7 @@ test_that("Table functions", { expect_true(all(unique(groupStuff$site) %in% chem_site$`Short Name`)) expect_true(all(c("site", "category", "Samples with hits", "Number of Samples") %in% names(groupStuff))) expect_equal(groupStuff[["Samples with hits"]][which(groupStuff[["site"]] == "Raisin" & - groupStuff[["category"]] == "Nuclear Receptor")], 27) + groupStuff[["category"]] == "Nuclear Receptor")], 29) expect_equal(groupStuff[["Number of Samples"]][which(groupStuff[["site"]] == "Raisin" & groupStuff[["category"]] == "DNA Binding")], 44) @@ -197,11 +197,11 @@ test_that("Chem plotting functions", { graphData <- graph_chem_data(chemical_summary) expect_true(all(names(graphData) %in% c("site", "chnm", "Class", "meanEAR"))) expect_equal(levels(graphData$Class)[1], "Herbicides") - expect_equal(levels(graphData$Class)[length(levels(graphData$Class))], "Fuels") + expect_equal(levels(graphData$Class)[length(levels(graphData$Class))], "Dyes/Pigments") 
expect_equal(signif(graphData[["meanEAR"]][graphData[["site"]] == "USGS-04024000" & - graphData[["chnm"]] == "Naphthalene"], 4), 0.002537) + graphData[["chnm"]] == "Naphthalene"], 4), 0.0004799) expect_equal(signif(graphData[["meanEAR"]][graphData[["site"]] == "USGS-04024000" & - graphData[["chnm"]] == "Cumene"], 4), 6.665e-06) + graphData[["chnm"]] == "Anthraquinone"], 4), 6.131e-05) }) test_that("Map stuff functions", { @@ -214,7 +214,7 @@ test_that("Map stuff functions", { expect_type(mapDataList, "list") expect_equal(length(mapDataList), 2) map_df <- mapDataList[["mapData"]] - expect_equal(signif(map_df[["meanMax"]][map_df[["Short Name"]] == "StLouis"], 4), 2.207) + expect_equal(signif(map_df[["meanMax"]][map_df[["Short Name"]] == "StLouis"], 4), 1.891) expect_equal(map_df[["count"]][map_df[["Short Name"]] == "StLouis"], 31) expect_equal(map_df[["sizes"]][map_df[["Short Name"]] == "StLouis"], 7.2) @@ -232,17 +232,17 @@ test_that("Table endpoint hits", { bt <- endpoint_hits_DT(chemical_summary, category = "Biological") expect_type(bt, "list") expect_true(all(names(bt$x$data) %in% c( - "endPoint", "Nuclear Receptor", "CYP", - "Cell Cycle", "Steroid Hormone", "Esterase" + "endPoint", "Nuclear Receptor", "CYP", "Lyase", + "Steroid Hormone", "Esterase", "Metabolite" ))) expect_true(all(class(bt) %in% c("datatables", "htmlwidget"))) bt_df <- endpoint_hits(chemical_summary, category = "Biological") expect_true(all(names(bt_df) %in% c( - "endPoint", "Nuclear Receptor", "CYP", - "Cell Cycle", "Esterase", "Steroid Hormone" + "endPoint", "Nuclear Receptor", "CYP", "Lyase", + "Steroid Hormone", "Esterase", "Metabolite" ))) - expect_equal(bt_df[["Nuclear Receptor"]][bt_df[["endPoint"]] == "OT_ER_ERaERb_1440"], 3) + expect_equal(bt_df[["Nuclear Receptor"]][bt_df[["endPoint"]] == "OT_ER_ERaERb_1440"], 1) expect_true(is.na(bt_df[["Esterase"]][bt_df[["endPoint"]] == "OT_ER_ERbERb_0480"])) expect_error(endpoint_hits_DT(chemical_summary, category = "Class")) @@ -251,9 +251,9 @@ 
test_that("Table endpoint hits", { expect_type(ct, "list") expect_true(all(names(ct$x$data) %in% c( - "endPoint", "Antioxidants", "PAHs", + "endPoint", "Antioxidants", "Detergent Metabolites", "Herbicides", - "Plasticizers" + "Antimicrobial Disinfectants" ))) cht <- endpoint_hits_DT(chemical_summary, category = "Chemical") @@ -262,7 +262,7 @@ test_that("Table endpoint hits", { expect_true(all(names(cht$x$data) %in% c( "endPoint", "Bisphenol A", "Metolachlor", "4-Nonylphenol, branched", "Pyrene", - "Phenanthrene", "Atrazine" + "Triclosan", "Atrazine" ))) }) @@ -277,11 +277,12 @@ test_that("hits_by_groupings_DT", { bt_df <- hits_by_groupings(chemical_summary, category = "Chemical Class") expect_true(all(names(bt_df) %in% c( "Nuclear Receptor", "DNA Binding", "Cell Cycle", "Esterase", - "Steroid Hormone", "Channel 2", "CYP", "Transporter" + "Steroid Hormone", "Channel 2", "CYP", "Transporter", "Lyase", + "Metabolite" ))) expect_true(all(c("Detergent Metabolites", "Antioxidants", "Herbicides") %in% rownames(bt_df))) - expect_equal(bt_df[["Nuclear Receptor"]], c(8, 10, 16, 2, rep(0, 10))) + expect_equal(bt_df[["Nuclear Receptor"]], c(8, 10, 16, rep(0, 9), NA)) expect_error(hits_by_groupings_DT(chemical_summary, category = "Class")) @@ -290,8 +291,8 @@ test_that("hits_by_groupings_DT", { expect_true(all(class(ct) %in% c("datatables", "htmlwidget"))) expect_true(all(names(ct$x$data) %in% c( " ", "Nuclear Receptor", "DNA Binding", "Cell Cycle", - "Esterase", "CYP", "Steroid Hormone", - "Transporter", "Channel 2" + "Esterase", "CYP", "Steroid Hormone", "Lyase", + "Transporter", "Channel 2", "Metabolite" ))) @@ -301,7 +302,7 @@ test_that("hits_by_groupings_DT", { expect_true(all(names(cht$x$data) %in% c( " ", "Nuclear Receptor", "DNA Binding", "Cell Cycle", "CYP", "Esterase", "Steroid Hormone", - "Channel 2", "Transporter" + "Channel 2", "Transporter", "Lyase", "Metabolite" ))) }) @@ -365,7 +366,7 @@ test_that("Calculating completness", { graphData2 <- 
tox_boxplot_data(chemical_summary, "Biological") complete_df_cat <- toxEval:::get_complete_set_category(chemical_summary, graphData2, tox_list$chem_site, category = "Biological") - expect_equal(nrow(complete_df_cat), 2394) + expect_equal(nrow(complete_df_cat), 2451) }) test_that("Calculating concentrations", { @@ -402,16 +403,13 @@ test_that("Testing levels", { expect_equal(chem_levels[1:5], c( - "Cumene", - "1-Methylnaphthalene", - "Anthraquinone", - "Tetrachloroethylene", - "Isophorone" + "Anthraquinone", "Tetrachloroethylene", "Isophorone", + "1,4-Dichlorobenzene", "Bromoform" )) expect_equal(class_levels[1:5], c( "Herbicides", "Detergent Metabolites", - "Antioxidants", "PAHs", - "Antimicrobial Disinfectants" + "Antioxidants", "Antimicrobial Disinfectants", + "Fire Retardants" )) plot_eps <- plot_tox_endpoints(chemical_summary, @@ -422,11 +420,11 @@ test_that("Testing levels", { expect_equal( tail(levels(plot_eps$data$endPoint), 5), c( - "LTEA_HepaRG_GCLC_dn", - "LTEA_HepaRG_NFE2L2_dn", - "LTEA_HepaRG_GSTA2_dn", - "TOX21_PPARg_BLA_Antagonist_ch2", - "TOX21_RT_HEPG2_FLO_00hr_ctrl_viability" + "LTEA_HepaRG_EZR", + "NVS_NR_mERa", + "LTEA_HepaRG_HSPA1A", + "LTEA_HepaRG_GSTM3", + "CLD_HMGCS2_48hr" ) ) @@ -440,8 +438,9 @@ test_that("Testing levels", { levels(plot_stack$data$category), c( "Bisphenol A", "4-Nonylphenol, branched", - "Phenanthrene", "Atrazine", - "Pentachlorophenol", "Others (44)" + "Atrazine", + "Pentachlorophenol", "Metolachlor", + "Others (42)" ) ) @@ -453,8 +452,11 @@ test_that("Testing levels", { expect_equal( tail(levels(plot_heat$data$chnm), 5), c( - "Bromacil", "Metolachlor", "Atrazine", - "Prometon", "Pentachlorophenol" + "Bromacil", + "Metalaxyl", + "Metolachlor", + "Atrazine", + "Pentachlorophenol" ) ) }) diff --git a/vignettes/Chemical_names.Rmd b/vignettes/Chemical_names.Rmd index 4f308716..4feda0df 100644 --- a/vignettes/Chemical_names.Rmd +++ b/vignettes/Chemical_names.Rmd @@ -19,7 +19,7 @@ knitr::opts_chunk$set(echo = TRUE, message = 
FALSE) ``` -Up until version 1.3.1, `toxEval` only required "CAS" and "Class" in the Chemicals tab of the data (see \code{vignette("PrepareData", package = "toxEval")}). In those previous versions of `toxEval`, the chemical names were taken from the `Substance_Name` in the included data frame \code{tox_chemicals}. The information in that table (including substance name) come from the ToxCast database. +Up until version 1.3.1, `toxEval` only required "CAS" and "Class" in the Chemicals tab of the data (see `vignette("PrepareData", package = "toxEval")`). In those previous versions of `toxEval`, the chemical names were taken from the `Substance_Name` in the included data frame `tox_chemicals`. The information in that table (including substance name) comes from the ToxCast database. Many users are using `toxEval` for user-curated benchmarks workflows, and the auto-generated chemical names became difficult to work with. So going forward from version 1.3.1, the chemical names seen in tables and figures come from the "Chemical" tab. @@ -42,8 +42,8 @@ tox_chemicals <- tox_chemicals chem_info_with_names <- chem_info %>% left_join(select(tox_chemicals, - CAS = Substance_CASRN, - Chemical = Substance_Name), + CAS = casn, + Chemical = chnm), by = "CAS") head(chem_info_with_names) diff --git a/vignettes/Introduction.Rmd b/vignettes/Introduction.Rmd index 7140701a..89689bbe 100644 --- a/vignettes/Introduction.Rmd +++ b/vignettes/Introduction.Rmd @@ -11,15 +11,16 @@ vignette: > \usepackage[utf8]{inputenc} --- -# Introduction +The `toxEval` R-package includes a set of functions to analyze, visualize, and organize measured concentration data as it relates to [ToxCast data](https://www.epa.gov/comptox-tools/toxicity-forecasting-toxcast) (default) or other user-selected chemical-biological interaction benchmark data such as water quality criteria.
The intent of these analyses is to develop a better understanding of the potential biological relevance of environmental chemistry data. Results can be used to prioritize which chemicals at which sites may be of greatest concern. These methods are meant to be used as a screening technique to predict potential for biological influence from chemicals that ultimately need to be validated with direct biological assays. Full documentation of this R package including a tutorial with examples is available here: + -The `toxEval` R-package includes a set of functions to analyze, visualize, and organize measured concentration data as it relates to [ToxCast data](https://www.epa.gov/comptox-tools/toxicity-forecasting-toxcast) (default) or other user-selected chemical-biological interaction benchmark data such as water quality criteria. The intent of these analyses is to develop a better understanding of the potential biological relevance of environmental chemistry data. Results can be used to prioritize which chemicals at which sites may be of greatest concern. These methods are meant to be used as a screening technique to predict potential for biological influence from chemicals that ultimately need to be validated with direct biological assays. The functions within this package allow great flexibly for exploring the potential biological affects of measured chemicals. Also included in the package is a browser-based application made from the `Shiny` R-package (the app). The app is based on functions within the R-package and includes many convenient analyses and visualization options for users to choose. Use of the functions within the R-package allows for additional flexibility within the functions beyond what the app offers and provides options for the user to interact more directly with the data. The overview in this document focuses on the R-package. Documentation for the app is provided [here](https://doi-usgs.github.io/toxEval/articles/shinyApp.html). 
This vignette provides a general overview of the concepts within `toxEval`, definitions of common terminology used throughout the package, and links to information to help understand fundamentals of the ToxCast database used within `toxEval`. ## What is ToxCast? + The U.S. EPA's Toxicity Forecaster ToxCast includes a database of chemical-biological interactions that contains information from hundreds of assays on thousands of chemicals, providing a means to assess biological relevance to measured concentrations. The `toxEval` package attempts to simplify the workflow for exploring data as it relates to these assay endpoints (benchmark data). The workflow uses ToxCast as a default for evaluation of chemical:biological interactions, but the user may also define alternative benchmarks for a custom or more traditional approach to biological relevance evaluation. This is also a useful capability for efficient comparison of ToxCast evaluation results with those from other toxicity benchmark databases. When using the ToxCast endPoints for analysis, it is important to have at least a minimal understanding of what ToxCast data is, and which ToxCast data is relevant to any given study. There are many useful resources here. There is also a tool called the Comptox Dashboard that has a wealth of information on ToxCast data. @@ -79,7 +80,7 @@ Some functions will also include a calculation for a "hit". 
A threshold is defin ## Reporting bugs If you discover an issue that you feel is a bug in the package or have a question on functionality, please consider reporting bugs and asking questions on the Issues page: -[https://github.com/DOI-USGS/toxEval/issues](https://github.com/DOI-USGS/toxEval/issues) + ## Citing toxEval diff --git a/vignettes/OWC_custom_bench.xlsx b/vignettes/OWC_custom_bench.xlsx index 093544c7..254fd9b2 100644 Binary files a/vignettes/OWC_custom_bench.xlsx and b/vignettes/OWC_custom_bench.xlsx differ diff --git a/vignettes/PrepareData.Rmd b/vignettes/PrepareData.Rmd index 85a9caf9..7a46f129 100644 --- a/vignettes/PrepareData.Rmd +++ b/vignettes/PrepareData.Rmd @@ -5,17 +5,21 @@ output: rmarkdown::html_vignette: toc: true number_sections: false + fig_width: 5 + fig_height: 6 vignette: > - %\VignetteEngine{knitr::rmarkdown} %\VignetteIndexEntry{Preparing toxEval Data} \usepackage[utf8]{inputenc} + %\VignetteEngine{knitr::rmarkdown} +editor_options: + chunk_output_type: console --- # Introduction What kind of data could be used in a `toxEval` analysis? It was designed with concentration measurements from water samples as the primary use case. There may be other concentration measurements that could be used as well, but it is up to the researcher to determine if special considerations must be taken in those circumstances. For instance, there was a `toxEval` analysis done on the concentration of chemicals measured in eagle plasma. -For all cases within `toxEval`, a "sample" is considered a unique site/date. There are times when this might not be especially relavent to the data collection (passive samplers, groundwater samples at separate depths, etc.). The user will need to come up with strategies to deal with the limiting workflow. For example, single sites at different depths could add site suffixes (site_a_3m, site_a_6m, etc.). Passive samplers could use the start or end time as the sampling times. 
+For all cases within `toxEval`, a "sample" is considered a unique site/date. There are times when this might not be especially relevant to the data collection (passive samplers, groundwater samples at separate depths, etc.). The user will need to come up with strategies to deal with the limiting workflow. For example, single sites at different depths could add site suffixes (site_a_3m, site_a_6m, etc.). Passive samplers could use the start or end time as the sampling times. If a suffix is added in one tab of the data, it must be added to all tabs of the data. So, SiteID must match in the Data and Sites tabs. # Preparing the data @@ -51,13 +55,13 @@ knitr::include_graphics("data.png") ## Chemicals -The "Chemicals" sheet is used to define the unique chemicals included in the "Data" sheet (so, 1 row per unique chemical). Two columns are required in this sheet: "CAS" and "Class". The columns can be in any order, but the first row of the sheet must be the header (column names). If you need chemical names that do not match up wiht the "tox_chemical" list provided in the package, you will want to include a 3rd column "Chemical" which is the chemical name to use for plots and tables. +The "Chemicals" sheet is used to define the unique chemicals included in the "Data" sheet (so, 1 row per unique chemical). Two columns are required in this sheet: "CAS" and "Class". The columns can be in any order, but the first row of the sheet must be the header (column names). If you need chemical names that do not match up with the "tox_chemical" list provided in the package, you will want to include a 3rd column "Chemical" which is the chemical name to use for plots and tables. * CAS: A character column defining the chemicals via their Chemical Abstracts Service (CAS) registry. In the Excel file, pay special attention that no CAS are converted to a Date format. 
Highlight the column, right-click on the mouse, choose "Format Cells", and choose "Text" as the category to assure they retain their format. The unique CAS values in this column *must* match with the CAS values in the "Data" sheet. * Class: A character column defining the class of chemicals. Most `toxEval` functions will allow groupings by either chemical, class (as defined here), or biological grouping. In the example data set provided with the package, the chemicals were organized in classes such as "Fuels", "Herbicides", "Insecticides", etc. The choice of classes is up to the researcher. Chemical class analysis can be ignored, and in that case, it is just important to put a single repeating entry in the "Class" column. -* Chemical: A character column defining the name of the chemical. These are the chemical names used for most figures and plots. +* Chemical: A character column defining the name of the chemical. These are the chemical names used for most figures and plots. **This is a new requirement as of toxEval version 1.3.1.** Note: Additional columns may be useful to organize the data. These additional columns will be ignored by `toxEval` and will not influence a `toxEval` analysis. @@ -83,7 +87,46 @@ At times, it may be appropriate to exclude endpoints, chemicals, or specific end The "Exclude" sheet is optional, but if used, two columns are required: "CAS" and "endPoint". They can be in any order, but the first row of the sheet should be the header (column names). -Why would you choose to exclude a chemical/endpoint value? There are times that the dose-response curves from ToxCast may not trigger any automated flags, but upon inspection, the curves seem suspect. The easiest way to view the dose response curves is from the [Comptox](https://comptox.epa.gov/dashboard) dashboard. The function `endpoint_hits_DT` includes an option to get a direct link to find the dose-response curves if the category is "Chemical". 
This is handy to do quick checks on the endpoint/chemical combinations that produce the highest EARs. If the highest EAR values have dose-response curves that seem suspect, consider adding those to the "Exclude" tab, or at least trying to get more information on that endpoint/assay. +Why would you choose to exclude a chemical/endpoint value? There are times that the dose-response curves from ToxCast may not trigger any automated flags, but upon inspection, the curves seem suspect (NOTE: as of ToxCast database 4.1, there are more flags available and excluding based on flag does appear to capture most cases that should be excluded). + +The easiest way to view the dose response curves is from the [Comptox](https://comptox.epa.gov/dashboard) dashboard. The function `endpoint_hits_DT` includes an option to get a direct link to find the dose-response curves if the category is "Chemical". This is handy to do quick checks on the endpoint/chemical combinations that produce the highest EARs. If the highest EAR values have dose-response curves that seem suspect, consider adding those to the "Exclude" tab, or at least trying to get more information on that endpoint/assay. + +Another way to quickly check which endpoints to check on Comptox is to plot the endpoints for a single chemical using the ACC values provided in `toxEval`. 
The LOWER the endpoint's ACC value, the higher the EAR will be, and therefore those are the endpoints you might want to check carefully: + +```{r} +library(toxEval) +library(ggplot2) + +CAS <- c("1912-24-9") +default_id = c(5, 6, 11, 15, 18) + +ACC_atrazine <- get_ACC(CAS) |> + dplyr::arrange(ACC_value) |> + dplyr::mutate(index = dplyr::row_number(), + percent = 100*index/dplyr::n(), + has_default_flag = colSums(sapply(flags, "%in%", x = default_id)) > 0) + +ggplot() + + geom_point(data = ACC_atrazine, + aes(x = ACC_value, y = percent, + color = has_default_flag)) + + xlab("ACC ug/L") + + ylab("Percentile") + + ggtitle("Individual ACC values for Atrazine") + + geom_text(data = head(ACC_atrazine |> + dplyr::filter(!has_default_flag), + n = 5), + aes(y = percent, x = ACC_value, + label = endPoint, + color = has_default_flag), + size = 2, hjust = -0.2, show.legend = FALSE) + + theme_bw() + + scale_color_manual("Includes default\nflags for removal", + values = c("blue", "grey80")) + +``` + +Figure 1: All ACC values in ug/L of atrazine from ToxCast. The values that would be flagged by default are in grey, the values that would remain are in blue. If there were any outliers that do not have default flags and have very low ACC values, those dose-response curves should be verified. * endPoint: A character column to define a specific endpoint to ignore in the analysis. If the "CAS" in the corresponding row is empty, the endPoint will be completely excluded from the `toxEval` analysis. For example, if it was decided that TOX21_p53_BLA_p3_ratio was not an appropriate endPoint to consider for a given analysis, it can be excluded by adding it to the Exclude sheet in the endPoint column. If the "CAS" in the corresponding row is NOT empty, only the specific chemical/endPoint combination will be excluded from the analysis. 
@@ -109,9 +152,10 @@ The "Benchmarks" sheet is optional, but if used, five columns are required: "CAS Note: Additional columns may be useful to organize the data. These additional columns will be ignored by `toxEval` and will not influence a `toxEval` analysis. +## Summary + +To summarize, there are 3 mandatory sheets (Data, Chemicals, Sites) and 2 optional sheets (Exclude, Benchmarks). SiteID and CAS columns MUST match between each sheet. -## Disclaimer -This software has been approved for release by the U.S. Geological Survey (USGS). Although the software has been subjected to rigorous review, the USGS reserves the right to update the software as needed pursuant to further analysis and review. No warranty, expressed or implied, is made by the USGS or the U.S. Government as to the functionality of the software and related material nor shall the fact of release constitute any such warranty. Furthermore, the software is released on condition that neither the USGS nor the U.S. Government shall be held liable for any damages resulting from its authorized or unauthorized use. Any use of trade, firm, or product names is for descriptive purposes only and does not imply endorsement by the U.S. Government. \ No newline at end of file diff --git a/vignettes/basicWorkflow.Rmd b/vignettes/basicWorkflow.Rmd index 16a88d25..e3703bb3 100644 --- a/vignettes/basicWorkflow.Rmd +++ b/vignettes/basicWorkflow.Rmd @@ -74,22 +74,14 @@ The decision to use ACC values compared to AC50 (concentration at half-maximal a ## remove_flags -Through the ToxCast program quality assurance procedures, information is examined and at times, it is necessary to assign a data quality flag to a specific chemical:assay result. A `toxEval` user may want to include or exclude assay results with certain flags depending on the objectives of a given study. Assay results with specific data quality flags assigned to them can be removed based on their designated flag with the `remove_flags` function. 
The flags included in ToxCast, and the associated `flagsShort` value (used in the `remove_flags` function) are as follows: - -| Flags | flagsShort | -|----------|------:| -| Borderline active | Borderline | -| Only highest conc above baseline, active | OnlyHighest | -| Only one conc above baseline, active | OneAbove | -| Noisy data | Noisy | -| Hit-call potentially confounded by overfitting | HitCall | -| Gain AC50 < lowest conc & loss AC50 < mean conc | GainAC50 | -| Biochemical assay with < 50% efficacy | Biochemical | -| Less than 50% efficacy | LessThan50 | -| AC50 less than lowest concentration tested | ACCLessThan | -| Cell viability assay fit with gnls winning model | GNLSmodel | - -The function is written to exclude results with the flags "Borderline", "OnlyHighest", "GainAC50", "Biochemical" as a default. If the user prefers a different list, the full list must be specified (default values will be ignored). +Through the ToxCast program quality assurance procedures, information is examined and at times, it is necessary to assign a data quality flag to a specific chemical:assay result. A `toxEval` user may want to include or exclude assay results with certain flags depending on the objectives of a given study. Assay results with specific data quality flags assigned to them can be removed based on their designated flag with the `remove_flags` function. The flags included in ToxCast, and the associated `flag_id` value (used in the `remove_flags` function) are as follows: + +```{r echo=FALSE} +df <- flags + +knitr::kable(df) + +``` This function is specific to the provided data from the ToxCast database (and therefore not customizable to other toxicity data sets). The flags listed above are the only options within that set of data. Some endpoint/chemical combinations have multiple flags listed. @@ -122,8 +114,7 @@ By default, the BioSeek set of assays are removed. 
The list of assays and their end_point_info <- end_point_info df <- end_point_info %>% - select(`Assay Name` = assay_source_long_name, - `Short Name` = assay_source_name) %>% + select(`Assay Name` = assay_source_name) %>% distinct() knitr::kable(df) @@ -144,9 +135,7 @@ cleaned_ep <- clean_endPoint_info(end_point_info) filtered_ep <- filter_groups(cleaned_ep, groupCol = "intended_target_family", - assays = c("ATG","NVS", "OT", "TOX21", - "CEETOX", "APR", "CLD", "TANGUAY", - "NHEERL_PADILLA","NCCT_SIMMONS", "ACEA"), + remove_assays = c("BSK"), remove_groups = c("Background Measurement", "Undefined")) ``` @@ -601,6 +590,3 @@ plot_tox_heatmap(summary_with_levels, tox_list$chem_site, "Biological") ``` -## Disclaimer - -This software has been approved for release by the U.S. Geological Survey (USGS). Although the software has been subjected to rigorous review, the USGS reserves the right to update the software as needed pursuant to further analysis and review. No warranty, expressed or implied, is made by the USGS or the U.S. Government as to the functionality of the software and related material nor shall the fact of release constitute any such warranty. Furthermore, the software is released on condition that neither the USGS nor the U.S. Government shall be held liable for any damages resulting from its authorized or unauthorized use. 
diff --git a/vignettes/benchmarks.Rmd b/vignettes/benchmarks.Rmd index 73b452be..69171746 100644 --- a/vignettes/benchmarks.Rmd +++ b/vignettes/benchmarks.Rmd @@ -118,8 +118,8 @@ bench <- raw_benchmarks %>% rename(Value = value) %>% separate(source, c("groupCol", "endPoint"), sep = "_") %>% left_join(select(tox_chemicals, - CAS = Substance_CASRN, - Chemical = Substance_Name), by = "CAS") + CAS = casn, + Chemical = chnm), by = "CAS") bench$Chemical[is.na(bench$Chemical)] <- bench$chm_nm[is.na(bench$Chemical)] @@ -389,8 +389,8 @@ bench <- raw_benchmarks %>% rename(Value = value) %>% separate(source, c("groupCol", "endPoint"), sep = "_") %>% left_join(select(tox_chemicals, - CAS = Substance_CASRN, - Chemical = Substance_Name), by = "CAS") + CAS = casn, + Chemical = chnm), by = "CAS") bench$Chemical[is.na(bench$Chemical)] <- bench$chm_nm[is.na(bench$Chemical)] diff --git a/vignettes/flags.png b/vignettes/flags.png index fa29509c..e370bce4 100644 Binary files a/vignettes/flags.png and b/vignettes/flags.png differ diff --git a/vignettes/update_data.Rmd b/vignettes/update_data.Rmd new file mode 100644 index 00000000..51d71667 --- /dev/null +++ b/vignettes/update_data.Rmd @@ -0,0 +1,209 @@ +--- +title: "Setting up toxEval package data" +output: + rmarkdown::html_vignette: + toc: true + number_sections: false +vignette: > + %\VignetteIndexEntry{Setting up toxEval package data} + \usepackage[utf8]{inputenc} + %\VignetteEngine{knitr::rmarkdown} +editor_options: + chunk_output_type: console +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE, + eval = FALSE, + warning = FALSE, + message = FALSE) +library(tcpl) +``` + +`toxEval` includes ToxCast data to help simplify EAR calculations. Here are the instructions for getting the necessary columns from the ToxCast database using the `tcpl` package. This is NOT necessary for all `toxEval` users. However, if you are interested in extending the data or exploring additional fields, this might be useful. 
It is also a way to remember how to update the `toxEval` package when there are new ToxCast database updates. + +First you would need to download the database and set up a MySQL server/database. This is beyond the scope of this vignette right now. + +Next, you need to connect to that local database: + +```{r} +library(tcpl) +library(toxEval) +library(tidyverse) + + +tcplConf(drvr = "MySQL", + user = "root", + pass = "my_super_secret_password", + host = "localhost", + db = "prod_internal_invitrodb_v4_1") + + +``` + +## ToxCast_ACC + +First, we'll need to load several tables to get our necessary columns: + +```{r} +mc5_raw <- tcplQuery("SELECT * FROM mc5") +mc4_raw <- tcplQuery("SELECT * FROM mc4") +mc5_chid <- tcplQuery("SELECT * FROM mc5_chid") +mc6_raw <- tcplQuery("SELECT * FROM mc6") +sample_raw <- tcplQuery("SELECT * FROM sample") +chemical_raw <- tcplQuery("SELECT * FROM chemical") +mc5_param <- tcplQuery("SELECT * FROM mc5_param") +``` + +Next, we'll use `dplyr` to join: + +```{r eval=FALSE} +ToxCast_ACC <- mc5_raw |> + filter(hitc >= 0.9) |> + select(-created_date, -modified_date, -modified_by) |> + left_join(mc4_raw |> + select(-created_date, -modified_date, -modified_by), + by = join_by(m4id, aeid)) |> + left_join(mc5_chid, by = join_by(m5id)) |> + filter(chid_rep == 1) |> + left_join(mc6_raw |> + select(-created_date, -modified_date, -modified_by), + by = join_by(m5id, m4id, aeid)) |> + left_join(sample_raw, + by = join_by(spid)) |> + left_join(chemical_raw, + by = join_by(chid)) |> + left_join(mc5_param |> + filter(hit_param == "acc"), + by = join_by(m5id, aeid)) |> + select(casn, hit_val, aeid, mc6_mthd_id) |> + group_by(casn, hit_val, aeid) |> # 1-to-many flags + summarise(flags = list(mc6_mthd_id)) |> + ungroup() + +names(ToxCast_ACC) + +``` + +``` +[1] "casn" "hit_val" "aeid" "flags" +``` + +## end_point_info + +Next, we'll populate the `end_point_info` file: + +```{r} + +assay_component_endpoint <- tcplQuery("SELECT * FROM 
assay_component_endpoint;") +assay <- tcplQuery("SELECT * FROM assay;") +assay_component <- tcplQuery("SELECT * FROM assay_component;") +assay_source <- tcplQuery("SELECT * FROM assay_source;") +gene <- tcplQuery("SELECT * FROM gene;") +intended_target <- tcplQuery("SELECT * FROM intended_target;") + +end_point_info_41 <- assay_component_endpoint |> + left_join(assay_component, by = join_by(acid)) |> + left_join(assay, by = join_by(aid)) |> + left_join(assay_source, by = join_by(asid)) |> + left_join(intended_target, by = join_by(aeid)) |> + left_join(gene, by = c("target_id" = "gene_id")) |> + filter(!is.na(aeid)) |> + select(aeid, acid, assay_component_endpoint_name, + assay_component_endpoint_desc, intended_target_type, + intended_target_family_sub, intended_target_family, + biological_process_target, tissue, + gene_symbol, assay_source_name) |> + group_by(across(c(-gene_symbol))) |> # 1-to-many genes + summarise(gene_symbol = paste(gene_symbol, collapse = ", ")) |> + ungroup() + +``` + +## tox_chemicals + +```{r} +tox_chemicals41 <- mc5_raw |> + select(-created_date, -modified_date, -modified_by) |> + left_join(mc4_raw |> + select(-created_date, -modified_date, -modified_by), + by = join_by(m4id, aeid)) |> + left_join(mc5_chid, by = join_by(m5id)) |> + filter(chid_rep == 1) |> + left_join(mc6_raw |> + select(-created_date, -modified_date, -modified_by), + by = join_by(m5id, m4id, aeid)) |> + left_join(sample_raw, + by = join_by(spid)) |> + left_join(chemical_raw, + by = join_by(chid)) |> + left_join(mc5_param |> + filter(hit_param == "acc"), + by = join_by(m5id, aeid)) |> + group_by(casn, chnm, dsstox_substance_id) |> + summarise(Total_tested = length(unique(aeid)), + Active = length(unique(aeid[hitc >= 0.9]))) |> + ungroup() |> + left_join(tox_chemicals_35 |> + select(casn = Substance_CASRN, + Structure_MolWt), + by = "casn") +``` + +We need molecular weights to convert the ACC values to concentrations. 
We can either join previous versions of the toxEval package, and/or get new values from the CompTox Dashboard via the batch search: + + + +To create a list of chemicals to input to the dashboard: + +```{r} +need_mlwt <- tox_chemicals41$dsstox_substance_id[is.na(tox_chemicals41$Structure_MolWt)] +#Bring this to Comptox: +data.table::fwrite(data.frame(need_mlwt), "need_mlwt.csv") +``` + + +```{r} +# Import the file from Comptox: +more_mlwts <- data.table::fread("CCD-Batch-Search.csv", + data.table = FALSE) |> + mutate(new_Structure_MolWt = as.numeric(AVERAGE_MASS)) |> + select(casn = CASRN, + dsstox_substance_id = DTXSID, + new_Structure_MolWt) + +tox_chemicals41 <- tox_chemicals41 |> + left_join(more_mlwts, + by = c("casn", "dsstox_substance_id")) |> + mutate(Structure_MolWt = if_else(is.na(Structure_MolWt), + new_Structure_MolWt, + Structure_MolWt)) |> + select(-new_Structure_MolWt) +``` + +## Flags + +A small table to describe ToxCast flags is also included: + +```{r} +flags <- mc6_raw |> + select(flag_id = mc6_mthd_id, + flag_full = flag) |> + distinct() + +``` + + +## sysdata.rda + +Finally we can save that data in the package: + +```{r} +ToxCast_ACC <- ToxCast_ACC_41 +tox_chemicals <- tox_chemicals41 +end_point_info <- end_point_info_41_rel + +save(ToxCast_ACC, tox_chemicals, end_point_info, flags, + file = "sysdata.rda", compress = "xz") + +```