From 1b04a24b4bface232ac0b05264a3883f0e843d5b Mon Sep 17 00:00:00 2001 From: Sebastian Canzler Date: Fri, 24 Mar 2023 10:34:23 +0100 Subject: [PATCH 01/15] Update vignette with custom gene set section. --- vignettes/multiGSEA.rmd | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/vignettes/multiGSEA.rmd b/vignettes/multiGSEA.rmd index 887e1f5..9af0954 100644 --- a/vignettes/multiGSEA.rmd +++ b/vignettes/multiGSEA.rmd @@ -356,6 +356,34 @@ knitr::kable( ) ``` +# Customizable gene sets + +In principle, `multiGSEA` can also be run as single/multi omics analysis on +custom gene sets. + +The `pathways` object storing the pathway features across multiple omics layers +is a nested list, and hence can be designed manually to fit ones purposes. + +In the following example, HALLMARK gene sets are retrieved from `MSigDB` and +used to create a transcriptomics input list: + +```{r msigdbr, eval=FALSE} + +library(msigdbr) +library(dplyr) + +hallmark <- msigdbr(species = "Rattus norvegicus", category = "H") %>% + dplyr::select( gs_name, ensembl_gene) %>% + dplyr::filter( !is.na(ensembl_gene)) %>% + unstack( ensembl_gene ~ gs_name) + +pathways <- list( "transcriptome" = hallmark) + +``` + +**Please note**, feature sets across multiple omics layers have to be in the +same order and their names have to be identical, see the example presented above. + # Session Information From 038391e012efe73d7013f4da31671544037c4013 Mon Sep 17 00:00:00 2001 From: Sebastian Canzler Date: Fri, 24 Mar 2023 10:46:22 +0100 Subject: [PATCH 02/15] Version bump in devel. --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 7af6b9b..818301a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: multiGSEA Type: Package Title: Combining GSEA-based pathway enrichment with multi omics data integration -Version: 1.9.1 +Version: 1.9.2 Date: 2020-03-05 Authors@R: c( person( "Sebastian", "Canzler", email = "sebastian.canzler@ufz.de", role = c("aut", "cre"), comment = c(ORCID = "0000-0001-7935-9582")), From f56922b10dfc5125dc82a553cf4ef3d0920499cf Mon Sep 17 00:00:00 2001 From: Sebastian Canzler Date: Fri, 24 Mar 2023 20:37:45 +0100 Subject: [PATCH 03/15] Bug fix: include CAS and HDMB mapping in metabolites ID mapping --- R/feature_processing.R | 63 ++++++++++++++++++++++++++++++------------ 1 file changed, 46 insertions(+), 17 deletions(-) diff --git a/R/feature_processing.R b/R/feature_processing.R index 06fe7aa..2fa8efa 100644 --- a/R/feature_processing.R +++ b/R/feature_processing.R @@ -51,7 +51,7 @@ getFeatures <- function(pathway, which = "proteins", org = "hsapiens", returntyp } ## extract the features (genes/proteins or metabolites) from the pathway nodes. - features <- nodes(pathway, which = which) + features <- graphite::nodes(pathway, which = which) if (length(features) == 0) { return(list()) @@ -88,7 +88,17 @@ getFeatures <- function(pathway, which = "proteins", org = "hsapiens", returntyp maptype = "CID", returntype = returntype ) - mapped <- c(chebi, kegg, pubchem) + cas <- mapIDType( + features = features, keytype = "CAS", + maptype = "CAS", returntype = returntype + ) + + hmdb <- mapIDType( + features = features, keytype = "HMDB", + maptype = "HMDB", returntype = returntype + ) + + mapped <- c(chebi, kegg, pubchem, cas, hmdb) } return(mapped) @@ -412,7 +422,7 @@ getMultiOmicsFeatures <- function(dbs = c("all"), layer = c("all"), } pathways <- lapply(dbs, function(x) { - pathways(organism, x) + graphite::pathways(organism, x) }) names(pathways) <- dbs @@ -513,29 +523,20 @@ getMappedFeatures <- function(pathways, returnID = "SYMBOL", organism = "hsapien -#' Helper function to map only a subset of metabolite IDs -#' -#' This helper function becomes necessary since there are sometimes multiple ID -#' formats used in a single pathway definition. -#' -#' @param features List of metabolite feature IDs of the pathway. -#' @param keytype String specifying the ID format in pathway definition. -#' @param maptype String specifying the corresponding ID format in multiGSEA. -#' @param returntype String identifying the ID type that should be mapped. -#' -#' @return List of mapped metabolite IDs. + mapIDType <- function(features, keytype = "CHEBI", maptype = "ChEBI", returntype = "HMDB") { mapped <- c() ids <- gsub(paste0(keytype, ":"), "", features[grep(keytype, features)]) + if (returntype != maptype) { - mapped <- c(mapped, getMetaboliteMapping( + mapped <- getMetaboliteMapping( features = ids, keytype = maptype, returntype = returntype - )) + ) } else { - mapped <- c(mapped, ids) + mapped <- ids } return(mapped) @@ -667,3 +668,31 @@ initOmicsDataStructure <- function(layer = c("transcriptome", "proteome", "metab return(empty_structure) } + + + +#' Helper function to get all different metabolite ID formats +#' +#' This helper function extracts all used ID formats in all pathways +#' and returns a nested list for each pathway database. +#' +#' @param pathways List of pathway databases and their pathway definition. +#' +#' @return List of metabolite ID formats. +getMetaboliteIDformats <- function(pathways){ + + n1 <- lapply(names(pathways), function( dbs){ + + n2 <- lapply(pathways[[dbs]], function(p){ + + unique(gsub( ":.*", "", graphite::nodes(p, which = "metabolites"))) + + }) + + unique(unlist(n2)) + + }) + + names(n1) <- names(pathways) + return(n1) +} From 51a2c43c07b30ab9d5f82fb98a159015249b8aa1 Mon Sep 17 00:00:00 2001 From: Sebastian Canzler Date: Fri, 24 Mar 2023 20:48:40 +0100 Subject: [PATCH 04/15] Call imported function in the format package::function() --- R/enrichment_functions.R | 2 +- R/feature_processing.R | 6 +++--- R/utility_functions.R | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/R/enrichment_functions.R b/R/enrichment_functions.R index 525da88..892fb6c 100644 --- a/R/enrichment_functions.R +++ b/R/enrichment_functions.R @@ -38,7 +38,7 @@ multiGSEA <- function(pathways, ranks) { # Go through all omics layer. es <- lapply(names(pathways), function(omics) { - fgseaMultilevel(pathways[[omics]], ranks[[omics]], eps = 0) + fgsea::fgseaMultilevel(pathways[[omics]], ranks[[omics]], eps = 0) }) names(es) <- names(pathways) diff --git a/R/feature_processing.R b/R/feature_processing.R index 2fa8efa..8ed0abe 100644 --- a/R/feature_processing.R +++ b/R/feature_processing.R @@ -402,10 +402,10 @@ getMultiOmicsFeatures <- function(dbs = c("all"), layer = c("all"), ) } - pDBs <- pathwayDatabases() + pDBs <- graphite::pathwayDatabases() dbs0 <- pDBs %>% - filter(.data$species == organism) %>% - pull(.data$database) + dplyr::filter(.data$species == organism) %>% + dplyr::pull(.data$database) databases <- c("all", as.vector(dbs0)) if (sum(tolower(dbs) %in% databases) != length(dbs)) { diff --git a/R/utility_functions.R b/R/utility_functions.R index e7ebc99..1c3c979 100644 --- a/R/utility_functions.R +++ b/R/utility_functions.R @@ -29,7 +29,7 @@ archivePath <- function(filename) { #' @importFrom rappdirs user_cache_dir archiveDir <- function() { - ad <- user_cache_dir("multiGSEA") + ad <- rappdirs::user_cache_dir("multiGSEA") if (!file.exists(ad)) { if (!dir.create(ad, FALSE, TRUE)) { From e02c483b05d27b261dafaab86c9de5e206b798fc Mon Sep 17 00:00:00 2001 From: Sebastian Canzler Date: Mon, 27 Mar 2023 08:33:15 +0200 Subject: [PATCH 05/15] Version bump in devel. --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 818301a..742ff11 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: multiGSEA Type: Package Title: Combining GSEA-based pathway enrichment with multi omics data integration -Version: 1.9.2 +Version: 1.9.3 Date: 2020-03-05 Authors@R: c( person( "Sebastian", "Canzler", email = "sebastian.canzler@ufz.de", role = c("aut", "cre"), comment = c(ORCID = "0000-0001-7935-9582")), From 3311d3e425d0510618249cb9c81aa8d2a1ace15c Mon Sep 17 00:00:00 2001 From: Sebastian Canzler Date: Tue, 28 Mar 2023 15:14:27 +0200 Subject: [PATCH 06/15] Bug fix: Add eps as parameter for multiGSEA function. --- DESCRIPTION | 2 +- NAMESPACE | 2 ++ R/enrichment_functions.R | 5 +++-- man/getMetaboliteIDformats.Rd | 18 ++++++++++++++++++ man/multiGSEA.Rd | 4 +++- 5 files changed, 27 insertions(+), 4 deletions(-) create mode 100644 man/getMetaboliteIDformats.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 742ff11..e782240 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -27,7 +27,7 @@ URL: https://github.com/yigbt/multiGSEA BugReports: https://github.com/yigbt/multiGSEA/issues Encoding: UTF-8 NeedsCompilation: no -RoxygenNote: 7.1.1 +RoxygenNote: 7.2.0 Suggests: org.Hs.eg.db, org.Mm.eg.db, diff --git a/NAMESPACE b/NAMESPACE index 6d6747a..bf914ec 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -15,11 +15,13 @@ importFrom(AnnotationDbi,select) importFrom(dplyr,distinct) importFrom(dplyr,filter) importFrom(dplyr,pull) +importFrom(dplyr,select) importFrom(fgsea,fgseaMultilevel) importFrom(graphite,nodes) importFrom(graphite,pathwayDatabases) importFrom(graphite,pathways) importFrom(magrittr,"%>%") +importFrom(metaboliteIDmapping,metabolitesMapping) importFrom(metap,sumlog) importFrom(metap,sump) importFrom(metap,sumz) diff --git a/R/enrichment_functions.R b/R/enrichment_functions.R index 892fb6c..f371211 100644 --- a/R/enrichment_functions.R +++ b/R/enrichment_functions.R @@ -11,6 +11,7 @@ #' respective omics layer. #' @param ranks Nested list containing the measured and pre-ranked features for #' each omics layer. +#' @param eps This parameter sets the boundary for calculating the p value. #' #' @return Nested list containing the enrichment scores for each given pathway #' and omics layer. @@ -34,11 +35,11 @@ #' @importFrom fgsea fgseaMultilevel #' #' @export -multiGSEA <- function(pathways, ranks) { +multiGSEA <- function(pathways, ranks, eps = 0) { # Go through all omics layer. es <- lapply(names(pathways), function(omics) { - fgsea::fgseaMultilevel(pathways[[omics]], ranks[[omics]], eps = 0) + fgsea::fgseaMultilevel(pathways[[omics]], ranks[[omics]], eps = eps) }) names(es) <- names(pathways) diff --git a/man/getMetaboliteIDformats.Rd b/man/getMetaboliteIDformats.Rd new file mode 100644 index 0000000..646f594 --- /dev/null +++ b/man/getMetaboliteIDformats.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/feature_processing.R +\name{getMetaboliteIDformats} +\alias{getMetaboliteIDformats} +\title{Helper function to get all different metabolite ID formats} +\usage{ +getMetaboliteIDformats(pathways) +} +\arguments{ +\item{pathways}{List of pathway databases and their pathway definition.} +} +\value{ +List of metabolite ID formats. +} +\description{ +This helper function extracts all used ID formats in all pathways +and returns a nested list for each pathway database. +} diff --git a/man/multiGSEA.Rd b/man/multiGSEA.Rd index c13b28d..720ba54 100644 --- a/man/multiGSEA.Rd +++ b/man/multiGSEA.Rd @@ -4,7 +4,7 @@ \alias{multiGSEA} \title{Calculate pathway enrichment for multiple omics layer.} \usage{ -multiGSEA(pathways, ranks) +multiGSEA(pathways, ranks, eps = 0) } \arguments{ \item{pathways}{Nested list containing all pathway features for the @@ -12,6 +12,8 @@ respective omics layer.} \item{ranks}{Nested list containing the measured and pre-ranked features for each omics layer.} + +\item{eps}{This parameter sets the boundary for calculating the p value.} } \value{ Nested list containing the enrichment scores for each given pathway From b8c714161808eb726495c9c94fd0166adcbf06dc Mon Sep 17 00:00:00 2001 From: Sebastian Canzler Date: Tue, 28 Mar 2023 15:15:17 +0200 Subject: [PATCH 07/15] Version bump in devel. --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index e782240..c1e8e92 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: multiGSEA Type: Package Title: Combining GSEA-based pathway enrichment with multi omics data integration -Version: 1.9.3 +Version: 1.9.4 Date: 2020-03-05 Authors@R: c( person( "Sebastian", "Canzler", email = "sebastian.canzler@ufz.de", role = c("aut", "cre"), comment = c(ORCID = "0000-0001-7935-9582")), From 21a21b25cdfdcb1f4bc16add9a25b445cfdf777c Mon Sep 17 00:00:00 2001 From: Sebastian Canzler Date: Thu, 30 Mar 2023 09:00:05 +0200 Subject: [PATCH 08/15] Add documentation to mapIDType. Resolve import warning on select functions. --- NAMESPACE | 1 - R/feature_processing.R | 14 ++++++++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index bf914ec..06c8659 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -15,7 +15,6 @@ importFrom(AnnotationDbi,select) importFrom(dplyr,distinct) importFrom(dplyr,filter) importFrom(dplyr,pull) -importFrom(dplyr,select) importFrom(fgsea,fgseaMultilevel) importFrom(graphite,nodes) importFrom(graphite,pathwayDatabases) diff --git a/R/feature_processing.R b/R/feature_processing.R index 8ed0abe..f4baa5e 100644 --- a/R/feature_processing.R +++ b/R/feature_processing.R @@ -232,7 +232,7 @@ getGeneMapping <- function(features, keytype, org = "hsapiens", returntype = "SY #' #' getMetaboliteMapping(features, keytype = "KEGG", returntype = "CID") #' -#' @importFrom dplyr pull select filter distinct +#' @importFrom dplyr pull filter distinct #' @importFrom metaboliteIDmapping metabolitesMapping #' @importFrom magrittr %>% #' @@ -523,7 +523,17 @@ getMappedFeatures <- function(pathways, returnID = "SYMBOL", organism = "hsapien - +#' Helper function to map only a subset of metabolite IDs +#' +#' This helper function becomes necessary since there are sometimes multiple ID +#' formats used in a single pathway definition. +#' +#' @param features List of metabolite feature IDs of the pathway. +#' @param keytype String specifying the ID format in pathway definition. +#' @param maptype String specifying the corresponding ID format in multiGSEA. +#' @param returntype String identifying the ID type that should be mapped. +#' +#' @return List of mapped metabolite IDs. mapIDType <- function(features, keytype = "CHEBI", maptype = "ChEBI", returntype = "HMDB") { mapped <- c() From 41c8d4e7c2a5016ffb88a38ca6fefb803e35bef1 Mon Sep 17 00:00:00 2001 From: Sebastian Canzler Date: Thu, 30 Mar 2023 09:00:59 +0200 Subject: [PATCH 09/15] Version bump in devel. --- DESCRIPTION | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index c1e8e92..14beb1a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: multiGSEA Type: Package Title: Combining GSEA-based pathway enrichment with multi omics data integration -Version: 1.9.4 +Version: 1.9.5 Date: 2020-03-05 Authors@R: c( person( "Sebastian", "Canzler", email = "sebastian.canzler@ufz.de", role = c("aut", "cre"), comment = c(ORCID = "0000-0001-7935-9582")), @@ -15,6 +15,7 @@ Imports: magrittr, graphite, AnnotationDbi, + metaboliteIDmapping, dplyr, fgsea, metap, @@ -40,7 +41,6 @@ Suggests: org.Gg.eg.db, org.Xl.eg.db, org.Cf.eg.db, - metaboliteIDmapping, knitr, rmarkdown, BiocStyle, From 8e761e85b229ec98ea1c99691674f6b0b14ddf38 Mon Sep 17 00:00:00 2001 From: J Wokaty Date: Tue, 25 Apr 2023 11:19:42 -0400 Subject: [PATCH 10/15] bump x.y.z version to even y prior to creation of RELEASE_3_17 branch --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 14beb1a..956d6e3 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: multiGSEA Type: Package Title: Combining GSEA-based pathway enrichment with multi omics data integration -Version: 1.9.5 +Version: 1.10.0 Date: 2020-03-05 Authors@R: c( person( "Sebastian", "Canzler", email = "sebastian.canzler@ufz.de", role = c("aut", "cre"), comment = c(ORCID = "0000-0001-7935-9582")), From e2246dfc3dbf1c33ac44eedab285d8dc5cce4be6 Mon Sep 17 00:00:00 2001 From: J Wokaty Date: Tue, 25 Apr 2023 11:19:42 -0400 Subject: [PATCH 11/15] bump x.y.z version to odd y following creation of RELEASE_3_17 branch --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 956d6e3..269f693 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: multiGSEA Type: Package Title: Combining GSEA-based pathway enrichment with multi omics data integration -Version: 1.10.0 +Version: 1.11.0 Date: 2020-03-05 Authors@R: c( person( "Sebastian", "Canzler", email = "sebastian.canzler@ufz.de", role = c("aut", "cre"), comment = c(ORCID = "0000-0001-7935-9582")), From a742030aed65bfee53ddfd77d1e7f1be49bf4688 Mon Sep 17 00:00:00 2001 From: Sebastian Canzler Date: Thu, 27 Apr 2023 09:04:39 +0200 Subject: [PATCH 12/15] Apply styler to multiGSEA package. --- R/enrichment_functions.R | 139 +++-- R/feature_processing.R | 628 +++++++++++------------ R/utility_functions.R | 78 ++- man/extractPvalues.Rd | 4 +- man/getFeatures.Rd | 2 +- man/getGeneMapping.Rd | 6 +- man/getMetaboliteIDformats.Rd | 2 +- man/getMultiOmicsFeatures.Rd | 20 +- man/loadLocal.Rd | 2 +- man/rankFeatures.Rd | 1 + man/rename_duplicates.Rd | 4 +- tests/testthat/test_combinePvalues.R | 56 +- tests/testthat/test_pathway_enrichment.R | 84 +-- tests/testthat/test_pathway_features.R | 52 +- vignettes/multiGSEA.rmd | 183 +++---- 15 files changed, 613 insertions(+), 648 deletions(-) diff --git a/R/enrichment_functions.R b/R/enrichment_functions.R index f371211..2eb55bf 100644 --- a/R/enrichment_functions.R +++ b/R/enrichment_functions.R @@ -36,16 +36,14 @@ #' #' @export multiGSEA <- function(pathways, ranks, eps = 0) { + # Go through all omics layer. + es <- lapply(names(pathways), function(omics) { + fgsea::fgseaMultilevel(pathways[[omics]], ranks[[omics]], eps = eps) + }) - # Go through all omics layer. - es <- lapply(names(pathways), function(omics) { - fgsea::fgseaMultilevel(pathways[[omics]], ranks[[omics]], eps = eps) - }) + names(es) <- names(pathways) - names(es) <- names(pathways) - - return(es) - + return(es) } @@ -80,31 +78,29 @@ multiGSEA <- function(pathways, ranks, eps = 0) { #' es <- multiGSEA(pathways, ranks) #' #' extractPvalues( -#' enrichmentScores = es, -#' pathwayNames = names(pathways[[1]]) +#' enrichmentScores = es, +#' pathwayNames = names(pathways[[1]]) #' ) #' @export extractPvalues <- function(enrichmentScores, pathwayNames) { - - # Go through all the pathways - res <- lapply(pathwayNames, function(name) { - - # Go through all the possible omics layer - unlist( lapply(names(enrichmentScores), function(y) { - df <- enrichmentScores[[y]][which(enrichmentScores[[y]]$pathway == name), c(2, 3)] - if (nrow(df) == 0) { - df <- data.frame(pval = NA, padj = NA) - } - names(df) <- paste0(y, "_", names(df)) - df - })) - }) - - # Combine the list elements to a data frame and assign the pathway names as rownames - res <- data.frame(do.call(rbind, res)) - - return(res) - + # Go through all the pathways + res <- lapply(pathwayNames, function(name) { + # Go through all the possible omics layer + unlist(lapply(names(enrichmentScores), function(y) { + df <- enrichmentScores[[y]][which(enrichmentScores[[y]]$pathway == name), c(2, 3)] + if (nrow(df) == 0) { + df <- data.frame(pval = NA, padj = NA) + } + names(df) <- paste0(y, "_", names(df)) + df + })) + }) + + # Combine list elements to data frame + # and assign pathway names as rownames + res <- data.frame(do.call(rbind, res)) + + return(res) } @@ -145,53 +141,50 @@ extractPvalues <- function(enrichmentScores, pathwayNames) { #' #' @export combinePvalues <- function(df, method = "stouffer", weights = NULL) { - - method <- tolower(method) - if (!method %in% c("stouffer", "fisher", "edgington")) { - stop("You can chose between the 'stouffer', 'edgington', + method <- tolower(method) + if (!method %in% c("stouffer", "fisher", "edgington")) { + stop("You can chose between the 'stouffer', 'edgington', and 'fisher' method to combine p-values.", - call. = FALSE - ) - } - - cols <- grep("padj", colnames(df)) - - pvals <- apply(df, 1, function(row) { - row <- row[cols] - row <- row[!is.na(row)] + call. = FALSE + ) + } + + cols <- grep("padj", colnames(df)) + + pvals <- apply(df, 1, function(row) { + row <- row[cols] + row <- row[!is.na(row)] + + if (length(row) >= 2) { + if (method == "fisher") { + p <- metap::sumlog(row) + p$p + } else if (method == "edgington") { + p <- metap::sump(row) + p$p + } else { + ## sumz allows only p-values smaller than 1 + row <- row[row > 0 & row < 1] if (length(row) >= 2) { - if (method == "fisher") { - p <- sumlog(row) - p$p - } else if (method == "edgington") { - p <- sump(row) - p$p - } else { - - ## sumz allows only p-values smaller than 1 - row <- row[row > 0 & row < 1] - - if (length(row) >= 2) { - if (length(weights) > 0) { - p <- sumz(row, weights = weights) - } else { - p <- sumz(row) - } - p$p - } else if (length(row == 1)) { - row[1] - } else { - NA - } - } - } else if (length(row) == 1) { - row[1] + if (length(weights) > 0) { + p <- metap::sumz(row, weights = weights) + } else { + p <- metap::sumz(row) + } + p$p + } else if (length(row == 1)) { + row[1] } else { - NA + NA } - }) + } + } else if (length(row) == 1) { + row[1] + } else { + NA + } + }) - return(pvals) - + return(pvals) } diff --git a/R/feature_processing.R b/R/feature_processing.R index f4baa5e..6a0cadc 100644 --- a/R/feature_processing.R +++ b/R/feature_processing.R @@ -25,7 +25,7 @@ #' pathways <- graphite::pathways("mmusculus", "kegg")[[1]] #' getFeatures(pathways, which = "metabolites", org = "mmusculus", returntype = "HMDB") #' -#' pathways <- graphite::pathways("mmusculus", "kegg") +#' pathways <- graphite::pathways("mmusculus", "kegg")[[1]] #' getFeatures(pathways, which = "proteins", org = "mmusculus", returntype = "SYMBOL") #' } #' @@ -33,75 +33,73 @@ #' #' @export getFeatures <- function(pathway, which = "proteins", org = "hsapiens", returntype = "SYMBOL") { + if (which != "proteins" && which != "metabolites") { + stop("Only 'proteins' and 'metabolites' are supported for 'which'.", + call. = FALSE + ) + } - if (which != "proteins" && which != "metabolites") { - stop("Only 'proteins' and 'metabolites' are supported for 'which'.", - call. = FALSE - ) - } - - org <- tolower(org) + org <- tolower(org) - ## check for the correct organism - if (!(org %in% getOrganisms())) { - stop("Please insert a correct organism name! Use getOrganisms() + ## check for the correct organism + if (!(org %in% getOrganisms())) { + stop("Please insert a correct organism name! Use getOrganisms() to see all supported organisms.", - call. = FALSE - ) - } + call. = FALSE + ) + } - ## extract the features (genes/proteins or metabolites) from the pathway nodes. - features <- graphite::nodes(pathway, which = which) + ## extract the features (genes/proteins/metabolites) from the pathway nodes. + features <- graphite::nodes(pathway, which = which) - if (length(features) == 0) { - return(list()) - } + if (length(features) == 0) { + return(list()) + } - if (which == "proteins") { + if (which == "proteins") { + ## extract the keytype of the ID format + kt <- gsub(":.*", "", features[1]) + mapped <- gsub("[A-Z]+:", "", features) - ## extract the keytype of the ID format - kt <- gsub(":.*", "", features[1]) - mapped <- gsub("[A-Z]+:", "", features) + ## get the mapping from the keytype to the user-specific type + mapped <- getGeneMapping( + features = mapped, keytype = kt, + org = org, returntype = returntype + ) + } - ## get the mapping from the keytype to the user-specific type - mapped <- getGeneMapping( - features = mapped, keytype = kt, - org = org, returntype = returntype - ) - } + ## its special for metabolites, because sometimes there are different + ## identifiers used in the same pathway + if (which == "metabolites") { + chebi <- mapIDType( + features = features, keytype = "CHEBI", + maptype = "ChEBI", returntype = returntype + ) - ## its special for metabolites, because sometimes there are different - ## identifiers used in the same pathway - if (which == "metabolites") { - chebi <- mapIDType( - features = features, keytype = "CHEBI", - maptype = "ChEBI", returntype = returntype - ) + kegg <- mapIDType( + features = features, keytype = "KEGGCOMP", + maptype = "KEGG", returntype = returntype + ) - kegg <- mapIDType( - features = features, keytype = "KEGGCOMP", - maptype = "KEGG", returntype = returntype - ) + pubchem <- mapIDType( + features = features, keytype = "PUBCHEM", + maptype = "CID", returntype = returntype + ) - pubchem <- mapIDType( - features = features, keytype = "PUBCHEM", - maptype = "CID", returntype = returntype - ) + cas <- mapIDType( + features = features, keytype = "CAS", + maptype = "CAS", returntype = returntype + ) - cas <- mapIDType( - features = features, keytype = "CAS", - maptype = "CAS", returntype = returntype - ) - - hmdb <- mapIDType( - features = features, keytype = "HMDB", - maptype = "HMDB", returntype = returntype - ) + hmdb <- mapIDType( + features = features, keytype = "HMDB", + maptype = "HMDB", returntype = returntype + ) - mapped <- c(chebi, kegg, pubchem, cas, hmdb) - } + mapped <- c(chebi, kegg, pubchem, cas, hmdb) + } - return(mapped) + return(mapped) } @@ -136,9 +134,9 @@ getFeatures <- function(pathway, which = "proteins", org = "hsapiens", returntyp #' getGeneMapping(features, keytype = "UNIPROT", org = "rnorvegicus") #' #' getGeneMapping(features, -#' keytype = "UNIPROT", -#' org = "rnorvegicus", -#' returntype = "ENSEMBL" +#' keytype = "UNIPROT", +#' org = "rnorvegicus", +#' returntype = "ENSEMBL" #' ) #' } #' @@ -146,60 +144,59 @@ getFeatures <- function(pathway, which = "proteins", org = "hsapiens", returntyp #' #' @export getGeneMapping <- function(features, keytype, org = "hsapiens", returntype = "SYMBOL") { + org <- tolower(org) - org <- tolower(org) - - ## check for the correct organism - if (!(org %in% getOrganisms())) { - stop("Please insert a correct organism name! Use getOrganisms() + ## check for the correct organism + if (!(org %in% getOrganisms())) { + stop("Please insert a correct organism name! Use getOrganisms() to see all supported organisms.", - call. = FALSE - ) - } + call. = FALSE + ) + } - db <- getIDMappingDatabase(org) + db <- getIDMappingDatabase(org) - supportedIDs <- c("SYMBOL", "ENTREZID", "UNIPROT", "ENSEMBL", "REFSEQ") - if (org != "dmelanogaster" && !returntype %in% supportedIDs) { - stop("Insert one of the following IDs to be returned (returntype): + supportedIDs <- c("SYMBOL", "ENTREZID", "UNIPROT", "ENSEMBL", "REFSEQ") + if (org != "dmelanogaster" && !returntype %in% supportedIDs) { + stop("Insert one of the following IDs to be returned (returntype): SYMBOL, ENTREZID, UNIPROT, ENSEMBL, REFSEQ.", - call. = FALSE - ) - } + call. = FALSE + ) + } - supportedIDs <- c(supportedIDs, "FLYBASE", "FLYBASECG") - if (org == "dmelanogaster" && !returntype %in% supportedIDs) { - stop("Insert one of the following IDs to be returned (returntype): + supportedIDs <- c(supportedIDs, "FLYBASE", "FLYBASECG") + if (org == "dmelanogaster" && !returntype %in% supportedIDs) { + stop("Insert one of the following IDs to be returned (returntype): SYMBOL, ENTREZID, UNIPROT, ENSEMBL, REFSEQ, FLYBASE, FLYBASECG.", - call. = FALSE - ) - } - - ## design the columns field such that we create a triple ID mapping between - ## ENTREZIDs, UNIPROT, and gene symbols - if (keytype == "UNIPROT") { - col <- unique(c("SYMBOL", "ENTREZID", returntype)) - } - else { - col <- unique(c("SYMBOL", "UNIPROT", returntype)) - } - - ## run the actual mapping of IDS and return a list of the user-given type - map <- tryCatch( - { - map <- AnnotationDbi::select(db, keys = features, - columns = col, keytype = keytype) - m <- match(unique(map[[keytype]]), map[[keytype]]) - map <- map[m, ] - map[[returntype]][!is.na(map[[returntype]])] - }, - error = function(cond) { - return(list()) - } + call. = FALSE ) + } + + ## design the columns field such that we create a triple ID mapping between + ## ENTREZIDs, UNIPROT, and gene symbols + if (keytype == "UNIPROT") { + col <- unique(c("SYMBOL", "ENTREZID", returntype)) + } else { + col <- unique(c("SYMBOL", "UNIPROT", returntype)) + } - return(map) + ## run the actual mapping of IDS and return a list of the user-given type + map <- tryCatch( + { + map <- AnnotationDbi::select(db, + keys = features, + columns = col, keytype = keytype + ) + m <- match(unique(map[[keytype]]), map[[keytype]]) + map <- map[m, ] + map[[returntype]][!is.na(map[[returntype]])] + }, + error = function(cond) { + return(list()) + } + ) + return(map) } @@ -237,53 +234,51 @@ getGeneMapping <- function(features, keytype, org = "hsapiens", returntype = "SY #' @importFrom magrittr %>% #' #' @export -getMetaboliteMapping <- function( features, keytype, returntype = "HMDB") { - +getMetaboliteMapping <- function(features, keytype, returntype = "HMDB") { ## check for the correct metabolite mapping format - supportedIDs <- c("HMDB", "ChEBI", "KEGG", "CAS", "DTXCID", - "DTXSID", "SID", "CID", "Drugbank") + supportedIDs <- c( + "HMDB", "ChEBI", "KEGG", "CAS", "DTXCID", + "DTXSID", "SID", "CID", "Drugbank" + ) if (!returntype %in% supportedIDs) { - stop( "Insert one of the following IDs to be returned (returntype): + stop("Insert one of the following IDs to be returned (returntype): HMDB, CAS, ChEBI, KEGG, SID, CID, DTXCID, DTXSID, Drugbank, Name", - call. = FALSE + call. = FALSE ) } ## load the mapping table which is deposited in the ## metaboliteIDmapping package. - if (!requireNamespace( "metaboliteIDmapping", quietly = TRUE)) { - stop( "The necessary package metaboliteIDmapping is not installed.", - call. = FALSE - ) + if (!requireNamespace("metaboliteIDmapping", quietly = TRUE)) { + stop("The necessary package metaboliteIDmapping is not installed.", + call. = FALSE + ) } ## run the actual mapping of IDS and return a list of the user-given type map <- tryCatch( { - ## to speed up the mapping, we need to subest the whole ## metabolitesIDmapping table in the first place to contain ## only thoses entries that match the given feature list SUBmappingTable <- metaboliteIDmapping::metabolitesMapping %>% - dplyr::select( !!as.name( keytype), !!as.name( returntype)) %>% - dplyr::filter( !!as.name( keytype) %in% unique( features)) %>% + dplyr::select(!!as.name(keytype), !!as.name(returntype)) %>% + dplyr::filter(!!as.name(keytype) %in% unique(features)) %>% dplyr::distinct() - colnames( SUBmappingTable) <- c("Original", "Mapped") - - SUBmappingTable %>% dplyr::pull( Mapped) + colnames(SUBmappingTable) <- c("Original", "Mapped") + SUBmappingTable %>% dplyr::pull(Mapped) }, error = function(cond) { return( - rep( "NA", length( features)) + rep("NA", length(features)) ) } ) - map <- map[ !is.na(map)] + map <- map[!is.na(map)] return(map) - } @@ -322,22 +317,22 @@ getMetaboliteMapping <- function( features, keytype, returntype = "HMDB") { #' @examples #' #' getMultiOmicsFeatures( -#' dbs = c("kegg"), -#' layer = c("transcriptome", "proteome"), -#' organism = "hsapiens" +#' dbs = c("kegg"), +#' layer = c("transcriptome", "proteome"), +#' organism = "hsapiens" #' ) #' \donttest{ #' getMultiOmicsFeatures( -#' dbs = c("kegg", "reactome"), -#' layer = c("transcriptome", "metabolome"), -#' organism = "mmusculus" +#' dbs = c("kegg", "reactome"), +#' layer = c("transcriptome", "metabolome"), +#' organism = "mmusculus" #' ) #' #' getMultiOmicsFeatures( -#' dbs = c("reactome"), -#' layer = c("proteome"), -#' organism = "rnorvegicus", -#' returnProteome = "ENTREZID" +#' dbs = c("reactome"), +#' layer = c("proteome"), +#' organism = "rnorvegicus", +#' returnProteome = "ENTREZID" #' ) #' } #' @importFrom graphite pathwayDatabases pathways @@ -352,128 +347,125 @@ getMultiOmicsFeatures <- function(dbs = c("all"), layer = c("all"), returnMetabolome = "HMDB", organism = "hsapiens", useLocal = TRUE) { + layers <- c("all", "metabolome", "proteome", "transcriptome") + organism <- tolower(organism) - layers <- c("all", "metabolome", "proteome", "transcriptome") - organism <- tolower(organism) + returnTranscriptome <- toupper(returnTranscriptome) + returnProteome <- toupper(returnProteome) + returnMetabolome <- toupper(returnMetabolome) - returnTranscriptome <- toupper(returnTranscriptome) - returnProteome <- toupper(returnProteome) - returnMetabolome <- toupper(returnMetabolome) - - ## check for the correct transcriptome mapping format - supportedIDs <- c("SYMBOL", "ENTREZID", "UNIPROT", "ENSEMBL", "REFSEQ") - if (!returnTranscriptome %in% supportedIDs) { - stop("Insert one of the following IDs to be returned (returnTranscriptome): + ## check for the correct transcriptome mapping format + supportedIDs <- c("SYMBOL", "ENTREZID", "UNIPROT", "ENSEMBL", "REFSEQ") + if (!returnTranscriptome %in% supportedIDs) { + stop("Insert one of the following IDs to be returned (returnTranscriptome): SYMBOL, ENTREZID, UNIPROT, ENSEMBL, REFSEQ", - call. = FALSE - ) - } + call. = FALSE + ) + } - ## check for the correct proteome mapping format - if (!returnProteome %in% supportedIDs) { - stop("Insert one of the following IDs to be returned (returnProteome): + ## check for the correct proteome mapping format + if (!returnProteome %in% supportedIDs) { + stop("Insert one of the following IDs to be returned (returnProteome): SYMBOL, ENTREZID, UNIPROT, ENSEMBL, REFSEQ", - call. = FALSE - ) - } + call. = FALSE + ) + } - ## check for the correct metabolite mapping format - supportedIDs <- c("HMDB", "ChEBI", "KEGG", "CAS", "DTXCID", - "DTXSID", "SID", "CID", "Drugbank") - if (!returnMetabolome %in% supportedIDs) { - stop("Insert one of the following IDs to be returned (returnMetabolome): + ## check for the correct metabolite mapping format + supportedIDs <- c( + "HMDB", "ChEBI", "KEGG", "CAS", "DTXCID", + "DTXSID", "SID", "CID", "Drugbank" + ) + if (!returnMetabolome %in% supportedIDs) { + stop("Insert one of the following IDs to be returned (returnMetabolome): HMDB, CAS, ChEBI, KEGG, SID, CID, DTXCID, DTXSID, Drugbank", - call. = FALSE - ) - } + call. = FALSE + ) + } - ## check if the given organism is supported - if (!(organism %in% getOrganisms())) { - stop("You entered an organism that is not supported! + ## check if the given organism is supported + if (!(organism %in% getOrganisms())) { + stop("You entered an organism that is not supported! Use getOrganisms() to get a list of all suported organisms.", - call. = FALSE - ) - } + call. = FALSE + ) + } - if (sum(tolower(layer) %in% layers) != length(layer)) { - stop("You entered wrong input for the omics layer specification. + if (sum(tolower(layer) %in% layers) != length(layer)) { + stop("You entered wrong input for the omics layer specification. Options are: all, transcriptome, proteome, metabolome, or a combination thereof.", - call. = FALSE - ) - } + call. = FALSE + ) + } - pDBs <- graphite::pathwayDatabases() - dbs0 <- pDBs %>% - dplyr::filter(.data$species == organism) %>% - dplyr::pull(.data$database) - databases <- c("all", as.vector(dbs0)) + pDBs <- graphite::pathwayDatabases() + dbs0 <- pDBs %>% + dplyr::filter(.data$species == organism) %>% + dplyr::pull(.data$database) + databases <- c("all", as.vector(dbs0)) - if (sum(tolower(dbs) %in% databases) != length(dbs)) { - stop( paste0( "You entered wrong input for the omics layer specification. - Options are: ", paste( databases, collapse = ' '), " or a combination thereof."), - call. = FALSE - ) - } + if (sum(tolower(dbs) %in% databases) != length(dbs)) { + stop(paste0("You entered wrong input for the omics layer specification. + Options are: ", paste(databases, collapse = " "), " or a combination thereof."), + call. = FALSE + ) + } - if ("all" %in% dbs) dbs <- as.vector(dbs0) + if ("all" %in% dbs) dbs <- as.vector(dbs0) - if ("all" %in% layer) { - layer <- c("transcriptome", "proteome", "metabolome") - } + if ("all" %in% layer) { + layer <- c("transcriptome", "proteome", "metabolome") + } - pathways <- lapply(dbs, function(x) { - graphite::pathways(organism, x) - }) - names(pathways) <- dbs - - features <- list() - if ("transcriptome" %in% layer) { - features$transcriptome <- getMappedFeatures( - pathways = pathways, - organism = organism, - returnID = returnTranscriptome, - useLocal = useLocal - ) + pathways <- lapply(dbs, function(x) { + graphite::pathways(organism, x) + }) + names(pathways) <- dbs + + features <- list() + if ("transcriptome" %in% layer) { + features$transcriptome <- getMappedFeatures( + pathways = pathways, + organism = organism, + returnID = returnTranscriptome, + useLocal = useLocal + ) - ## adapt for duplicated pathways - names( features$transcriptome) <- rename_duplicates( names( features$transcriptome)) + ## adapt for duplicated pathways + names(features$transcriptome) <- rename_duplicates(names(features$transcriptome)) + } - } + if ("proteome" %in% layer) { + if ("transcriptome" %in% layer && returnProteome == returnTranscriptome) { + features$proteome <- features$transcriptome + } else { + features$proteome <- getMappedFeatures( + pathways = pathways, + organism = organism, + returnID = returnProteome, + useLocal = useLocal + ) - if ("proteome" %in% layer) { - if ("transcriptome" %in% layer && returnProteome == returnTranscriptome) { - features$proteome <- features$transcriptome - } else { - features$proteome <- getMappedFeatures( - pathways = pathways, - organism = organism, - returnID = returnProteome, - useLocal = useLocal - ) - - ## adapt for duplicated pathways - names( features$proteome) <- rename_duplicates( names( features$proteome)) - - } + ## adapt for duplicated pathways + names(features$proteome) <- rename_duplicates(names(features$proteome)) } + } - if ("metabolome" %in% layer) { - features$metabolome <- getMappedFeatures( - pathways = pathways, - organism = organism, - returnID = returnMetabolome, - which = "metabolites", - useLocal = useLocal - ) - - ## adapt for duplicated pathways - names( features$metabolome) <- rename_duplicates( names( features$metabolome)) - - } + if ("metabolome" %in% layer) { + features$metabolome <- getMappedFeatures( + pathways = pathways, + organism = organism, + returnID = returnMetabolome, + which = "metabolites", + useLocal = useLocal + ) - return(features) + ## adapt for duplicated pathways + names(features$metabolome) <- rename_duplicates(names(features$metabolome)) + } + return(features) } @@ -495,30 +487,28 @@ getMultiOmicsFeatures <- function(dbs = c("all"), layer = c("all"), #' #' @return List of mapped features for an omics layer. getMappedFeatures <- function(pathways, returnID = "SYMBOL", organism = "hsapiens", which = "proteins", useLocal = TRUE) { + feat <- unlist(lapply(names(pathways), function(db) { + ap <- archivePath(paste0(organism, "_", db, "_", returnID)) + + if (file.exists(ap) && useLocal) { + loadLocal(ap) + } else { + tmp <- lapply(pathways[[db]], function(p) { + getFeatures( + pathway = p, org = organism, + which = which, + returntype = returnID + ) + }) - feat <- unlist(lapply(names(pathways), function(db) { - ap <- archivePath(paste0(organism, "_", db, "_", returnID)) - - if (file.exists(ap) && useLocal) { - loadLocal(ap) - } else { - tmp <- lapply(pathways[[db]], function(p) { - getFeatures( - pathway = p, org = organism, - which = which, - returntype = returnID - ) - }) - - header <- rep(paste0("(", toupper(db), ") "), length(pathways[[db]])) - names(tmp) <- paste0(header, names(tmp)) - saveRDS(tmp, file = ap) - tmp - } - }), recursive = FALSE) - - return(feat) + header <- rep(paste0("(", toupper(db), ") "), length(pathways[[db]])) + names(tmp) <- paste0(header, names(tmp)) + saveRDS(tmp, file = ap) + tmp + } + }), recursive = FALSE) + return(feat) } @@ -535,22 +525,20 @@ getMappedFeatures <- function(pathways, returnID = "SYMBOL", organism = "hsapien #' #' @return List of mapped metabolite IDs. mapIDType <- function(features, keytype = "CHEBI", maptype = "ChEBI", returntype = "HMDB") { + mapped <- c() + ids <- gsub(paste0(keytype, ":"), "", features[grep(keytype, features)]) + + if (returntype != maptype) { + mapped <- getMetaboliteMapping( + features = ids, + keytype = maptype, + returntype = returntype + ) + } else { + mapped <- ids + } - mapped <- c() - ids <- gsub(paste0(keytype, ":"), "", features[grep(keytype, features)]) - - if (returntype != maptype) { - mapped <- getMetaboliteMapping( - features = ids, - keytype = maptype, - returntype = returntype - ) - } else { - mapped <- ids - } - - return(mapped) - + return(mapped) } @@ -569,15 +557,13 @@ mapIDType <- function(features, keytype = "CHEBI", maptype = "ChEBI", returntype #' getOrganisms() #' @export getOrganisms <- function() { + orglist <- c( + "hsapiens", "rnorvegicus", "mmusculus", "sscrofa", + "btaurus", "celegans", "dmelanogaster", "drerio", + "ggallus", "xlaevis", "cfamiliaris" + ) - orglist <- c( - "hsapiens", "rnorvegicus", "mmusculus", "sscrofa", - "btaurus", "celegans", "dmelanogaster", "drerio", - "ggallus", "xlaevis", "cfamiliaris" - ) - - return(orglist) - + return(orglist) } @@ -592,27 +578,25 @@ getOrganisms <- function() { #' #' @return AnnotationDbi database for ID mapping. getIDMappingDatabase <- function(organism) { + map <- c( + hsapiens = "org.Hs.eg.db", rnorvegicus = "org.Rn.eg.db", + mmusculus = "org.Mm.eg.db", sscrofa = "org.Ss.eg.db", + btaurus = "org.Bt.eg.db", celegans = "org.Ce.eg.db", + dmelanogaster = "org.Dm.eg.db", drerio = "org.Dr.eg.db", + ggallus = "org.Gg.eg.db", xlaevis = "org.Xl.eg.db", + cfamiliaris = "org.Cf.eg.db" + ) - map <- c( - hsapiens = "org.Hs.eg.db", rnorvegicus = "org.Rn.eg.db", - mmusculus = "org.Mm.eg.db", sscrofa = "org.Ss.eg.db", - btaurus = "org.Bt.eg.db", celegans = "org.Ce.eg.db", - dmelanogaster = "org.Dm.eg.db", drerio = "org.Dr.eg.db", - ggallus = "org.Gg.eg.db", xlaevis = "org.Xl.eg.db", - cfamiliaris = "org.Cf.eg.db" - ) - - stopifnot(organism %in% names(map)) - pkg <- map[[organism]] - - if (!requireNamespace(pkg, quietly = TRUE)) { - stop(paste0("The necessary package ", pkg, " is not installed."), - call. = FALSE - ) - } + stopifnot(organism %in% names(map)) + pkg <- map[[organism]] - return(get(pkg, envir = getNamespace(pkg))) + if (!requireNamespace(pkg, quietly = TRUE)) { + stop(paste0("The necessary package ", pkg, " is not installed."), + call. = FALSE + ) + } + return(get(pkg, envir = getNamespace(pkg))) } @@ -632,11 +616,10 @@ getIDMappingDatabase <- function(organism) { #' logFC <- rnorm(10) #' pvalues <- runif(10) #' rankFeatures(logFC, pvalues) +#' #' @export rankFeatures <- function(logFC, pvalues, base = 10) { - - return(sign(logFC) * -log(pvalues, base = base)) - + return(sign(logFC) * -log(pvalues, base = base)) } @@ -658,51 +641,46 @@ rankFeatures <- function(logFC, pvalues, base = 10) { #' initOmicsDataStructure(c("Transcriptome", "Metabolome")) #' @export initOmicsDataStructure <- function(layer = c("transcriptome", "proteome", "metabolome")) { - - l <- c() - layer <- tolower(layer) - if (length(grep("trans*", layer)) > 0) l <- c(l, "transcriptome") - if (length(grep("prote*", layer)) > 0) l <- c(l, "proteome") - if (length(grep("metabo*", layer)) > 0) l <- c(l, "metabolome") - - if (length(l) != length(layer)) { - stop("Not all your omics layer could be mapped to + l <- c() + layer <- tolower(layer) + if (length(grep("trans*", layer)) > 0) l <- c(l, "transcriptome") + if (length(grep("prote*", layer)) > 0) l <- c(l, "proteome") + if (length(grep("metabo*", layer)) > 0) l <- c(l, "metabolome") + + if (length(l) != length(layer)) { + stop("Not all your omics layer could be mapped to 'transcriptome', 'proteome', or 'metabolome'. ", - call. = TRUE - ) - } - - empty_structure <- rep(list(list()), length(l)) - names(empty_structure) <- l + call. = TRUE + ) + } - return(empty_structure) + empty_structure <- rep(list(list()), length(l)) + names(empty_structure) <- l + return(empty_structure) } #' Helper function to get all different metabolite ID formats #' -#' This helper function extracts all used ID formats in all pathways +#' This helper function extracts all used ID formats in all pathways #' and returns a nested list for each pathway database. #' #' @param pathways List of pathway databases and their pathway definition. #' #' @return List of metabolite ID formats. -getMetaboliteIDformats <- function(pathways){ - - n1 <- lapply(names(pathways), function( dbs){ - - n2 <- lapply(pathways[[dbs]], function(p){ - - unique(gsub( ":.*", "", graphite::nodes(p, which = "metabolites"))) - - }) - - unique(unlist(n2)) +#' +#' @importFrom graphite nodes +getMetaboliteIDformats <- function(pathways) { + n1 <- lapply(names(pathways), function(dbs) { + n2 <- lapply(pathways[[dbs]], function(p) { + unique(gsub(":.*", "", graphite::nodes(p, which = "metabolites"))) + }) + unique(unlist(n2)) }) - + names(n1) <- names(pathways) return(n1) } diff --git a/R/utility_functions.R b/R/utility_functions.R index 1c3c979..6d4972f 100644 --- a/R/utility_functions.R +++ b/R/utility_functions.R @@ -7,12 +7,10 @@ #' #' @return String containing the path to the file. archivePath <- function(filename) { - - ad <- archiveDir() - filename <- paste0( filename, ".rds") - - return( file.path( ad, filename, fsep = .Platform$file.sep)) - + ad <- archiveDir() + filename <- paste0(filename, ".rds") + + return(file.path(ad, filename, fsep = .Platform$file.sep)) } @@ -28,19 +26,17 @@ archivePath <- function(filename) { #' #' @importFrom rappdirs user_cache_dir archiveDir <- function() { - - ad <- rappdirs::user_cache_dir("multiGSEA") - - if (!file.exists(ad)) { - if (!dir.create(ad, FALSE, TRUE)) { - stop("An error occurred during creating the archive directory: ", ad, - call. = FALSE - ) - } + ad <- rappdirs::user_cache_dir("multiGSEA") + + if (!file.exists(ad)) { + if (!dir.create(ad, FALSE, TRUE)) { + stop("An error occurred during creating the archive directory: ", ad, + call. = FALSE + ) } + } - return(ad) - + return(ad) } @@ -49,7 +45,7 @@ archiveDir <- function() { #' Read a local RDS file. #' -#' Use the readRDS function to load read the given file which should be in RDS +#' Use the readRDS function to load the given file which should be in RDS #' format. #' #' @param filename Path to the file to be read. @@ -58,42 +54,38 @@ archiveDir <- function() { #' #' @importFrom methods is loadLocal <- function(filename) { - - res <- try(readRDS(filename), silent = TRUE) + res <- try(readRDS(filename), silent = TRUE) - if ("try-error" %in% is(res)) { - return(NULL) - } else { - return(res) - } - + if ("try-error" %in% methods::is(res)) { + return(NULL) + } else { + return(res) + } } #' Make a list of strings unique -#' +#' #' It might happen that there are duplicated strings in a list. With this #' function we will rename those duplicated entries in a way that we simply add #' the number of occurrences to the string. I.e., when the string foo occurs #' three times in a list, it will be renamed to foo_1, foo_2, and foo_3, #' respectively. -#' +#' #' @param names List of strings where duplicates should be renamed -#' +#' #' @return List where duplicates are renamed. -#' +#' #' @examples -#' l <- c( "foo", "bar", "foo", "bars") -#' rename_duplicates( l) -#' -#' @export -rename_duplicates <- function( names){ - - tn <- table( names) - for(name in names( tn[tn>1])){ - names[ names == name] <- paste0( name, "_", 1:tn[ names( tn) == name]) - } - - return( names) +#' l <- c("foo", "bar", "foo", "bars") +#' rename_duplicates(l) +#' +#' @export +rename_duplicates <- function(names) { + tn <- table(names) + for (name in names(tn[tn > 1])) { + names[names == name] <- paste0(name, "_", 1:tn[names(tn) == name]) + } + + return(names) } - diff --git a/man/extractPvalues.Rd b/man/extractPvalues.Rd index 42487e9..441f39d 100644 --- a/man/extractPvalues.Rd +++ b/man/extractPvalues.Rd @@ -38,7 +38,7 @@ names(ranks$proteome) <- proteome$Symbol es <- multiGSEA(pathways, ranks) extractPvalues( - enrichmentScores = es, - pathwayNames = names(pathways[[1]]) + enrichmentScores = es, + pathwayNames = names(pathways[[1]]) ) } diff --git a/man/getFeatures.Rd b/man/getFeatures.Rd index f8c9dc9..61416b6 100644 --- a/man/getFeatures.Rd +++ b/man/getFeatures.Rd @@ -42,7 +42,7 @@ getFeatures(pathways) pathways <- graphite::pathways("mmusculus", "kegg")[[1]] getFeatures(pathways, which = "metabolites", org = "mmusculus", returntype = "HMDB") -pathways <- graphite::pathways("mmusculus", "kegg") +pathways <- graphite::pathways("mmusculus", "kegg")[[1]] getFeatures(pathways, which = "proteins", org = "mmusculus", returntype = "SYMBOL") } diff --git a/man/getGeneMapping.Rd b/man/getGeneMapping.Rd index b6451b5..4236f28 100644 --- a/man/getGeneMapping.Rd +++ b/man/getGeneMapping.Rd @@ -40,9 +40,9 @@ features <- gsub("UNIPROT:", "", features) getGeneMapping(features, keytype = "UNIPROT", org = "rnorvegicus") getGeneMapping(features, - keytype = "UNIPROT", - org = "rnorvegicus", - returntype = "ENSEMBL" + keytype = "UNIPROT", + org = "rnorvegicus", + returntype = "ENSEMBL" ) } diff --git a/man/getMetaboliteIDformats.Rd b/man/getMetaboliteIDformats.Rd index 646f594..128f05a 100644 --- a/man/getMetaboliteIDformats.Rd +++ b/man/getMetaboliteIDformats.Rd @@ -13,6 +13,6 @@ getMetaboliteIDformats(pathways) List of metabolite ID formats. } \description{ -This helper function extracts all used ID formats in all pathways +This helper function extracts all used ID formats in all pathways and returns a nested list for each pathway database. } diff --git a/man/getMultiOmicsFeatures.Rd b/man/getMultiOmicsFeatures.Rd index ca986d7..d96952e 100644 --- a/man/getMultiOmicsFeatures.Rd +++ b/man/getMultiOmicsFeatures.Rd @@ -55,22 +55,22 @@ enrichment. \examples{ getMultiOmicsFeatures( - dbs = c("kegg"), - layer = c("transcriptome", "proteome"), - organism = "hsapiens" + dbs = c("kegg"), + layer = c("transcriptome", "proteome"), + organism = "hsapiens" ) \donttest{ getMultiOmicsFeatures( - dbs = c("kegg", "reactome"), - layer = c("transcriptome", "metabolome"), - organism = "mmusculus" + dbs = c("kegg", "reactome"), + layer = c("transcriptome", "metabolome"), + organism = "mmusculus" ) getMultiOmicsFeatures( - dbs = c("reactome"), - layer = c("proteome"), - organism = "rnorvegicus", - returnProteome = "ENTREZID" + dbs = c("reactome"), + layer = c("proteome"), + organism = "rnorvegicus", + returnProteome = "ENTREZID" ) } } diff --git a/man/loadLocal.Rd b/man/loadLocal.Rd index a5b5e4b..efa237d 100644 --- a/man/loadLocal.Rd +++ b/man/loadLocal.Rd @@ -13,6 +13,6 @@ loadLocal(filename) Content of file. } \description{ -Use the readRDS function to load read the given file which should be in RDS +Use the readRDS function to load the given file which should be in RDS format. } diff --git a/man/rankFeatures.Rd b/man/rankFeatures.Rd index cc3d568..01b6f79 100644 --- a/man/rankFeatures.Rd +++ b/man/rankFeatures.Rd @@ -24,4 +24,5 @@ implicated through their assigned p-value. logFC <- rnorm(10) pvalues <- runif(10) rankFeatures(logFC, pvalues) + } diff --git a/man/rename_duplicates.Rd b/man/rename_duplicates.Rd index 0c49e7e..e2cf8f3 100644 --- a/man/rename_duplicates.Rd +++ b/man/rename_duplicates.Rd @@ -20,7 +20,7 @@ three times in a list, it will be renamed to foo_1, foo_2, and foo_3, respectively. } \examples{ -l <- c( "foo", "bar", "foo", "bars") -rename_duplicates( l) +l <- c("foo", "bar", "foo", "bars") +rename_duplicates(l) } diff --git a/tests/testthat/test_combinePvalues.R b/tests/testthat/test_combinePvalues.R index 0819602..61429e8 100644 --- a/tests/testthat/test_combinePvalues.R +++ b/tests/testthat/test_combinePvalues.R @@ -1,55 +1,55 @@ context("P-value aggregation") test_that("method selection works", { - df <- cbind(runif(5), runif(5), runif(5)) - colnames(df) <- c("trans.padj", "prot.padj", "meta.padj") + df <- cbind(runif(5), runif(5), runif(5)) + colnames(df) <- c("trans.padj", "prot.padj", "meta.padj") - method <- "random" - expect_error(combinePvalues(df, method = method)) + method <- "random" + expect_error(combinePvalues(df, method = method)) }) test_that("Fisher method works", { - df <- cbind(runif(5), runif(5), runif(5)) - colnames(df) <- c("trans.padj", "prot.padj", "meta.padj") + df <- cbind(runif(5), runif(5), runif(5)) + colnames(df) <- c("trans.padj", "prot.padj", "meta.padj") - method <- "Fisher" - expect_is(combinePvalues(df, method = method), "numeric") - expect_equal(length(combinePvalues(df, method = method)), nrow(df)) + method <- "Fisher" + expect_is(combinePvalues(df, method = method), "numeric") + expect_equal(length(combinePvalues(df, method = method)), nrow(df)) }) test_that("Edgington method works", { - df <- cbind(runif(5), runif(5), runif(5)) - colnames(df) <- c("trans.padj", "prot.padj", "meta.padj") + df <- cbind(runif(5), runif(5), runif(5)) + colnames(df) <- c("trans.padj", "prot.padj", "meta.padj") - method <- "Edgington" - expect_is(combinePvalues(df, method = method), "numeric") - expect_equal(length(combinePvalues(df, method = method)), nrow(df)) + method <- "Edgington" + expect_is(combinePvalues(df, method = method), "numeric") + expect_equal(length(combinePvalues(df, method = method)), nrow(df)) }) test_that("Stouffer method works", { - df <- cbind(runif(5), runif(5), runif(5)) - colnames(df) <- c("trans.padj", "prot.padj", "meta.padj") + df <- cbind(runif(5), runif(5), runif(5)) + colnames(df) <- c("trans.padj", "prot.padj", "meta.padj") - method <- "Stouffer" - expect_is(combinePvalues(df, method = method), "numeric") - expect_equal(length(combinePvalues(df, method = method)), nrow(df)) + method <- "Stouffer" + expect_is(combinePvalues(df, method = method), "numeric") + expect_equal(length(combinePvalues(df, method = method)), nrow(df)) }) test_that("weighted Stouffer method works", { - df <- cbind(runif(5), runif(5), runif(5)) - colnames(df) <- c("trans.padj", "prot.padj", "meta.padj") + df <- cbind(runif(5), runif(5), runif(5)) + colnames(df) <- c("trans.padj", "prot.padj", "meta.padj") - weights <- sample(1:100, 3, replace = TRUE) - expect_is(combinePvalues(df, weights = weights), "numeric") - expect_equal(length(combinePvalues(df)), nrow(df)) + weights <- sample(1:100, 3, replace = TRUE) + expect_is(combinePvalues(df, weights = weights), "numeric") + expect_equal(length(combinePvalues(df)), nrow(df)) - weights <- sample(1:100, 2, replace = TRUE) - expect_error(combinePvalues(df, weights = weights)) + weights <- sample(1:100, 2, replace = TRUE) + expect_error(combinePvalues(df, weights = weights)) - weights <- c(1, 1, 1) - expect_equal(combinePvalues(df), combinePvalues(df, weights = weights)) + weights <- c(1, 1, 1) + expect_equal(combinePvalues(df), combinePvalues(df, weights = weights)) }) diff --git a/tests/testthat/test_pathway_enrichment.R b/tests/testthat/test_pathway_enrichment.R index c147fdb..923ae71 100644 --- a/tests/testthat/test_pathway_enrichment.R +++ b/tests/testthat/test_pathway_enrichment.R @@ -19,51 +19,51 @@ names(rF_meta) <- gsub("HMDB", "HMDB00", names(rF_meta)) library(metaboliteIDmapping) test_that("provided data can be ranked", { - expect_is(rF_trans, "numeric") - expect_equal(length(rF_trans), nrow(transcriptome)) + expect_is(rF_trans, "numeric") + expect_equal(length(rF_trans), nrow(transcriptome)) - expect_is(rF_prot, "numeric") - expect_equal(length(rF_prot), nrow(proteome)) + expect_is(rF_prot, "numeric") + expect_equal(length(rF_prot), nrow(proteome)) - expect_is(rF_meta, "numeric") - expect_equal(length(rF_meta), nrow(metabolome)) + expect_is(rF_meta, "numeric") + expect_equal(length(rF_meta), nrow(metabolome)) }) test_that("pathway enrichment works.", { - dbs <- "kegg" - df <- getMultiOmicsFeatures(dbs = dbs) - - omics_data <- initOmicsDataStructure() - expect_is(omics_data, "list") - expect_equal(length(omics_data), 3) - expect_equivalent(names(omics_data), c("transcriptome", "proteome", "metabolome")) - - omics_data$transcriptome <- rF_trans - omics_data$proteome <- rF_prot - omics_data$metabolome <- rF_meta - - expect_warning((es <- multiGSEA(df, omics_data))) - expect_equal(length(es), length(omics_data)) - expect_true("pathway" %in% colnames(es$transcriptome)) - expect_true("pval" %in% colnames(es$transcriptome)) - expect_true("padj" %in% colnames(es$transcriptome)) - - expect_true("pathway" %in% colnames(es$proteome)) - expect_true("pval" %in% colnames(es$proteome)) - expect_true("padj" %in% colnames(es$proteome)) - - expect_true("pathway" %in% colnames(es$metabolome)) - expect_true("pval" %in% colnames(es$metabolome)) - expect_true("padj" %in% colnames(es$metabolome)) - - eP <- extractPvalues( - enrichmentScores = es, - pathwayNames = names(df[[1]]) - ) - - expect_equal(ncol(eP), 2 * length(omics_data)) - expect_equal(nrow(eP), length(df$transcriptome)) - - expect_lte(max(eP, na.rm = TRUE), 1) - expect_gte(min(eP, na.rm = TRUE), 0) + dbs <- "kegg" + df <- getMultiOmicsFeatures(dbs = dbs) + + omics_data <- initOmicsDataStructure() + expect_is(omics_data, "list") + expect_equal(length(omics_data), 3) + expect_equivalent(names(omics_data), c("transcriptome", "proteome", "metabolome")) + + omics_data$transcriptome <- rF_trans + omics_data$proteome <- rF_prot + omics_data$metabolome <- rF_meta + + expect_warning((es <- multiGSEA(df, omics_data))) + expect_equal(length(es), length(omics_data)) + expect_true("pathway" %in% colnames(es$transcriptome)) + expect_true("pval" %in% colnames(es$transcriptome)) + expect_true("padj" %in% colnames(es$transcriptome)) + + expect_true("pathway" %in% colnames(es$proteome)) + expect_true("pval" %in% colnames(es$proteome)) + expect_true("padj" %in% colnames(es$proteome)) + + expect_true("pathway" %in% colnames(es$metabolome)) + expect_true("pval" %in% colnames(es$metabolome)) + expect_true("padj" %in% colnames(es$metabolome)) + + eP <- extractPvalues( + enrichmentScores = es, + pathwayNames = names(df[[1]]) + ) + + expect_equal(ncol(eP), 2 * length(omics_data)) + expect_equal(nrow(eP), length(df$transcriptome)) + + expect_lte(max(eP, na.rm = TRUE), 1) + expect_gte(min(eP, na.rm = TRUE), 0) }) diff --git a/tests/testthat/test_pathway_features.R b/tests/testthat/test_pathway_features.R index 013831f..cfb1244 100644 --- a/tests/testthat/test_pathway_features.R +++ b/tests/testthat/test_pathway_features.R @@ -1,50 +1,50 @@ context("Pathways and feature extraction") test_that("transcriptomic features get mapped", { - layer <- c("transcriptome") - dbs <- c("kegg") + layer <- c("transcriptome") + dbs <- c("kegg") - df <- getMultiOmicsFeatures(dbs = dbs, layer = layer) + df <- getMultiOmicsFeatures(dbs = dbs, layer = layer) - expect_is(df, "list") - expect_equal(length(df), 1) - expect_identical(names(df), c("transcriptome")) + expect_is(df, "list") + expect_equal(length(df), 1) + expect_identical(names(df), c("transcriptome")) - expect_match(names(df$transcriptome), "^\\(KEGG\\)") + expect_match(names(df$transcriptome), "^\\(KEGG\\)") }) test_that("proteomic features get mapped", { - layer <- c("proteome") - dbs <- c("kegg") + layer <- c("proteome") + dbs <- c("kegg") - df <- getMultiOmicsFeatures(dbs = dbs, layer = layer) + df <- getMultiOmicsFeatures(dbs = dbs, layer = layer) - expect_is(df, "list") - expect_equal(length(df), 1) - expect_identical(names(df), c("proteome")) + expect_is(df, "list") + expect_equal(length(df), 1) + expect_identical(names(df), c("proteome")) - expect_match(names(df$proteome), "^\\(KEGG\\)") + expect_match(names(df$proteome), "^\\(KEGG\\)") }) test_that("metabolomic features get mapped", { - layer <- c("metabolome") - dbs <- c("kegg") + layer <- c("metabolome") + dbs <- c("kegg") - df <- getMultiOmicsFeatures(dbs = dbs, layer = layer) + df <- getMultiOmicsFeatures(dbs = dbs, layer = layer) - expect_is(df, "list") - expect_equal(length(df), 1) - expect_identical(names(df), c("metabolome")) + expect_is(df, "list") + expect_equal(length(df), 1) + expect_identical(names(df), c("metabolome")) - expect_match(names(df$metabolome), "^\\(KEGG\\)") + expect_match(names(df$metabolome), "^\\(KEGG\\)") }) test_that("each layer has equal length", { - dbs <- c("kegg") - df <- getMultiOmicsFeatures(dbs = dbs, organism = "hsapiens") + dbs <- c("kegg") + df <- getMultiOmicsFeatures(dbs = dbs, organism = "hsapiens") - expect_equal(length(df), 3) - expect_equal(length(df$transcriptome), length(df$proteome)) - expect_equal(length(df$transcriptome), length(df$metabolome)) + expect_equal(length(df), 3) + expect_equal(length(df$transcriptome), length(df$proteome)) + expect_equal(length(df$transcriptome), length(df$metabolome)) }) diff --git a/vignettes/multiGSEA.rmd b/vignettes/multiGSEA.rmd index 9af0954..cd12220 100644 --- a/vignettes/multiGSEA.rmd +++ b/vignettes/multiGSEA.rmd @@ -59,15 +59,14 @@ Hence, the best way to install the package is via `BiocManager`. Start R (>=4.0.0) and run the following commands: ```{r bioconductor, eval=FALSE} - -if (!requireNamespace("BiocManager", quietly = TRUE)) - install.packages("BiocManager") +if (!requireNamespace("BiocManager", quietly = TRUE)) { + install.packages("BiocManager") +} # The following initializes usage of Bioc devel -BiocManager::install(version='devel') +BiocManager::install(version = "devel") BiocManager::install("multiGSEA") - ``` Alternatively, the `multiGSEA` package is hosted on our github page @@ -75,10 +74,8 @@ https://github.com/yigbt/multiGSEA and can be installed via the `devtools` package: ```{r devtools, eval=FALSE} - install.packages("devtools") devtools::install_github("https://github.com/yigbt/multiGSEA") - ``` Once installed, just load the `multiGSEA` package with: @@ -111,7 +108,7 @@ omics data was measured in mouse or rat, we would need the packages respectively. ```{r load_mapping_library, results='hide', warning=FALSE, message=FALSE} -library( "org.Hs.eg.db") +library("org.Hs.eg.db") ``` In principle, `multiGSEA` is able to deal with 11 different model organisms. @@ -121,30 +118,37 @@ their respective `AnnotationDbi` package is shown in Table ```{r organismsTable, results='asis', echo=FALSE} caption <- "Supported organisms, their abbreviations being used in `multiGSEA`, - and mapping database that are needed for feature conversion. + and mapping database that are needed for feature conversion. Supported abbreviations can be seen with `getOrganisms()`" -df <- data.frame( Organisms = c( "Human", "Mouse", "Rat", "Dog", "Cow", "Pig", - "Chicken", "Zebrafish", "Frog", "Fruit Fly", - "C.elegans"), - Abbreviations = c( "hsapiens", "mmusculus", "rnorvegicus", - "cfamiliaris", "btaurus", "sscrofa", - "ggallus", "drerio", "xlaevis", - "dmelanogaster", "celegans"), - Mapping = c( "org.Hs.eg.db", "org.Mm.eg.db", "org.Rn.eg.db", - "org.Cf.eg.db", "org.Bt.eg.db", "org.Ss.eg.db", - "org.Gg.eg.db", "org.Xl.eg.db", "org.Dr.eg.db", - "org.Dm.eg.db", "org.Ce.eg.db")) - -knitr::kable( df, caption = caption) +df <- data.frame( + Organisms = c( + "Human", "Mouse", "Rat", "Dog", "Cow", "Pig", + "Chicken", "Zebrafish", "Frog", "Fruit Fly", + "C.elegans" + ), + Abbreviations = c( + "hsapiens", "mmusculus", "rnorvegicus", + "cfamiliaris", "btaurus", "sscrofa", + "ggallus", "drerio", "xlaevis", + "dmelanogaster", "celegans" + ), + Mapping = c( + "org.Hs.eg.db", "org.Mm.eg.db", "org.Rn.eg.db", + "org.Cf.eg.db", "org.Bt.eg.db", "org.Ss.eg.db", + "org.Gg.eg.db", "org.Xl.eg.db", "org.Dr.eg.db", + "org.Dm.eg.db", "org.Ce.eg.db" + ) +) +knitr::kable(df, caption = caption) ``` To run the analysis of this vignette, load the installed version of `multiGSEA`. ```{r load_multiGSEA_package, results='hide', message=FALSE, warning=FALSE} -library( multiGSEA) -library( magrittr) +library(multiGSEA) +library(magrittr) ``` Load the omics data for each layer where an enrichment should be calculated. @@ -152,16 +156,14 @@ The example data is provided within the package and already preprocessed such that we have log2 transformed fold changes and their associated p-values. ```{r load_omics_data} - # load transcriptomic features -data( transcriptome) +data(transcriptome) # load proteomic features -data( proteome) +data(proteome) # load metabolomic features -data( metabolome) - +data(metabolome) ``` @@ -186,14 +188,14 @@ The header of the data frame can be seen in Table \@ref(tab:omicsTable): ```{r omicsTable, results='asis', echo=FALSE} caption <- "Structure of the necessary omics data. For each layer - (here: transcriptome), feature IDs, log2FCs, and p-values + (here: transcriptome), feature IDs, log2FCs, and p-values are needed." -knitr::kable( - transcriptome %>% - dplyr::arrange( Symbol) %>% - dplyr::slice( 1:6), - caption = caption +knitr::kable( + transcriptome %>% + dplyr::arrange(Symbol) %>% + dplyr::slice(1:6), + caption = caption ) ``` @@ -219,38 +221,41 @@ shown by Zyla _et al._ [@Zyla:2017], the choice of the applied metric might have a big impact on the outcome of your analysis. ```{r rank_features, results='hide'} - # create data structure -omics_data <- initOmicsDataStructure( layer = c("transcriptome", - "proteome", - "metabolome")) +omics_data <- initOmicsDataStructure(layer = c( + "transcriptome", + "proteome", + "metabolome" +)) ## add transcriptome layer -omics_data$transcriptome <- rankFeatures( transcriptome$logFC, - transcriptome$pValue) -names( omics_data$transcriptome) <- transcriptome$Symbol +omics_data$transcriptome <- rankFeatures( + transcriptome$logFC, + transcriptome$pValue +) +names(omics_data$transcriptome) <- transcriptome$Symbol ## add proteome layer omics_data$proteome <- rankFeatures(proteome$logFC, proteome$pValue) -names( omics_data$proteome) <- proteome$Symbol +names(omics_data$proteome) <- proteome$Symbol ## add metabolome layer ## HMDB features have to be updated to the new HMDB format omics_data$metabolome <- rankFeatures(metabolome$logFC, metabolome$pValue) -names( omics_data$metabolome) <- metabolome$HMDB -names( omics_data$metabolome) <- gsub( "HMDB", "HMDB00", - names( omics_data$metabolome)) - +names(omics_data$metabolome) <- metabolome$HMDB +names(omics_data$metabolome) <- gsub( + "HMDB", "HMDB00", + names(omics_data$metabolome) +) ``` The first elements of each omics layer are shown below: ```{r omics_short} - -omics_short <- lapply( names( omics_data), function( name){ - head( omics_data[[name]]) - }) -names( omics_short) <- names( omics_data) +omics_short <- lapply(names(omics_data), function(name) { + head(omics_data[[name]]) +}) +names(omics_short) <- names(omics_data) omics_short ``` @@ -261,25 +266,25 @@ layer we are interested in before pathway definitions are downloaded and features are mapped to the desired format. ```{r calculate_enrichment, results='hide', message=FALSE, warning=FALSE} - -databases <- c( "kegg", "reactome") -layers <- names( omics_data) - -pathways <- getMultiOmicsFeatures( dbs = databases, layer = layers, - returnTranscriptome = "SYMBOL", - returnProteome = "SYMBOL", - returnMetabolome = "HMDB", - useLocal = FALSE) +databases <- c("kegg", "reactome") +layers <- names(omics_data) + +pathways <- getMultiOmicsFeatures( + dbs = databases, layer = layers, + returnTranscriptome = "SYMBOL", + returnProteome = "SYMBOL", + returnMetabolome = "HMDB", + useLocal = FALSE +) ``` The first two pathway definitions of each omics layer are shown below: ```{r pathways_short} - -pathways_short <- lapply( names( pathways), function( name){ - head( pathways[[name]], 2) - }) -names( pathways_short) <- names( pathways) +pathways_short <- lapply(names(pathways), function(name) { + head(pathways[[name]], 2) +}) +names(pathways_short) <- names(pathways) pathways_short ``` @@ -293,11 +298,9 @@ step we can use the `multiGSEA` function to calculate the enrichment for all omics layer separately. ```{r run_enrichment, results='hide', message=FALSE, warning=FALSE} - # use the multiGSEA function to calculate the enrichment scores # for all omics layer at once. -enrichment_scores <- multiGSEA( pathways, omics_data) - +enrichment_scores <- multiGSEA(pathways, omics_data) ``` The enrichment score data structure is a list containing sublists named @@ -324,15 +327,15 @@ opposite direction by favoring large p-values [@Edgington:1972]. Those methods can be applied by setting the parameter `method` to "fisher" or "edgington". ```{r combine_pvalues} +df <- extractPvalues( + enrichmentScores = enrichment_scores, + pathwayNames = names(pathways[[1]]) +) -df <- extractPvalues( enrichmentScores = enrichment_scores, - pathwayNames = names( pathways[[1]])) - -df$combined_pval <- combinePvalues( df) -df$combined_padj <- p.adjust( df$combined_pval, method = "BH") - -df <- cbind( data.frame( pathway = names( pathways[[1]])), df) +df$combined_pval <- combinePvalues(df) +df$combined_padj <- p.adjust(df$combined_pval, method = "BH") +df <- cbind(data.frame(pathway = names(pathways[[1]])), df) ``` Finally, print the pathways sorted based on their combined adjusted p-values. @@ -342,17 +345,17 @@ For displaying reasons, only the adjusted p-values are shown in Table ```{r resultTable, results='asis', echo=FALSE} caption <- "Table summarizing the top 15 pathways where we can calculate an enrichment for all three layers . Pathways from KEGG, Reactome, - and BioCarta are listed based on their aggregated adjusted p-value. - Corrected p-values are displayed for each omics layer separately and + and BioCarta are listed based on their aggregated adjusted p-value. + Corrected p-values are displayed for each omics layer separately and aggregated by means of the Stouffer's Z-method." -knitr::kable( - df %>% - dplyr::arrange( combined_padj) %>% - dplyr::filter( !is.na(metabolome_padj) ) %>% - dplyr::select( c( pathway, transcriptome_padj, proteome_padj, metabolome_padj, combined_pval)) %>% - dplyr::slice( 1:15), - caption = caption +knitr::kable( + df %>% + dplyr::arrange(combined_padj) %>% + dplyr::filter(!is.na(metabolome_padj)) %>% + dplyr::select(c(pathway, transcriptome_padj, proteome_padj, metabolome_padj, combined_pval)) %>% + dplyr::slice(1:15), + caption = caption ) ``` @@ -368,17 +371,15 @@ In the following example, HALLMARK gene sets are retrieved from `MSigDB` and used to create a transcriptomics input list: ```{r msigdbr, eval=FALSE} - library(msigdbr) library(dplyr) hallmark <- msigdbr(species = "Rattus norvegicus", category = "H") %>% - dplyr::select( gs_name, ensembl_gene) %>% - dplyr::filter( !is.na(ensembl_gene)) %>% - unstack( ensembl_gene ~ gs_name) - -pathways <- list( "transcriptome" = hallmark) + dplyr::select(gs_name, ensembl_gene) %>% + dplyr::filter(!is.na(ensembl_gene)) %>% + unstack(ensembl_gene ~ gs_name) +pathways <- list("transcriptome" = hallmark) ``` **Please note**, feature sets across multiple omics layers have to be in the From fdf104407852872db9cc91ef171cbeb609aacdd2 Mon Sep 17 00:00:00 2001 From: Sebastian Canzler Date: Thu, 27 Apr 2023 09:07:50 +0200 Subject: [PATCH 13/15] Version bump in devel. --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 269f693..fbe042e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: multiGSEA Type: Package Title: Combining GSEA-based pathway enrichment with multi omics data integration -Version: 1.11.0 +Version: 1.11.1 Date: 2020-03-05 Authors@R: c( person( "Sebastian", "Canzler", email = "sebastian.canzler@ufz.de", role = c("aut", "cre"), comment = c(ORCID = "0000-0001-7935-9582")), From 487f1d4ff001adaf2dbd140f8bb8351130c8a008 Mon Sep 17 00:00:00 2001 From: Sebastian Canzler Date: Thu, 27 Apr 2023 09:59:12 +0200 Subject: [PATCH 14/15] Re-apply styler, escape colname in tibble. --- R/enrichment_functions.R | 2 +- R/feature_processing.R | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/R/enrichment_functions.R b/R/enrichment_functions.R index 2eb55bf..1321fe6 100644 --- a/R/enrichment_functions.R +++ b/R/enrichment_functions.R @@ -96,7 +96,7 @@ extractPvalues <- function(enrichmentScores, pathwayNames) { })) }) - # Combine list elements to data frame + # Combine list elements to data frame # and assign pathway names as rownames res <- data.frame(do.call(rbind, res)) diff --git a/R/feature_processing.R b/R/feature_processing.R index 6a0cadc..98a31b7 100644 --- a/R/feature_processing.R +++ b/R/feature_processing.R @@ -268,7 +268,7 @@ getMetaboliteMapping <- function(features, keytype, returntype = "HMDB") { dplyr::distinct() colnames(SUBmappingTable) <- c("Original", "Mapped") - SUBmappingTable %>% dplyr::pull(Mapped) + SUBmappingTable %>% dplyr::pull("Mapped") }, error = function(cond) { return( @@ -616,7 +616,7 @@ getIDMappingDatabase <- function(organism) { #' logFC <- rnorm(10) #' pvalues <- runif(10) #' rankFeatures(logFC, pvalues) -#' +#' #' @export rankFeatures <- function(logFC, pvalues, base = 10) { return(sign(logFC) * -log(pvalues, base = base)) @@ -670,7 +670,7 @@ initOmicsDataStructure <- function(layer = c("transcriptome", "proteome", "metab #' @param pathways List of pathway databases and their pathway definition. #' #' @return List of metabolite ID formats. -#' +#' #' @importFrom graphite nodes getMetaboliteIDformats <- function(pathways) { n1 <- lapply(names(pathways), function(dbs) { From cb8d1a1ffd15011109d46b7bb9321aa7440d2118 Mon Sep 17 00:00:00 2001 From: Sebastian Canzler Date: Thu, 27 Apr 2023 10:00:12 +0200 Subject: [PATCH 15/15] Version bump in devel. --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index fbe042e..5563862 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: multiGSEA Type: Package Title: Combining GSEA-based pathway enrichment with multi omics data integration -Version: 1.11.1 +Version: 1.11.2 Date: 2020-03-05 Authors@R: c( person( "Sebastian", "Canzler", email = "sebastian.canzler@ufz.de", role = c("aut", "cre"), comment = c(ORCID = "0000-0001-7935-9582")),