From 81723a2f22f938ecb93323ad65bd1baf8963cae1 Mon Sep 17 00:00:00 2001 From: Geo Pertea Date: Mon, 16 Sep 2024 10:09:16 -0400 Subject: [PATCH] simplify/improve ENSEMBL base transcript name matching --- .gitignore | 1 + R/getDegTx.R | 33 ++++++++++++++++++++------------- R/utils.R | 51 ++++++++++++--------------------------------------- 3 files changed, 33 insertions(+), 52 deletions(-) diff --git a/.gitignore b/.gitignore index 8b78a3a..1fb1ed6 100644 --- a/.gitignore +++ b/.gitignore @@ -4,4 +4,5 @@ .Rdata .httr-oauth .DS_Store +dcs04_data inst/doc diff --git a/R/getDegTx.R b/R/getDegTx.R index 9a65b17..6c7cc2f 100644 --- a/R/getDegTx.R +++ b/R/getDegTx.R @@ -31,28 +31,35 @@ #' @import rlang #' #' @examples -#' getDegTx(covComb_tx_deg) -#' stopifnot(mean(rowMeans(assays(covComb_tx_deg)$tpm)) > 1) -getDegTx <- function(rse_tx, type = c("cell_component", "standard", "top1500"), sig_transcripts = select_transcripts(type), assayname = "tpm") { - - type = arg_match(type) - +#' degTx <- getDegTx(rse_tx, "standard") +getDegTx <- function(rse_tx, type = c("cell_component", "standard", "top1500"), sig_transcripts = NULL, assayname = "tpm") { + + #type = arg_match(type) + if (is.null(sig_transcripts)) { + type = arg_match(type) + sig_transcripts <- select_transcripts(type) + } else { + type = "custom" + } # Validate rse_tx is a RangedSummarizedExperiment object if (!is(rse_tx, "RangedSummarizedExperiment")) { stop("'rse_tx' must be a RangedSummarizedExperiment object.", call. = FALSE) } - + # Check if assayname is in assayNames if (!assayname %in% assayNames(rse_tx)) { stop(sprintf("'%s' is not in assayNames(rse_tx).", assayname), call. = FALSE) } - + # Check for validity and matching of tx names - sig_transcripts = check_tx_names(rownames(rse_tx), sig_transcripts, 'rownames(rse_tx)', 'sig_transcripts') - - # Subset rse_tx to include sig_transcripts - rse_tx <- rse_tx[rownames(rse_tx) %in% sig_transcripts, , drop = FALSE] - + wtx <- check_tx_names(rownames(rse_tx), sig_transcripts) + if (length(wtx) < 10) { + stop("Not enough transcript names were found in the '",type, "' degradation model transcripts" ) + } + # if (verbose) + message(" '",type,"' degradation model transcripts found: ", length(wtx)) + rse_tx <- rse_tx[wtx, , drop = FALSE] + # Check if the row means is greater than 1 if (mean(rowMeans(assays(rse_tx)[[assayname]])) < 1) { warning("The transcripts selected are lowly expressed in your dataset. This can impact downstream analysis.") diff --git a/R/utils.R b/R/utils.R index 57cc94b..fbd70cc 100644 --- a/R/utils.R +++ b/R/utils.R @@ -14,46 +14,19 @@ #' @export #' #' @examples -#' sig_transcripts = select_transcripts("cell_component") -#' check_tx_names(rownames(covComb_tx_deg), sig_transcripts, 'rownames(covComb_tx_deg)', 'sig_transcripts') +#' sig_tx <- select_transcripts("cell_component") +#' whichTx <- check_tx_names(rownames(rse_tx), sig_tx) - -check_tx_names = function(tx1, tx2, arg_name1, arg_name2) { +check_tx_names = function(txnames, sig_transcripts) { # Functions for checking whether a vector of transcripts all match GENCODE # or ENSEMBL naming conventions - is_gencode = function(x) all(grepl("^ENST.*?\\.", x)) - is_ensembl = function(x) all(grepl("^ENST", x) & !grepl("\\.", x)) - - # Check that both vectors either follow GENCODE or ENSEMBL - if (!is_gencode(tx1) && !is_ensembl(tx1)) { - stop( - sprintf( - "'%s' must use either all GENCODE or all ENSEMBL transcript IDs", - arg_name1 - ) - ) - } - if (!is_gencode(tx2) && !is_ensembl(tx2)) { - stop( - sprintf( - "'%s' must use either all GENCODE or all ENSEMBL transcript IDs", - arg_name2 - ) - ) - } - - # Change 'tx2' to match 'tx1', noting that the case where 'tx1' is GENCODE - # but 'tx2' is ENSEMBL is not allowed (and an error will be thrown further - # down) - if (is_gencode(tx2) && is_ensembl(tx1)) { - tx2 = sub('\\..*', '', tx2) - } - - # At least some transcripts must overlap between 'tx1' and 'tx2' - if (!any(tx2 %in% tx1)) { - stop(sprintf("None of '%s' are in '%s'", arg_name2, arg_name1)) + ## Between releases 25 and 43, PAR genes and transcripts had the "_PAR_Y" suffix appended to their identifiers. + ## Since release 44, these have their own IDs + if (!all(grepl("^ENST\\d+", txnames))) { + stop("The transcript names must be ENSEMBL or Gencode IDs (ENST...)" ) } - - # Since only 'tx2' was modified, return the changed copy - return(tx2) -} \ No newline at end of file + ## normalize the transcript names + sig_tx <- sub('(ENST\\d+)\\.\\d+(.*)$','\\1\\2', sig_transcripts, perl=TRUE) + r_tx <- sub('(ENST\\d+)\\.\\d+(.*)$','\\1\\2', txnames, perl=TRUE) + which(r_tx %in% sig_tx) +}