diff --git a/.Rbuildignore b/.Rbuildignore index c2c230f..72cab72 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -5,3 +5,8 @@ ^\.github$ ^data-raw$ ^codecov\.yml$ +^\.Rhistory$ +^\.Rdata$ +^\.httr-oauth$ +^\.DS_Store$ +^dcs04_data$ diff --git a/DESCRIPTION b/DESCRIPTION index 1f9f199..9d4781e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: qsvaR Title: Generate Quality Surrogate Variable Analysis for Degradation Correction -Version: 1.9.0 -Date: 2024-05-03 +Version: 1.9.1 +Date: 2024-09-16 Authors@R: c( person("Joshua", "Stolz", email = "jstolz80@gmail.com", @@ -25,7 +25,7 @@ biocViews: Software, WorkflowStep, Normalization, BiologicalQuestion, DifferentialExpression, Sequencing, Coverage Encoding: UTF-8 Roxygen: list(markdown = TRUE) -RoxygenNote: 7.3.0 +RoxygenNote: 7.3.2 Suggests: BiocFileCache, BiocStyle, diff --git a/NAMESPACE b/NAMESPACE index f492de8..efd1f82 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,13 +1,14 @@ # Generated by roxygen2: do not edit by hand export(DEqual) -export(check_tx_names) export(getDegTx) export(getPCs) export(get_qsvs) export(k_qsvs) +export(normalize_tx_names) export(qSVA) export(select_transcripts) +export(which_tx_names) import(SummarizedExperiment) import(ggplot2) import(rlang) diff --git a/R/DEqual.R b/R/DEqual.R index dd535d2..596062b 100644 --- a/R/DEqual.R +++ b/R/DEqual.R @@ -43,33 +43,34 @@ DEqual <- function(DE) { ## Check input # stopifnot("t" %in% colnames(DE)) # stopifnot(!is.null(rownames(DE))) - + # Check if input is a dataframe - if (!is.data.frame(DE)) { - stop("The input to DEqual is not a dataframe.", call. = FALSE) + if (!is.data.frame(DE)) { + stop("The input to DEqual is not a dataframe.", call. = FALSE) } # Check if 't' is in the column names of DE if (!("t" %in% colnames(DE))) { stop("'t' is not a column in 'DE'.", call. = FALSE) } - + # Check if DE has non-null row names if (is.null(rownames(DE))) { stop("Row names of 'DE' are NULL.", call. = FALSE) } - + ## Locate common transcripts deg_tstats = qsvaR::degradation_tstats - rownames(deg_tstats) = check_tx_names(rownames(DE),rownames(qsvaR::degradation_tstats),'rownames(DE)','qsvaR::degradation_tstats') - common = intersect(rownames(deg_tstats), rownames(DE)) - + whichTx <- which_tx_names(rownames(DE),rownames(deg_tstats)) + #rownames(deg_tstats) = check_tx_names(rownames(DE),rownames(qsvaR::degradation_tstats),'rownames(DE)','qsvaR::degradation_tstats') + #common = intersect(rownames(deg_tstats), rownames(DE)) + common = qsvaR::normalize_tx_names(rownames(DE)[whichTx]) stopifnot(length(common) > 0) - + rownames(deg_tstats) <- qsvaR::normalize_tx_names(rownames(deg_tstats)) ## Create dataframe with common transcripts common_data <- data.frame( - degradation_t = deg_tstats$t[match(common, rownames(deg_tstats))], - DE_t = DE$t[match(common, rownames(DE))] + degradation_t = deg_tstats[common, ]$t, + DE_t = DE[whichTx, ]$t ) p <- ggplot(common_data, aes(x = DE_t, y = degradation_t)) + xlab("DE t-statistic") + diff --git a/R/getDegTx.R b/R/getDegTx.R index 6c7cc2f..923c25f 100644 --- a/R/getDegTx.R +++ b/R/getDegTx.R @@ -51,8 +51,8 @@ getDegTx <- function(rse_tx, type = c("cell_component", "standard", "top1500"), stop(sprintf("'%s' is not in assayNames(rse_tx).", assayname), call. = FALSE) } - # Check for validity and matching of tx names - wtx <- check_tx_names(rownames(rse_tx), sig_transcripts) + # Check for validity and matching of tx names and return the tx subset indexes in rse_tx + wtx <- which_tx_names(rownames(rse_tx), sig_transcripts) if (length(wtx) < 10) { stop("Not enough transcript names were found in the '",type, "' degradation model transcripts" ) } diff --git a/R/utils.R b/R/utils.R index fbd70cc..8a44fc8 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1,32 +1,53 @@ -#' Check validity of transcript vectors + + + +#' Remove version number from Gencode/Ensembl transcript names #' -#' This function is used to check if the tx1 and tx2 are GENCODE or ENSEMBL and print an error message if it's not and return a character vector of transcripts in tx2 that are in tx1. +#' This function removes the Gencode/ENSEMBL version from the transcript ID, while protecting _PAR_Y suffixes if present #' -#' @param tx1 A `character()` vector of GENCODE or ENSEMBL transcripts. -#' @param tx2 A `character()` vector of GENCODE or ENSEMBL transcripts. +#' @param txnames A `character()` vector of GENCODE or ENSEMBL transcript IDs #' -#' @param arg_name1 A `character(1)` vector of description of tx1 -#' @param arg_name2 A `character(1)` vector of description of tx2 #' #' @return A -#' `character()` vector of transcripts in `tx2` that are in `tx1`. +#' `character()` vector of transcript names without versioning +#' +#' @export +#' +#' @examples +#' ensIDs <- normalize_tx_names(rownames(rse_tx)) + +normalize_tx_names <- function(txnames) { + sub('(ENST\\d+)\\.\\d+(.*)$','\\1\\2', txnames, perl=TRUE) +} + + +#' Check validity of transcript vectors and return a vector matching indexes in tx1 +#' +#' This function is used to check if tx1 and tx2 are GENCODE or ENSEMBL transcript IDs +#' and return an integer vector of tx1 transcript indexes that are in tx2. +#' +#' @param tx1 A `character()` vector of GENCODE or ENSEMBL transcript IDs. +#' @param tx2 A `character()` vector of GENCODE or ENSEMBL transcript IDs. +#' +#' +#' @return A +#' `integer()` vector of `tx1` transcript indexes in `tx2`. #' #' @export #' #' @examples #' sig_tx <- select_transcripts("cell_component") -#' whichTx <- check_tx_names(rownames(rse_tx), sig_tx) +#' whichTx <- which_tx_names(rownames(rse_tx), sig_tx) -check_tx_names = function(txnames, sig_transcripts) { - # Functions for checking whether a vector of transcripts all match GENCODE - # or ENSEMBL naming conventions +which_tx_names = function(txnames, sig_transcripts) { ## Between releases 25 and 43, PAR genes and transcripts had the "_PAR_Y" suffix appended to their identifiers. ## Since release 44, these have their own IDs if (!all(grepl("^ENST\\d+", txnames))) { stop("The transcript names must be ENSEMBL or Gencode IDs (ENST...)" ) } ## normalize the transcript names - sig_tx <- sub('(ENST\\d+)\\.\\d+(.*)$','\\1\\2', sig_transcripts, perl=TRUE) - r_tx <- sub('(ENST\\d+)\\.\\d+(.*)$','\\1\\2', txnames, perl=TRUE) + r_tx <- normalize_tx_names(txnames) + sig_tx <- normalize_tx_names(sig_transcripts) which(r_tx %in% sig_tx) } + diff --git a/man/check_tx_names.Rd b/man/check_tx_names.Rd deleted file mode 100644 index bc3c515..0000000 --- a/man/check_tx_names.Rd +++ /dev/null @@ -1,28 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils.R -\name{check_tx_names} -\alias{check_tx_names} -\title{Check validity of transcript vectors} -\usage{ -check_tx_names(tx1, tx2, arg_name1, arg_name2) -} -\arguments{ -\item{tx1}{A \code{character()} vector of GENCODE or ENSEMBL transcripts.} - -\item{tx2}{A \code{character()} vector of GENCODE or ENSEMBL transcripts.} - -\item{arg_name1}{A \code{character(1)} vector of description of tx1} - -\item{arg_name2}{A \code{character(1)} vector of description of tx2} -} -\value{ -A -\code{character()} vector of transcripts in \code{tx2} that are in \code{tx1}. -} -\description{ -This function is used to check if the tx1 and tx2 are GENCODE or ENSEMBL and print an error message if it's not and return a character vector of transcripts in tx2 that are in tx1. -} -\examples{ -sig_transcripts = select_transcripts("cell_component") -check_tx_names(rownames(covComb_tx_deg), sig_transcripts, 'rownames(covComb_tx_deg)', 'sig_transcripts') -} diff --git a/man/getDegTx.Rd b/man/getDegTx.Rd index 2765b17..7eec8fd 100644 --- a/man/getDegTx.Rd +++ b/man/getDegTx.Rd @@ -7,7 +7,7 @@ getDegTx( rse_tx, type = c("cell_component", "standard", "top1500"), - sig_transcripts = select_transcripts(type), + sig_transcripts = NULL, assayname = "tpm" ) } @@ -43,6 +43,5 @@ postmortem brain tissues. This object can later be used to obtain the principle necessary to remove the effect of degradation in differential expression. } \examples{ -getDegTx(covComb_tx_deg) -stopifnot(mean(rowMeans(assays(covComb_tx_deg)$tpm)) > 1) +degTx <- getDegTx(rse_tx, "standard") }