filtering the example rse_tx for the vignette

LieberInstitute · Sep 22, 2024 · 5b1f5d7 · 5b1f5d7
1 parent 81723a2
commit 5b1f5d7
Show file tree

Hide file tree

Showing 8 changed files with 60 additions and 61 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -5,3 +5,8 @@
 ^\.github$
 ^data-raw$
 ^codecov\.yml$
+^\.Rhistory$
+^\.Rdata$
+^\.httr-oauth$
+^\.DS_Store$
+^dcs04_data$
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: qsvaR
 Title: Generate Quality Surrogate Variable Analysis for Degradation Correction
-Version: 1.9.0
-Date: 2024-05-03
+Version: 1.9.1
+Date: 2024-09-16
 Authors@R: 
     c(
     person("Joshua", "Stolz", email = "[email protected]",
@@ -25,7 +25,7 @@ biocViews: Software, WorkflowStep, Normalization, BiologicalQuestion,
     DifferentialExpression, Sequencing, Coverage
 Encoding: UTF-8
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.3.0
+RoxygenNote: 7.3.2
 Suggests: 
     BiocFileCache,
     BiocStyle,

diff --git a/NAMESPACE b/NAMESPACE
@@ -1,13 +1,14 @@
 # Generated by roxygen2: do not edit by hand
 
 export(DEqual)
-export(check_tx_names)
 export(getDegTx)
 export(getPCs)
 export(get_qsvs)
 export(k_qsvs)
+export(normalize_tx_names)
 export(qSVA)
 export(select_transcripts)
+export(which_tx_names)
 import(SummarizedExperiment)
 import(ggplot2)
 import(rlang)

diff --git a/R/DEqual.R b/R/DEqual.R
@@ -43,33 +43,34 @@ DEqual <- function(DE) {
     ## Check input
     # stopifnot("t" %in% colnames(DE))
     # stopifnot(!is.null(rownames(DE)))
-    
+
     # Check if input is a dataframe
-    if (!is.data.frame(DE)) { 
-      stop("The input to DEqual is not a dataframe.", call. = FALSE) 
+    if (!is.data.frame(DE)) {
+      stop("The input to DEqual is not a dataframe.", call. = FALSE)
       }
 
     # Check if 't' is in the column names of DE
     if (!("t" %in% colnames(DE))) {
         stop("'t' is not a column in 'DE'.", call. = FALSE)
       }
-    
+
     # Check if DE has non-null row names
     if (is.null(rownames(DE))) {
         stop("Row names of 'DE' are NULL.", call. = FALSE)
       }
-    
+
     ## Locate common transcripts
     deg_tstats = qsvaR::degradation_tstats
-    rownames(deg_tstats) = check_tx_names(rownames(DE),rownames(qsvaR::degradation_tstats),'rownames(DE)','qsvaR::degradation_tstats')
-    common = intersect(rownames(deg_tstats), rownames(DE))
-
+    whichTx <- which_tx_names(rownames(DE),rownames(deg_tstats))
+    #rownames(deg_tstats) = check_tx_names(rownames(DE),rownames(qsvaR::degradation_tstats),'rownames(DE)','qsvaR::degradation_tstats')
+    #common = intersect(rownames(deg_tstats), rownames(DE))
+    common = qsvaR::normalize_tx_names(rownames(DE)[whichTx])
     stopifnot(length(common) > 0)
-
+    rownames(deg_tstats) <- qsvaR::normalize_tx_names(rownames(deg_tstats))
     ## Create dataframe with common transcripts
     common_data <- data.frame(
-        degradation_t = deg_tstats$t[match(common, rownames(deg_tstats))],
-        DE_t = DE$t[match(common, rownames(DE))]
+        degradation_t = deg_tstats[common, ]$t,
+        DE_t = DE[whichTx, ]$t
     )
     p <- ggplot(common_data, aes(x = DE_t, y = degradation_t)) +
         xlab("DE t-statistic") +

diff --git a/R/getDegTx.R b/R/getDegTx.R
@@ -51,8 +51,8 @@ getDegTx <- function(rse_tx, type = c("cell_component", "standard", "top1500"),
     stop(sprintf("'%s' is not in assayNames(rse_tx).", assayname), call. = FALSE)
   }
 
-  # Check for validity and matching of tx names
-  wtx <- check_tx_names(rownames(rse_tx), sig_transcripts)
+  # Check for validity and matching of tx names and return the tx subset indexes in rse_tx
+  wtx <- which_tx_names(rownames(rse_tx), sig_transcripts)
   if (length(wtx) < 10) {
     stop("Not enough transcript names were found in the '",type, "' degradation model transcripts" )
   }

diff --git a/R/utils.R b/R/utils.R
@@ -1,32 +1,53 @@
-#' Check validity of transcript vectors
+
+
+
+#' Remove version number from Gencode/Ensembl transcript names
 #'
-#' This function is used to check if the tx1 and tx2 are GENCODE or ENSEMBL and print an error message if it's not and return a character vector of transcripts in tx2 that are in tx1.
+#' This function removes the Gencode/ENSEMBL version from the transcript ID, while protecting _PAR_Y suffixes if present
 #'
-#' @param tx1 A `character()` vector of GENCODE or ENSEMBL transcripts.
-#' @param tx2 A `character()` vector of GENCODE or ENSEMBL transcripts.
+#' @param txnames A `character()` vector of GENCODE or ENSEMBL transcript IDs
 #'
-#' @param arg_name1 A `character(1)` vector of description of tx1
-#' @param arg_name2 A `character(1)` vector of description of tx2
 #'
 #' @return A
-#'  `character()` vector of transcripts in `tx2` that are in `tx1`.
+#'  `character()` vector of transcript names without versioning
+#'
+#' @export
+#'
+#' @examples
+#' ensIDs <-  normalize_tx_names(rownames(rse_tx))
+
+normalize_tx_names <- function(txnames) {
+  sub('(ENST\\d+)\\.\\d+(.*)$','\\1\\2', txnames, perl=TRUE)
+}
+
+
+#' Check validity of transcript vectors and return a vector matching indexes in tx1
+#'
+#' This function is used to check if tx1 and tx2 are GENCODE or ENSEMBL transcript IDs
+#' and return an integer vector of tx1 transcript indexes that are in tx2.
+#'
+#' @param tx1 A `character()` vector of GENCODE or ENSEMBL transcript IDs.
+#' @param tx2 A `character()` vector of GENCODE or ENSEMBL transcript IDs.
+#'
+#'
+#' @return A
+#'  `integer()` vector of `tx1` transcript indexes in `tx2`.
 #'
 #' @export
 #'
 #' @examples
 #' sig_tx <-  select_transcripts("cell_component")
-#' whichTx <- check_tx_names(rownames(rse_tx), sig_tx)
+#' whichTx <- which_tx_names(rownames(rse_tx), sig_tx)
 
-check_tx_names = function(txnames, sig_transcripts) {
-  #   Functions for checking whether a vector of transcripts all match GENCODE
-  #   or ENSEMBL naming conventions
+which_tx_names = function(txnames, sig_transcripts) {
   ## Between releases 25 and 43, PAR genes and transcripts had the "_PAR_Y" suffix appended to their identifiers.
   ## Since release 44, these have their own IDs
   if (!all(grepl("^ENST\\d+", txnames))) {
     stop("The transcript names must be ENSEMBL or Gencode IDs (ENST...)" )
   }
   ## normalize the transcript names
-  sig_tx <- sub('(ENST\\d+)\\.\\d+(.*)$','\\1\\2', sig_transcripts, perl=TRUE)
-  r_tx <- sub('(ENST\\d+)\\.\\d+(.*)$','\\1\\2', txnames, perl=TRUE)
+  r_tx <- normalize_tx_names(txnames)
+  sig_tx <- normalize_tx_names(sig_transcripts)
   which(r_tx %in% sig_tx)
 }
+
diff --git a/man/check_tx_names.Rd b/man/check_tx_names.Rd
diff --git a/man/getDegTx.Rd b/man/getDegTx.Rd