From 81723a2f22f938ecb93323ad65bd1baf8963cae1 Mon Sep 17 00:00:00 2001
From: Geo Pertea <geo.pertea@gmail.com>
Date: Mon, 16 Sep 2024 10:09:16 -0400
Subject: [PATCH] simplify/improve ENSEMBL base transcript name matching

---
 .gitignore   |  1 +
 R/getDegTx.R | 33 ++++++++++++++++++++-------------
 R/utils.R    | 51 ++++++++++++---------------------------------------
 3 files changed, 33 insertions(+), 52 deletions(-)

diff --git a/.gitignore b/.gitignore
index 8b78a3a..1fb1ed6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,4 +4,5 @@
 .Rdata
 .httr-oauth
 .DS_Store
+dcs04_data
 inst/doc
diff --git a/R/getDegTx.R b/R/getDegTx.R
index 9a65b17..6c7cc2f 100644
--- a/R/getDegTx.R
+++ b/R/getDegTx.R
@@ -31,28 +31,35 @@
 #' @import rlang
 #'
 #' @examples
-#' getDegTx(covComb_tx_deg)
-#' stopifnot(mean(rowMeans(assays(covComb_tx_deg)$tpm)) > 1)
-getDegTx <- function(rse_tx, type = c("cell_component", "standard", "top1500"), sig_transcripts = select_transcripts(type), assayname = "tpm") {
-  
-  type = arg_match(type)
-  
+#' degTx <- getDegTx(rse_tx, "standard")
+getDegTx <- function(rse_tx, type = c("cell_component", "standard", "top1500"), sig_transcripts = NULL, assayname = "tpm") {
+
+  #type = arg_match(type)
+  if (is.null(sig_transcripts)) {
+    type = arg_match(type)
+    sig_transcripts <- select_transcripts(type)
+  } else {
+    type = "custom"
+  }
   # Validate rse_tx is a RangedSummarizedExperiment object
   if (!is(rse_tx, "RangedSummarizedExperiment")) {
     stop("'rse_tx' must be a RangedSummarizedExperiment object.", call. = FALSE)
   }
-  
+
   # Check if assayname is in assayNames
   if (!assayname %in% assayNames(rse_tx)) {
     stop(sprintf("'%s' is not in assayNames(rse_tx).", assayname), call. = FALSE)
   }
-  
+
   # Check for validity and matching of tx names
-  sig_transcripts = check_tx_names(rownames(rse_tx), sig_transcripts, 'rownames(rse_tx)', 'sig_transcripts')
-  
-  # Subset rse_tx to include sig_transcripts
-  rse_tx <- rse_tx[rownames(rse_tx) %in% sig_transcripts, , drop = FALSE]
-  
+  wtx <- check_tx_names(rownames(rse_tx), sig_transcripts)
+  if (length(wtx) < 10) {
+    stop("Not enough transcript names were found in the '",type, "' degradation model transcripts" )
+  }
+  # if (verbose)
+  message("   '",type,"' degradation model transcripts found: ", length(wtx))
+  rse_tx <- rse_tx[wtx, , drop = FALSE]
+
   # Check if the row means is greater than 1
   if (mean(rowMeans(assays(rse_tx)[[assayname]])) < 1) {
     warning("The transcripts selected are lowly expressed in your dataset. This can impact downstream analysis.")
diff --git a/R/utils.R b/R/utils.R
index 57cc94b..fbd70cc 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -14,46 +14,19 @@
 #' @export
 #'
 #' @examples
-#' sig_transcripts = select_transcripts("cell_component")
-#' check_tx_names(rownames(covComb_tx_deg), sig_transcripts, 'rownames(covComb_tx_deg)', 'sig_transcripts')
+#' sig_tx <-  select_transcripts("cell_component")
+#' whichTx <- check_tx_names(rownames(rse_tx), sig_tx)
 
-
-check_tx_names = function(tx1, tx2, arg_name1, arg_name2) {
+check_tx_names = function(txnames, sig_transcripts) {
   #   Functions for checking whether a vector of transcripts all match GENCODE
   #   or ENSEMBL naming conventions
-  is_gencode = function(x) all(grepl("^ENST.*?\\.", x))
-  is_ensembl = function(x) all(grepl("^ENST", x) & !grepl("\\.", x))
-  
-  #   Check that both vectors either follow GENCODE or ENSEMBL
-  if (!is_gencode(tx1) && !is_ensembl(tx1)) {
-    stop(
-      sprintf(
-        "'%s' must use either all GENCODE or all ENSEMBL transcript IDs",
-        arg_name1
-      )
-    )
-  }
-  if (!is_gencode(tx2) && !is_ensembl(tx2)) {
-    stop(
-      sprintf(
-        "'%s' must use either all GENCODE or all ENSEMBL transcript IDs",
-        arg_name2
-      )
-    )
-  }
-  
-  #   Change 'tx2' to match 'tx1', noting that the case where 'tx1' is GENCODE
-  #   but 'tx2' is ENSEMBL is not allowed (and an error will be thrown further
-  #   down)
-  if (is_gencode(tx2) && is_ensembl(tx1)) {
-    tx2 = sub('\\..*', '', tx2)
-  }
-  
-  #   At least some transcripts must overlap between 'tx1' and 'tx2'
-  if (!any(tx2 %in% tx1)) {
-    stop(sprintf("None of '%s' are in '%s'", arg_name2, arg_name1))
+  ## Between releases 25 and 43, PAR genes and transcripts had the "_PAR_Y" suffix appended to their identifiers.
+  ## Since release 44, these have their own IDs
+  if (!all(grepl("^ENST\\d+", txnames))) {
+    stop("The transcript names must be ENSEMBL or Gencode IDs (ENST...)" )
   }
-  
-  #   Since only 'tx2' was modified, return the changed copy
-  return(tx2)
-}
\ No newline at end of file
+  ## normalize the transcript names
+  sig_tx <- sub('(ENST\\d+)\\.\\d+(.*)$','\\1\\2', sig_transcripts, perl=TRUE)
+  r_tx <- sub('(ENST\\d+)\\.\\d+(.*)$','\\1\\2', txnames, perl=TRUE)
+  which(r_tx %in% sig_tx)
+}