From 1b49012a7ea11205e6183b504bf6e7d4ae6a14cd Mon Sep 17 00:00:00 2001
From: HediaTnani <hediatnani0@gmail.com>
Date: Fri, 12 Jan 2024 14:01:15 -0500
Subject: [PATCH] Add a function check_tx_names to change sig_transcripts if
 ENSEMBL

Co-authored-by: Nick Eagles <nick.eagles@libd.org>
---
 R/getDegTx.R | 21 +++------------------
 R/utils.R    | 44 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 47 insertions(+), 18 deletions(-)
 create mode 100644 R/utils.R

diff --git a/R/getDegTx.R b/R/getDegTx.R
index fa64584..08f1080 100644
--- a/R/getDegTx.R
+++ b/R/getDegTx.R
@@ -45,25 +45,10 @@ getDegTx <- function(rse_tx, type = c("cell_component", "standard", "top1500"),
     stop("rse_tx must be a RangedSummarizedExperiment object.")
   }
   
-  # Check if any gene in sig_transcripts is in rownames(rse_tx)
-  if (!any(sig_transcripts %in% rownames(rse_tx) | (gsub('\\..*', '', sig_transcripts) %in% rownames(rse_tx)))) {
-    stop("sig_transcripts and rownames of rse_tx object do not match")
-  }
-  
-  # Check if all rownames start with "ENST"
-  if (!all(grepl("^ENST", rownames(rse_tx)))) {
-    stop("Some rownames do not start with 'ENST'.", call. = FALSE)
-  }
-  
-  # Check patterns and perform operations based on the patterns
-  is_gencode = all(grepl("^ENST.*?\\.", rownames(rse_tx)))
-  is_ensembl = all(grepl("^ENST", rownames(rse_tx)) & !grepl("\\.", rownames(rse_tx)))
-  if (is_ensembl) {
-    sig_transcripts <- gsub("\\..*", "", sig_transcripts)
-  } else if (!is_gencode) {
-    stop("Rownames must all be ENSEMBL or GENCODE transcript IDs.")
-  }
+  # Check for validity and matching of tx names
+  sig_transcripts = check_tx_names(rownames(rse_tx), sig_transcripts, 'rownames(rse_tx)', 'sig_transcripts')
   
+  # Subset rse_tx to include sig_transcripts
   rse_tx <- rse_tx[rownames(rse_tx) %in% sig_transcripts, , drop = FALSE]
   
   # Check if the row means is greater than 1
diff --git a/R/utils.R b/R/utils.R
new file mode 100644
index 0000000..5988f8d
--- /dev/null
+++ b/R/utils.R
@@ -0,0 +1,44 @@
+#' Check validity of transcript vectors
+#'
+#' Verifies that `tx1` and `tx2` each use only GENCODE (versioned, e.g.
+#' "ENST00000456328.2") or only ENSEMBL (unversioned) transcript IDs, strips
+#' version suffixes from `tx2` when `tx1` is ENSEMBL, and requires overlap.
+#'
+#' @param tx1 Character vector of reference transcript IDs; never modified.
+#' @param tx2 Character vector of transcript IDs to reconcile with `tx1`.
+#' @param arg_name1,arg_name2 Labels for `tx1`/`tx2` used in error messages.
+#' @return `tx2`, with version suffixes removed if `tx1` is ENSEMBL and `tx2`
+#'   is GENCODE; otherwise unchanged. Errors on bad formats or no overlap.
+#' @export
+check_tx_names <- function(tx1, tx2, arg_name1, arg_name2) {
+  # GENCODE: every ID is "ENST..." followed by a "."; ENSEMBL: every ID
+  # starts with "ENST" and contains no "." at all
+  is_gencode <- function(x) all(grepl("^ENST.*?\\.", x))
+  is_ensembl <- function(x) all(grepl("^ENST", x) & !grepl("\\.", x))
+  bad_format <- "'%s' must use either all GENCODE or all ENSEMBL transcript IDs"
+
+  # Classify each vector once (each check scans the whole vector) and reuse
+  # the flags below
+  tx1_is_ensembl <- is_ensembl(tx1)
+  if (!tx1_is_ensembl && !is_gencode(tx1)) {
+    stop(sprintf(bad_format, arg_name1))
+  }
+  tx2_is_gencode <- is_gencode(tx2)
+  if (!tx2_is_gencode && !is_ensembl(tx2)) {
+    stop(sprintf(bad_format, arg_name2))
+  }
+
+  # Change 'tx2' to match 'tx1'. The reverse case ('tx1' GENCODE, 'tx2'
+  # ENSEMBL) cannot be reconciled and is rejected by the overlap check below
+  if (tx2_is_gencode && tx1_is_ensembl) {
+    tx2 <- sub("\\..*", "", tx2)
+  }
+
+  # At least some transcripts must overlap between 'tx1' and 'tx2'
+  if (!any(tx2 %in% tx1)) {
+    stop(sprintf("None of '%s' are in '%s'", arg_name2, arg_name1))
+  }
+
+  # Only 'tx2' may have been modified, so it is the value returned
+  tx2
+}
\ No newline at end of file