From 1b49012a7ea11205e6183b504bf6e7d4ae6a14cd Mon Sep 17 00:00:00 2001 From: HediaTnani Date: Fri, 12 Jan 2024 14:01:15 -0500 Subject: [PATCH] Add a function check_tx_names to change sig_transcripts if ENSEMBL Co-authored-by: Nick Eagles --- R/getDegTx.R | 21 +++------------------ R/utils.R | 44 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 18 deletions(-) create mode 100644 R/utils.R diff --git a/R/getDegTx.R b/R/getDegTx.R index fa64584..08f1080 100644 --- a/R/getDegTx.R +++ b/R/getDegTx.R @@ -45,25 +45,10 @@ getDegTx <- function(rse_tx, type = c("cell_component", "standard", "top1500"), stop("rse_tx must be a RangedSummarizedExperiment object.") } - # Check if any gene in sig_transcripts is in rownames(rse_tx) - if (!any(sig_transcripts %in% rownames(rse_tx) | (gsub('\\..*', '', sig_transcripts) %in% rownames(rse_tx)))) { - stop("sig_transcripts and rownames of rse_tx object do not match") - } - - # Check if all rownames start with "ENST" - if (!all(grepl("^ENST", rownames(rse_tx)))) { - stop("Some rownames do not start with 'ENST'.", call. = FALSE) - } - - # Check patterns and perform operations based on the patterns - is_gencode = all(grepl("^ENST.*?\\.", rownames(rse_tx))) - is_ensembl = all(grepl("^ENST", rownames(rse_tx)) & !grepl("\\.", rownames(rse_tx))) - if (is_ensembl) { - sig_transcripts <- gsub("\\..*", "", sig_transcripts) - } else if (!is_gencode) { - stop("Rownames must all be ENSEMBL or GENCODE transcript IDs.") - } + # Check for validity and matching of tx names + sig_transcripts = check_tx_names(rownames(rse_tx), sig_transcripts, 'rownames(rse_tx)', 'sig_transcripts') + # Subset rse_tx to include sig_transcripts rse_tx <- rse_tx[rownames(rse_tx) %in% sig_transcripts, , drop = FALSE] # Check if the row means is greater than 1 diff --git a/R/utils.R b/R/utils.R new file mode 100644 index 0000000..5988f8d --- /dev/null +++ b/R/utils.R @@ -0,0 +1,44 @@ +#' Check validity of transcript vectors + +#' @export + + +check_tx_names = function(tx1, tx2, arg_name1, arg_name2) { + # Functions for checking whether a vector of transcripts all match GENCODE + # or ENSEMBL naming conventions + is_gencode = function(x) all(grepl("^ENST.*?\\.", x)) + is_ensembl = function(x) all(grepl("^ENST", x) & !grepl("\\.", x)) + + # Check that both vectors either follow GENCODE or ENSEMBL + if (!is_gencode(tx1) && !is_ensembl(tx1)) { + stop( + sprintf( + "'%s' must use either all GENCODE or all ENSEMBL transcript IDs", + arg_name1 + ) + ) + } + if (!is_gencode(tx2) && !is_ensembl(tx2)) { + stop( + sprintf( + "'%s' must use either all GENCODE or all ENSEMBL transcript IDs", + arg_name2 + ) + ) + } + + # Change 'tx2' to match 'tx1', noting that the case where 'tx1' is GENCODE + # but 'tx2' is ENSEMBL is not allowed (and an error will be thrown further + # down) + if (is_gencode(tx2) && is_ensembl(tx1)) { + tx2 = sub('\\..*', '', tx2) + } + + # At least some transcripts must overlap between 'tx1' and 'tx2' + if (!any(tx2 %in% tx1)) { + stop(sprintf("None of '%s' are in '%s'", arg_name2, arg_name1)) + } + + # Since only 'tx2' was modified, return the changed copy + return(tx2) +} \ No newline at end of file