Skip to content

Commit

Permalink
Add a function check_tx_names to change sig_transcripts if ENSEMBL
Browse files Browse the repository at this point in the history
Co-authored-by: Nick Eagles <[email protected]>
  • Loading branch information
HediaTnani and Nick-Eagles committed Jan 12, 2024
1 parent 249b7e4 commit 1b49012
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 18 deletions.
21 changes: 3 additions & 18 deletions R/getDegTx.R
Original file line number Diff line number Diff line change
Expand Up @@ -45,25 +45,10 @@ getDegTx <- function(rse_tx, type = c("cell_component", "standard", "top1500"),
stop("rse_tx must be a RangedSummarizedExperiment object.")
}

# Check if any gene in sig_transcripts is in rownames(rse_tx)
if (!any(sig_transcripts %in% rownames(rse_tx) | (gsub('\\..*', '', sig_transcripts) %in% rownames(rse_tx)))) {
stop("sig_transcripts and rownames of rse_tx object do not match")
}

# Check if all rownames start with "ENST"
if (!all(grepl("^ENST", rownames(rse_tx)))) {
stop("Some rownames do not start with 'ENST'.", call. = FALSE)
}

# Check patterns and perform operations based on the patterns
is_gencode = all(grepl("^ENST.*?\\.", rownames(rse_tx)))
is_ensembl = all(grepl("^ENST", rownames(rse_tx)) & !grepl("\\.", rownames(rse_tx)))
if (is_ensembl) {
sig_transcripts <- gsub("\\..*", "", sig_transcripts)
} else if (!is_gencode) {
stop("Rownames must all be ENSEMBL or GENCODE transcript IDs.")
}
# Check for validity and matching of tx names
sig_transcripts = check_tx_names(rownames(rse_tx), sig_transcripts, 'rownames(rse_tx)', 'sig_transcripts')

# Subset rse_tx to include sig_transcripts
rse_tx <- rse_tx[rownames(rse_tx) %in% sig_transcripts, , drop = FALSE]

# Check if the row means is greater than 1
Expand Down
44 changes: 44 additions & 0 deletions R/utils.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#' Check validity of transcript vectors
#'
#' Verifies that two vectors of transcript IDs each consistently follow
#' either the GENCODE convention (IDs starting with \code{ENST} and containing
#' a \code{.}) or the ENSEMBL convention (IDs starting with \code{ENST} and
#' containing no \code{.}), harmonizes \code{tx2} to the convention used by
#' \code{tx1} where possible, and confirms that the two vectors overlap.
#'
#' @param tx1 A \code{character()} vector of reference transcript IDs. It is
#'   never modified.
#' @param tx2 A \code{character()} vector of transcript IDs to compare against
#'   (and, if needed, convert to the convention of) \code{tx1}.
#' @param arg_name1 A \code{character(1)} label for \code{tx1}, used only in
#'   error messages.
#' @param arg_name2 A \code{character(1)} label for \code{tx2}, used only in
#'   error messages.
#'
#' @return \code{tx2}. When \code{tx1} is ENSEMBL and \code{tx2} is GENCODE,
#'   everything from the first \code{.} onward is removed from each element
#'   of \code{tx2}; otherwise \code{tx2} is returned unchanged. An error is
#'   raised if either vector mixes or violates the naming conventions, or if
#'   no element of the (possibly converted) \code{tx2} is found in \code{tx1}.
#'
#' @examples
#' check_tx_names(
#'     c("ENST00000000233", "ENST00000000412"),
#'     c("ENST00000000233.10", "ENST00000001008.6"),
#'     "tx1", "tx2"
#' )
#'
#' @export
check_tx_names <- function(tx1, tx2, arg_name1, arg_name2) {
  # Predicates testing whether every ID in a vector follows one convention
  is_gencode <- function(x) all(grepl("^ENST.*?\\.", x))
  is_ensembl <- function(x) all(grepl("^ENST", x) & !grepl("\\.", x))

  # Classify each vector once; the flags are reused by every check below
  tx1_gencode <- is_gencode(tx1)
  tx1_ensembl <- is_ensembl(tx1)
  tx2_gencode <- is_gencode(tx2)
  tx2_ensembl <- is_ensembl(tx2)

  # Each vector must be entirely GENCODE or entirely ENSEMBL
  format_msg <- "'%s' must use either all GENCODE or all ENSEMBL transcript IDs"
  if (!tx1_gencode && !tx1_ensembl) {
    stop(sprintf(format_msg, arg_name1), call. = FALSE)
  }
  if (!tx2_gencode && !tx2_ensembl) {
    stop(sprintf(format_msg, arg_name2), call. = FALSE)
  }

  # Only 'tx2' is ever converted: drop version suffixes so that GENCODE IDs
  # can be matched against ENSEMBL IDs in 'tx1'
  if (tx2_gencode && tx1_ensembl) {
    tx2 <- sub("\\..*", "", tx2)
  }

  # At least some transcripts must overlap between 'tx1' and 'tx2'
  if (!any(tx2 %in% tx1)) {
    msg <- sprintf("None of '%s' are in '%s'", arg_name2, arg_name1)

    # A GENCODE 'tx1' can never match an ENSEMBL 'tx2' because versions
    # cannot be invented; say so instead of leaving the user to guess. The
    # negated flags exclude empty vectors, which satisfy both predicates.
    if (tx1_gencode && !tx1_ensembl && tx2_ensembl && !tx2_gencode) {
      msg <- paste0(
        msg,
        sprintf(
          " ('%s' uses GENCODE IDs but '%s' uses ENSEMBL IDs)",
          arg_name1, arg_name2
        )
      )
    }
    stop(msg, call. = FALSE)
  }

  tx2
}

0 comments on commit 1b49012

Please sign in to comment.