From 0985c7f19f0d4571e21e4889d11e4be9577aa146 Mon Sep 17 00:00:00 2001 From: ahl27 Date: Tue, 28 Mar 2023 11:41:50 -0400 Subject: [PATCH 1/9] fix: AAStrings now enforce character set --- DESCRIPTION | 4 ++-- R/XString-class.R | 2 +- R/XStringCodec-class.R | 28 +++++++++++++++++++++++++++- R/seqtype.R | 18 +++++++++++------- R/zzz.R | 3 +++ src/Biostrings.h | 7 +++++++ src/R_init_Biostrings.c | 3 +++ src/XString_class.c | 33 ++++++++++++++++++++++++++++++++- 8 files changed, 86 insertions(+), 12 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 8ece12da..e0b7f0fc 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -19,12 +19,12 @@ Authors@R: c( person("Vince", "Carey", role="ctb"), person("Nicolas", "Delhomme", role="ctb"), person("Felix", "Ernst", role="ctb"), - person("Aidan", "Lakshman", role="ctb")) + person("Aidan", "Lakshman", role="ctb"), person("Kieran", "O'Neill", role="ctb"), person("Valerie", "Obenchain", role="ctb"), person("Marcel", "Ramos", role="ctb"), person("Albert", "Vill", role="ctb"), - person("Eric", "Wright", role="ctb"), + person("Erik", "Wright", role="ctb")) Depends: R (>= 4.0.0), methods, BiocGenerics (>= 0.37.0), S4Vectors (>= 0.27.12), IRanges (>= 2.31.2), XVector (>= 0.37.1), GenomeInfoDb diff --git a/R/XString-class.R b/R/XString-class.R index c3761748..5a8e0828 100644 --- a/R/XString-class.R +++ b/R/XString-class.R @@ -249,7 +249,7 @@ setMethod("XString", "AsIs", function(seqtype, x, start=NA, end=NA, width=NA) { if (!is.character(x)) - stop("unsuported input type") + stop("unsupported input type") class(x) <- "character" # keeps the names (unlike as.character()) XString(seqtype, x, start=start, end=end, width=width) } diff --git a/R/XStringCodec-class.R b/R/XStringCodec-class.R index 5e4dbee1..45b2b368 100644 --- a/R/XStringCodec-class.R +++ b/R/XStringCodec-class.R @@ -194,7 +194,33 @@ getRNAComplementLookup <- function() } + ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -### Add extra codecs below... +### The AA alphabet and codec. +### + +### AAStrings don't need to support DNA/RNA or complementation, +### so the code to generate the codec can be a lot simpler +AA_LOOKUP_CODES <- c(A=1L, R=2L, N=3L, D=4L, + C=5L, Q=6L, E=7L, G=8L, + H=9L, I=10L, L=11L, K=12L, + M=13L, F=14L, P=15L, S=16L, + T=17L, W=18L, Y=19L, V=20L, + U=21L, O=22L, + B=23L, J=24L, Z=25L, X=26L, + `*`=27L, `-`=28L, `+`=29L, `.`=30L) + +### AA codec. +.XStringCodec.AA <- function(codes) +{ + letters <- names(codes) + extra_letters <- setdiff(tolower(letters), letters) + extra_codes <- codes[toupper(extra_letters)] + new("XStringCodec", letters, codes, extra_letters, extra_codes) +} +AAcodes <- function(baseOnly) AA_LOOKUP_CODES[seq_len(ifelse(baseOnly, 20, 30))] +AA_STRING_CODEC <- .XStringCodec.AA(AA_LOOKUP_CODES) +### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +### Add extra codecs below... diff --git a/R/seqtype.R b/R/seqtype.R index 9958cc08..6af2c231 100644 --- a/R/seqtype.R +++ b/R/seqtype.R @@ -11,7 +11,7 @@ ### "B" | general purpose string(s) | bytes 0-255 | no ### "DNA" | DNA sequence(s) | DNA_ALPHABET | yes ### "RNA" | RNA sequence(s) | RNA_ALPHABET | yes -### "AA" | amino acid sequence(s) | AA_ALPHABET | no +### "AA" | amino acid sequence(s) | AA_ALPHABET | yes ### ### seqtype() returns that sequence type. For example 'seqtype(AAString())' ### returns "AA". @@ -25,7 +25,7 @@ ### string containers: XString (single sequence), XStringSet (multiple ### sequences), XStringViews (multiple sequences) and MaskedXString (single ### sequence). -### +### ### Exported. setGeneric("seqtype", function(x) standardGeneric("seqtype")) @@ -35,11 +35,10 @@ setGeneric("seqtype<-", signature="x", function(x, value) standardGeneric("seqtype<-") ) - ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ### Helper functions for which the returned value depends on 'seqtype(x)', ### not on what particular data are in 'x'. Not exported. -### +### xsbaseclass <- function(x) paste(seqtype(x), "String", sep="") @@ -56,6 +55,7 @@ setMethod("xscodes", "ANY", switch(seqtype, DNA=DNAcodes(baseOnly), RNA=RNAcodes(baseOnly), + AA=AAcodes(baseOnly), 0:255 ) } @@ -66,6 +66,7 @@ xscodec <- function(x) switch(seqtype(x), DNA=DNA_STRING_CODEC, RNA=RNA_STRING_CODEC, + AA=AA_STRING_CODEC, NULL ) } @@ -109,18 +110,22 @@ get_seqtype_conversion_lookup <- function(from_seqtype, to_seqtype) if (!compatible_seqtypes(from_seqtype, to_seqtype)) stop("incompatible sequence types \"", from_seqtype, "\" and \"", to_seqtype, "\"") - from_nucleo <- from_seqtype %in% c("DNA", "RNA") - to_nucleo <- to_seqtype %in% c("DNA", "RNA") + from_nucleo <- from_seqtype %in% c("DNA", "RNA", "AA") + to_nucleo <- to_seqtype %in% c("DNA", "RNA", "AA") if (from_nucleo == to_nucleo) return(NULL) if (to_seqtype == "DNA") return(DNA_STRING_CODEC@enc_lkup) if (to_seqtype == "RNA") return(RNA_STRING_CODEC@enc_lkup) + if (to_seqtype == "AA") + return(AA_STRING_CODEC@enc_lkup) if (from_seqtype == "DNA") return(DNA_STRING_CODEC@dec_lkup) if (from_seqtype == "RNA") return(RNA_STRING_CODEC@dec_lkup) + if (from_seqtype == "AA") + return(AA_STRING_CODEC@dec_lkup) stop("Biostrings internal error, please report") # should never happen } @@ -155,4 +160,3 @@ setMethod("alphabet", "ANY", ) } ) - diff --git a/R/zzz.R b/R/zzz.R index fb91302d..65e22402 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -8,6 +8,9 @@ .Call2("init_RNAlkups", RNA_STRING_CODEC@enc_lkup, RNA_STRING_CODEC@dec_lkup, PACKAGE=pkgname) + .Call2("init_AAlkups", + AA_STRING_CODEC@enc_lkup, AA_STRING_CODEC@dec_lkup, + PACKAGE=pkgname) DNA_AND_RNA_COLORED_LETTERS <<- make_DNA_AND_RNA_COLORED_LETTERS() AA_COLORED_LETTERS <<- make_AA_COLORED_LETTERS() option_name <- "Biostrings.coloring" diff --git a/src/Biostrings.h b/src/Biostrings.h index 313089a4..c5f211ac 100644 --- a/src/Biostrings.h +++ b/src/Biostrings.h @@ -78,6 +78,13 @@ char _RNAencode(char c); char _RNAdecode(char code); +SEXP init_AAlkups(SEXP enc_lkup, SEXP dec_lkup); + +char _AAencode(char c); + +char _AAdecode(char code); + + void _copy_CHARSXP_to_Chars_holder( Chars_holder *dest, SEXP src, diff --git a/src/R_init_Biostrings.c b/src/R_init_Biostrings.c index 9f11703d..c0433e4b 100644 --- a/src/R_init_Biostrings.c +++ b/src/R_init_Biostrings.c @@ -29,6 +29,7 @@ static const R_CallMethodDef callMethods[] = { /* XString_class.c */ CALLMETHOD_DEF(init_DNAlkups, 2), CALLMETHOD_DEF(init_RNAlkups, 2), + CALLMETHOD_DEF(init_AAlkups, 2), CALLMETHOD_DEF(new_XString_from_CHARACTER, 5), /* XStringSet_class.c */ @@ -173,6 +174,8 @@ void R_init_Biostrings(DllInfo *info) REGISTER_CCALLABLE(_DNAdecode); REGISTER_CCALLABLE(_RNAencode); REGISTER_CCALLABLE(_RNAdecode); + REGISTER_CCALLABLE(_AAencode); + REGISTER_CCALLABLE(_AAdecode); /* XStringSet_class.c */ REGISTER_CCALLABLE(_get_XStringSet_length); diff --git a/src/XString_class.c b/src/XString_class.c index bbe0124a..8bf7cf30 100644 --- a/src/XString_class.c +++ b/src/XString_class.c @@ -14,7 +14,8 @@ */ static ByteTrTable DNA_enc_byte2code, DNA_dec_byte2code, - RNA_enc_byte2code, RNA_dec_byte2code; + RNA_enc_byte2code, RNA_dec_byte2code, + AA_enc_byte2code, AA_dec_byte2code; const ByteTrTable *get_enc_byte2code(const char *classname) { @@ -95,6 +96,36 @@ char _RNAdecode(char code) } +/* --- .Call ENTRY POINT --- */ +SEXP init_AAlkups(SEXP enc_lkup, SEXP dec_lkup) +{ + _init_ByteTrTable_with_lkup(&AA_enc_byte2code, enc_lkup); + _init_ByteTrTable_with_lkup(&AA_dec_byte2code, dec_lkup); + return R_NilValue; +} + +char _AAencode(char c) +{ + int code; + + code = AA_enc_byte2code.byte2code[(unsigned char) c]; + if (code == NA_INTEGER) + error("_AAencode(): invalid AAString " + "input character: '%c' (byte value %d)", c, (int) c); + return code; +} + +char _AAdecode(char code) +{ + int c; + + c = AA_dec_byte2code.byte2code[(unsigned char) code]; + if (c == NA_INTEGER) + error("_AAdecode(): invalid AAString " + "internal code: %d", (int) code); + return (char) c; +} + /**************************************************************************** * From CHARACTER to XString and vice-versa. */ From 8eb59876f25e8d08354471a4e59a31d15ea5014a Mon Sep 17 00:00:00 2001 From: ahl27 Date: Tue, 28 Mar 2023 11:58:01 -0400 Subject: [PATCH 2/9] fix: formatting and documentation update --- man/XStringSet-io.Rd | 9 ++++----- src/Biostrings.h | 1 - 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/man/XStringSet-io.Rd b/man/XStringSet-io.Rd index 71475bb3..1f389842 100644 --- a/man/XStringSet-io.Rd +++ b/man/XStringSet-io.Rd @@ -137,9 +137,8 @@ saveXStringSet(x, objname, dirpath=".", save.dups=FALSE, verbose=TRUE) \item \code{"RNA"} for RNA sequences i.e. only letters in \code{\link{RNA_ALPHABET}} (case ignored) are valid one-letter sequence codes. - \item \code{"AA"} for Amino Acid sequences. Currently treated as - \code{"B"} but this will change in the near future i.e. only - letters in \code{\link{AA_ALPHABET}} (case ignored) will be + \item \code{"AA"} for Amino Acid sequences i.e. only + letters in \code{\link{AA_ALPHABET}} (case ignored) are valid one-letter sequence codes. } Invalid one-letter sequence codes are ignored with a warning. @@ -192,7 +191,7 @@ saveXStringSet(x, objname, dirpath=".", save.dups=FALSE, verbose=TRUE) \item{save.dups}{ \code{TRUE} or \code{FALSE}. If \code{TRUE} then the \code{\link[IRanges:Grouping-class]{Dups}} - object describing + object describing how duplicated elements in \code{x} are related to each other is saved too. For advanced users only. } @@ -210,7 +209,7 @@ saveXStringSet(x, objname, dirpath=".", save.dups=FALSE, verbose=TRUE) load sequences from an input file (or multiple input files) into an \link{XStringSet} object. When multiple input files are specified, all must have the same format (i.e. FASTA or FASTQ) and files with - different compression types can be mixed with non-compressed files. + different compression types can be mixed with non-compressed files. The files are read in the order they were specified and the sequences are stored in the returned object in the order they were read. diff --git a/src/Biostrings.h b/src/Biostrings.h index c5f211ac..0cca89e6 100644 --- a/src/Biostrings.h +++ b/src/Biostrings.h @@ -84,7 +84,6 @@ char _AAencode(char c); char _AAdecode(char code); - void _copy_CHARSXP_to_Chars_holder( Chars_holder *dest, SEXP src, From ac715df0313194eb1005f24c9d001498e0a3dbb5 Mon Sep 17 00:00:00 2001 From: ahl27 Date: Wed, 29 Mar 2023 17:22:46 -0400 Subject: [PATCH 3/9] remove encode/decode for simpler version --- R/XStringCodec-class.R | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/R/XStringCodec-class.R b/R/XStringCodec-class.R index 45b2b368..636633a6 100644 --- a/R/XStringCodec-class.R +++ b/R/XStringCodec-class.R @@ -201,25 +201,26 @@ getRNAComplementLookup <- function() ### AAStrings don't need to support DNA/RNA or complementation, ### so the code to generate the codec can be a lot simpler -AA_LOOKUP_CODES <- c(A=1L, R=2L, N=3L, D=4L, - C=5L, Q=6L, E=7L, G=8L, - H=9L, I=10L, L=11L, K=12L, - M=13L, F=14L, P=15L, S=16L, - T=17L, W=18L, Y=19L, V=20L, - U=21L, O=22L, - B=23L, J=24L, Z=25L, X=26L, - `*`=27L, `-`=28L, `+`=29L, `.`=30L) +AAcodes <- function(baseOnly) +{ + if (!isTRUEorFALSE(baseOnly)) + stop("'baseOnly' must be TRUE or FALSE") + letters <- AA_ALPHABET + if (baseOnly) + letters <- head(letters, n=20L) + setNames(.letterAsByteVal(letters), letters) +} ### AA codec. .XStringCodec.AA <- function(codes) { letters <- names(codes) extra_letters <- setdiff(tolower(letters), letters) - extra_codes <- codes[toupper(extra_letters)] + extra_codes <- .letterAsByteVal(toupper(extra_letters)) new("XStringCodec", letters, codes, extra_letters, extra_codes) } -AAcodes <- function(baseOnly) AA_LOOKUP_CODES[seq_len(ifelse(baseOnly, 20, 30))] +AA_LOOKUP_CODES <- AAcodes(FALSE) AA_STRING_CODEC <- .XStringCodec.AA(AA_LOOKUP_CODES) ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - From a77e0c60e9a556667b829d2337396f508b042ee8 Mon Sep 17 00:00:00 2001 From: ahl27 Date: Thu, 30 Mar 2023 08:41:48 -0400 Subject: [PATCH 4/9] adds C routines to Biostrings_interface and _Biostrings_stubs --- inst/include/Biostrings_interface.h | 4 ++++ inst/include/_Biostrings_stubs.c | 10 ++++++++++ 2 files changed, 14 insertions(+) diff --git a/inst/include/Biostrings_interface.h b/inst/include/Biostrings_interface.h index dc8000c8..e119bc55 100644 --- a/inst/include/Biostrings_interface.h +++ b/inst/include/Biostrings_interface.h @@ -128,6 +128,10 @@ char RNAencode(char c); char RNAdecode(char code); +char AAencode(char c); + +char AAdecode(char code); + /* * Low-level manipulation of XStringSet objects. * (see XStringSet_class.c) diff --git a/inst/include/_Biostrings_stubs.c b/inst/include/_Biostrings_stubs.c index 3cfb9a15..86e72fb3 100644 --- a/inst/include/_Biostrings_stubs.c +++ b/inst/include/_Biostrings_stubs.c @@ -51,6 +51,16 @@ DEFINE_CCALLABLE_STUB(char, RNAdecode, ( code) ) +DEFINE_CCALLABLE_STUB(char, AAencode, + (char c), + ( c) +) + +DEFINE_CCALLABLE_STUB(char, AAdecode, + (char code), + ( code) +) + /* * Stubs for callables defined in XStringSet_class.c */ From 35d7ca9ac94330382c0db0ad42c36df0edb93d36 Mon Sep 17 00:00:00 2001 From: ahl27 Date: Wed, 12 Apr 2023 09:21:09 -0400 Subject: [PATCH 5/9] fix: `baseOnly` argument for AAString and AAStringSet --- R/letterFrequency.R | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/R/letterFrequency.R b/R/letterFrequency.R index 36f2fe89..65e8cd78 100644 --- a/R/letterFrequency.R +++ b/R/letterFrequency.R @@ -131,14 +131,13 @@ ans } -.XString.amino_acid_frequency <- function(x, as.prob) +.XString.amino_acid_frequency <- function(x, as.prob, baseOnly) { if (!isTRUEorFALSE(as.prob)) stop("'as.prob' must be TRUE or FALSE") - codes <- as.integer(AAString(paste0(AA_ALPHABET, collapse=""))) - names(codes) <- AA_ALPHABET + codes <- xscodes(x, baseOnly=baseOnly) ans <- .Call2("XString_letter_frequency", - x, codes, TRUE, + x, codes, baseOnly, PACKAGE="Biostrings") if (as.prob) ans <- ans / nchar(x) # nchar(x) is sum(ans) but faster @@ -180,15 +179,14 @@ ans } -.XStringSet.amino_acid_frequency <- function(x, as.prob, collapse) +.XStringSet.amino_acid_frequency <- function(x, as.prob, collapse, baseOnly) { if (!isTRUEorFALSE(as.prob)) stop("'as.prob' must be TRUE or FALSE") collapse <- .normargCollapse(collapse) - codes <- as.integer(AAString(paste0(AA_ALPHABET, collapse=""))) - names(codes) <- AA_ALPHABET + codes <- xscodes(x, baseOnly=baseOnly) ans <- .Call2("XStringSet_letter_frequency", - x, collapse, codes, TRUE, + x, collapse, codes, baseOnly, PACKAGE="Biostrings") if (as.prob) { if (collapse) @@ -219,8 +217,8 @@ setMethod("alphabetFrequency", "RNAString", ) setMethod("alphabetFrequency", "AAString", - function(x, as.prob=FALSE) - .XString.amino_acid_frequency(x, as.prob) + function(x, as.prob=FALSE, baseOnly=FALSE) + .XString.amino_acid_frequency(x, as.prob, baseOnly) ) setMethod("alphabetFrequency", "XStringSet", @@ -239,8 +237,8 @@ setMethod("alphabetFrequency", "RNAStringSet", ) setMethod("alphabetFrequency", "AAStringSet", - function(x, as.prob=FALSE, collapse=FALSE) - .XStringSet.amino_acid_frequency(x, as.prob, collapse) + function(x, as.prob=FALSE, collapse=FALSE, baseOnly=FALSE) + .XStringSet.amino_acid_frequency(x, as.prob, collapse, baseOnly) ) ### library(drosophila2probe) From 6f8fb36e5e15d304908e7b0e9c6367f18f8490e3 Mon Sep 17 00:00:00 2001 From: ahl27 Date: Wed, 12 Apr 2023 09:29:32 -0400 Subject: [PATCH 6/9] fix: adds methods for hasOnlyBaseLetters for AAString and AAStringSet, updates man page --- R/letterFrequency.R | 10 ++++++++++ man/letterFrequency.Rd | 19 +++++++++++-------- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/R/letterFrequency.R b/R/letterFrequency.R index 65e8cd78..39d754e8 100644 --- a/R/letterFrequency.R +++ b/R/letterFrequency.R @@ -279,6 +279,11 @@ setMethod("hasOnlyBaseLetters", "RNAString", function(x) hasOnlyBaseLetters(DNAString(x)) ) +setMethod("hasOnlyBaseLetters", "AAString", + function(x) + alphabetFrequency(x, baseOnly=TRUE)[["other"]] == 0L +) + setMethod("hasOnlyBaseLetters", "DNAStringSet", function(x) alphabetFrequency(x, collapse=TRUE, baseOnly=TRUE)[["other"]] == 0L @@ -288,6 +293,11 @@ setMethod("hasOnlyBaseLetters", "RNAStringSet", function(x) hasOnlyBaseLetters(DNAStringSet(x)) ) +setMethod("hasOnlyBaseLetters", "AAStringSet", + function(x) + alphabetFrequency(x, collapse=TRUE, baseOnly=TRUE)[["other"]] == 0L +) + setMethod("hasOnlyBaseLetters", "XStringViews", function(x) { diff --git a/man/letterFrequency.Rd b/man/letterFrequency.Rd index ab2cdce9..6e94befa 100644 --- a/man/letterFrequency.Rd +++ b/man/letterFrequency.Rd @@ -92,7 +92,7 @@ consensusMatrix(x, as.prob=FALSE, shift=0L, width=NULL, ...) \S4method{consensusString}{matrix}(x, ambiguityMap="?", threshold=0.5) \S4method{consensusString}{DNAStringSet}(x, ambiguityMap=IUPAC_CODE_MAP, threshold=0.25, shift=0L, width=NULL) -\S4method{consensusString}{RNAStringSet}(x, +\S4method{consensusString}{RNAStringSet}(x, ambiguityMap= structure(as.character(RNAStringSet(DNAStringSet(IUPAC_CODE_MAP))), names= @@ -210,7 +210,7 @@ consensusMatrix(x, as.prob=FALSE, shift=0L, width=NULL, ...) those sequences have been shifted (see the \code{shift} argument above). This ensures that any wider consensus matrix would be a "padded with zeros" version of the matrix returned when \code{width=NULL}. - + The length of the returned sequence for the \code{consensusString} method for \link{XStringSet} objects. } @@ -255,10 +255,12 @@ consensusMatrix(x, as.prob=FALSE, shift=0L, width=NULL, ...) \link{XStringSet} or \link{XStringViews} object, then it returns an integer matrix with \code{length(x)} rows where the \code{i}-th row contains the frequencies for \code{x[[i]]}. - If \code{x} is a DNA or RNA input, then the returned vector is named + If \code{x} is a DNA, RNA, or AA input, then the returned vector is named with the letters in the alphabet. If the \code{baseOnly} argument is - \code{TRUE}, then the returned vector has only 5 elements: 4 elements - corresponding to the 4 nucleotides + the 'other' element. + \code{TRUE}, then the returned vector has only 5 elements for DNA/RNA input + (4 elements corresponding to the 4 nucleotides + the 'other' element) and + 21 elements for AA input (20 elements corresponding to the 20 base amino acids + + the 'other' element). \code{letterFrequency} returns, similarly, an integer vector or matrix, but restricted and/or collated according to \code{letters} and \code{OR}. @@ -270,7 +272,8 @@ consensusMatrix(x, as.prob=FALSE, shift=0L, width=NULL, ...) \code{hasOnlyBaseLetters} returns \code{TRUE} or \code{FALSE} indicating whether or not \code{x} contains only base letters (i.e. As, Cs, Gs and Ts - for DNA input and As, Cs, Gs and Us for RNA input). + for DNA input, As, Cs, Gs and Us for RNA input, or any of the 20 standard + amino acids for AA input). \code{uniqueLetters} returns a vector of 1-letter or empty strings. The empty string is used to represent the nul character if \code{x} happens to contain @@ -391,8 +394,8 @@ consensusString(sort(probes)[1:5], ambiguityMap = "N", threshold = 0.5) ## Consensus involving ambiguity letters in the input strings consensusString(DNAStringSet(c("NNNN","ACTG"))) consensusString(DNAStringSet(c("AANN","ACTG"))) -consensusString(DNAStringSet(c("ACAG","ACAR"))) -consensusString(DNAStringSet(c("ACAG","ACAR", "ACAG"))) +consensusString(DNAStringSet(c("ACAG","ACAR"))) +consensusString(DNAStringSet(c("ACAG","ACAR", "ACAG"))) ## --------------------------------------------------------------------- ## C. RELATIONSHIP BETWEEN consensusMatrix() AND coverage() From ef1269fa659c9d0252d3dbfd39f72d0d09c25312 Mon Sep 17 00:00:00 2001 From: ahl27 Date: Wed, 12 Apr 2023 10:18:00 -0400 Subject: [PATCH 7/9] adds updateObject methods for AAString and extensions --- R/XString-class.R | 16 ++++++++++++++++ R/XStringSet-class.R | 10 ++++++++++ R/XStringSetList-class.R | 10 ++++++++++ 3 files changed, 36 insertions(+) diff --git a/R/XString-class.R b/R/XString-class.R index 5a8e0828..8567ebca 100644 --- a/R/XString-class.R +++ b/R/XString-class.R @@ -481,3 +481,19 @@ setMethod("updateObject", "XString", } ) +setMethod("updateObject", "AAString", + function(object, ..., verbose=FALSE) + { + codec <- xscodec(AAString()) + class(object) <- "BString" + mapping <- vapply(uniqueLetters(object), utf8ToInt, integer(1L)) + missingVals <- is.na(codec@enc_lkup[mapping+1L]) + if(any(missingVals)){ + errorChars <- paste(names(mapping)[which(missingVals)], + collapse=', ') + stop("Cannot decode, AAString contains invalid character(s): ", + errorChars) + } + AAString(object) + } +) diff --git a/R/XStringSet-class.R b/R/XStringSet-class.R index 6166aed9..e331e62d 100644 --- a/R/XStringSet-class.R +++ b/R/XStringSet-class.R @@ -624,3 +624,13 @@ setMethod("updateObject", "XStringSet", } ) +setMethod("updateObject", "AAStringSet", + function(object, ..., verbose=FALSE) + { + for(i in seq_along(object)){ + object[[i]] <- updateObject(object[[i]]) + } + object + } +) + diff --git a/R/XStringSetList-class.R b/R/XStringSetList-class.R index f1837780..752e2234 100644 --- a/R/XStringSetList-class.R +++ b/R/XStringSetList-class.R @@ -271,3 +271,13 @@ setMethod("showAsCell", "XStringSetList", setMethod("nchar", "XStringSetList", IRanges:::nchar_CompressedList) +setMethod("updateObject", "AAStringSetList", + function(object, ..., verbose=FALSE) + { + lst <- as.list(object) + for(i in seq_along(lst)){ + lst[[i]] <- updateObject(lst[[i]]) + } + AAStringSetList(lst) + } +) From 2505f1c974846f49affcccf77401e4eecd23e372 Mon Sep 17 00:00:00 2001 From: ahl27 Date: Wed, 12 Apr 2023 10:28:09 -0400 Subject: [PATCH 8/9] fix: comments --- R/XString-class.R | 1 + R/XStringSet-class.R | 2 ++ R/XStringSetList-class.R | 7 +++++++ 3 files changed, 10 insertions(+) diff --git a/R/XString-class.R b/R/XString-class.R index 8567ebca..82e584b4 100644 --- a/R/XString-class.R +++ b/R/XString-class.R @@ -481,6 +481,7 @@ setMethod("updateObject", "XString", } ) +### Update AAString objects created before AA_ALPHABET was enforced setMethod("updateObject", "AAString", function(object, ..., verbose=FALSE) { diff --git a/R/XStringSet-class.R b/R/XStringSet-class.R index e331e62d..c6e6fedc 100644 --- a/R/XStringSet-class.R +++ b/R/XStringSet-class.R @@ -624,6 +624,8 @@ setMethod("updateObject", "XStringSet", } ) +### Update AAStringSet objects created before AA_ALPHABET was enforced +### for AAString objects setMethod("updateObject", "AAStringSet", function(object, ..., verbose=FALSE) { diff --git a/R/XStringSetList-class.R b/R/XStringSetList-class.R index 752e2234..3f617a96 100644 --- a/R/XStringSetList-class.R +++ b/R/XStringSetList-class.R @@ -271,6 +271,13 @@ setMethod("showAsCell", "XStringSetList", setMethod("nchar", "XStringSetList", IRanges:::nchar_CompressedList) + +### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +### updateObject() +### + +### Update AAStringSetList objects created before AA_ALPHABET was enforced +### for AAString objects setMethod("updateObject", "AAStringSetList", function(object, ..., verbose=FALSE) { From 0ab5eae4dc9f5f5403c3eacac875f42f39b3f223 Mon Sep 17 00:00:00 2001 From: ahl27 Date: Wed, 28 Jun 2023 12:22:01 -0400 Subject: [PATCH 9/9] Updates updateObject methods according to feedback from Herve --- R/XString-class.R | 8 ++++++-- R/XStringSet-class.R | 24 ++++++++++++++++++++---- R/XStringSetList-class.R | 18 ------------------ 3 files changed, 26 insertions(+), 24 deletions(-) diff --git a/R/XString-class.R b/R/XString-class.R index 82e584b4..cec49645 100644 --- a/R/XString-class.R +++ b/R/XString-class.R @@ -474,10 +474,11 @@ setMethod("updateObject", "XString", ans_shared <- new("SharedRaw") ans_shared@xp <- xdata@xp ans_shared@.link_to_cached_object=xdata@.link_to_cached_object - new(class(object), + new2(class(object), shared=ans_shared, offset=object@offset, - length=object@length) + length=object@length, + check=FALSE) } ) @@ -485,6 +486,9 @@ setMethod("updateObject", "XString", setMethod("updateObject", "AAString", function(object, ..., verbose=FALSE) { + # Start by calling general XString update function + object <- callNextMethod() + codec <- xscodec(AAString()) class(object) <- "BString" mapping <- vapply(uniqueLetters(object), utf8ToInt, integer(1L)) diff --git a/R/XStringSet-class.R b/R/XStringSet-class.R index c6e6fedc..9e328074 100644 --- a/R/XStringSet-class.R +++ b/R/XStringSet-class.R @@ -624,15 +624,31 @@ setMethod("updateObject", "XStringSet", } ) +### Helper method to update general XStringSet objects efficiently +.updateObject_XStringSet <- function(object, ..., verbose=FALSE){ + baseclass <- xsbaseclass(object) + ### Update SharedRaw elements directly + ### Significantly fewer SharedRaw objects than XStrings, + ### So as long as we don't modify the order of the letters + ### this will be significantly faster + for(i in seq_along(object@pool)){ + shared <- object@pool[[i]] # SharedRaw object + xs <- new2(baseclass, shared=shared, length=length(shared), check=FALSE) + xs <- updateObject(xs, verbose=verbose) + object@pool[[i]] <- xs@shared + } + object +} + ### Update AAStringSet objects created before AA_ALPHABET was enforced ### for AAString objects setMethod("updateObject", "AAStringSet", function(object, ..., verbose=FALSE) { - for(i in seq_along(object)){ - object[[i]] <- updateObject(object[[i]]) - } - object + # Start by calling general XStringSet update function + object <- callNextMethod() + object <- compact(object) + .updateObject_XStringSet(object, ..., verbose=verbose) } ) diff --git a/R/XStringSetList-class.R b/R/XStringSetList-class.R index 3f617a96..5aa65cc2 100644 --- a/R/XStringSetList-class.R +++ b/R/XStringSetList-class.R @@ -270,21 +270,3 @@ setMethod("showAsCell", "XStringSetList", ### setMethod("nchar", "XStringSetList", IRanges:::nchar_CompressedList) - - -### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -### updateObject() -### - -### Update AAStringSetList objects created before AA_ALPHABET was enforced -### for AAString objects -setMethod("updateObject", "AAStringSetList", - function(object, ..., verbose=FALSE) - { - lst <- as.list(object) - for(i in seq_along(lst)){ - lst[[i]] <- updateObject(lst[[i]]) - } - AAStringSetList(lst) - } -)