Bioconductor · hpages · Jul 1, 2023 · Mar 28, 2023 · Mar 28, 2023 · Mar 29, 2023
diff --git a/R/XString-class.R b/R/XString-class.R
@@ -249,7 +249,7 @@ setMethod("XString", "AsIs",
     function(seqtype, x, start=NA, end=NA, width=NA)
     {
         if (!is.character(x))
-            stop("unsuported input type")
+            stop("unsupported input type")
         class(x) <- "character" # keeps the names (unlike as.character())
         XString(seqtype, x, start=start, end=end, width=width)
     }
@@ -474,10 +474,31 @@ setMethod("updateObject", "XString",
         ans_shared <- new("SharedRaw")
         ans_shared@xp <- xdata@xp
         [email protected][email protected]_to_cached_object
-        new(class(object),
+        new2(class(object),
             shared=ans_shared,
             offset=object@offset,
-            length=object@length)
+            length=object@length,
+            check=FALSE)
     }
 )
 
+### Update AAString objects created before AA_ALPHABET was enforced
+setMethod("updateObject", "AAString",
+    function(object, ..., verbose=FALSE)
+    {
+        # Start by calling general XString update function
+        object <- callNextMethod()
+
+        codec <- xscodec(AAString())
+        class(object) <- "BString"
+        mapping <- vapply(uniqueLetters(object), utf8ToInt, integer(1L))
+        missingVals <- is.na(codec@enc_lkup[mapping+1L])
+        if(any(missingVals)){
+            errorChars <- paste(names(mapping)[which(missingVals)],
+                                collapse=', ')
+            stop("Cannot decode, AAString contains invalid character(s): ",
+                  errorChars)
+        }
+        AAString(object)
+    }
+)
diff --git a/R/XStringCodec-class.R b/R/XStringCodec-class.R
@@ -194,7 +194,34 @@ getRNAComplementLookup <- function()
 }
 
 
+
 ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-### Add extra codecs below...
+### The AA alphabet and codec.
+###
+
+### AAStrings don't need to support DNA/RNA or complementation,
+### so the code to generate the codec can be a lot simpler
+AAcodes <- function(baseOnly)
+{
+    if (!isTRUEorFALSE(baseOnly))
+        stop("'baseOnly' must be TRUE or FALSE")
+    letters <- AA_ALPHABET
+    if (baseOnly)
+        letters <- head(letters, n=20L)
+    setNames(.letterAsByteVal(letters), letters)
+}
 
+### AA codec.
+.XStringCodec.AA <- function(codes)
+{
+    letters <- names(codes)
+    extra_letters <- setdiff(tolower(letters), letters)
+    extra_codes <- .letterAsByteVal(toupper(extra_letters))
+    new("XStringCodec", letters, codes, extra_letters, extra_codes)
+}
+
+AA_LOOKUP_CODES <- AAcodes(FALSE)
+AA_STRING_CODEC <- .XStringCodec.AA(AA_LOOKUP_CODES)
 
+### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+### Add extra codecs below...
diff --git a/R/XStringSet-class.R b/R/XStringSet-class.R
@@ -624,3 +624,31 @@ setMethod("updateObject", "XStringSet",
     }
 )
 
+### Helper method to update general XStringSet objects efficiently
+.updateObject_XStringSet <- function(object, ..., verbose=FALSE){
+    baseclass <- xsbaseclass(object)
+    ### Update SharedRaw elements directly
+    ### Significantly fewer SharedRaw objects than XStrings,
+    ### So as long as we don't modify the order of the letters
+    ### this will be significantly faster
+    for(i in seq_along(object@pool)){
+        shared <- object@pool[[i]] # SharedRaw object
+        xs <- new2(baseclass, shared=shared, length=length(shared), check=FALSE)
+        xs <- updateObject(xs, verbose=verbose)
+        object@pool[[i]] <- xs@shared
+    }
+    object
+}
+
+### Update AAStringSet objects created before AA_ALPHABET was enforced
+### for AAString objects
+setMethod("updateObject", "AAStringSet",
+    function(object, ..., verbose=FALSE)
+    {
+        # Start by calling general XStringSet update function
+        object <- callNextMethod()
+        object <- compact(object)
+        .updateObject_XStringSet(object, ..., verbose=verbose)
+    }
+)
+
diff --git a/R/XStringSetList-class.R b/R/XStringSetList-class.R
@@ -270,4 +270,3 @@ setMethod("showAsCell", "XStringSetList",
 ###
 
 setMethod("nchar", "XStringSetList", IRanges:::nchar_CompressedList)
-
diff --git a/R/letterFrequency.R b/R/letterFrequency.R
@@ -131,14 +131,13 @@
     ans
 }
 
-.XString.amino_acid_frequency <- function(x, as.prob)
+.XString.amino_acid_frequency <- function(x, as.prob, baseOnly)
 {
     if (!isTRUEorFALSE(as.prob))
         stop("'as.prob' must be TRUE or FALSE")
-    codes <- as.integer(AAString(paste0(AA_ALPHABET, collapse="")))
-    names(codes) <- AA_ALPHABET    
+    codes <- xscodes(x, baseOnly=baseOnly)
     ans <- .Call2("XString_letter_frequency",
-                 x, codes, TRUE,
+                 x, codes, baseOnly,
                  PACKAGE="Biostrings")
     if (as.prob)
         ans <- ans / nchar(x) # nchar(x) is sum(ans) but faster
@@ -180,15 +179,14 @@
     ans
 }
 
-.XStringSet.amino_acid_frequency <- function(x, as.prob, collapse)
+.XStringSet.amino_acid_frequency <- function(x, as.prob, collapse, baseOnly)
 {
     if (!isTRUEorFALSE(as.prob))
         stop("'as.prob' must be TRUE or FALSE")
     collapse <- .normargCollapse(collapse)
-    codes <- as.integer(AAString(paste0(AA_ALPHABET, collapse="")))
-    names(codes) <- AA_ALPHABET
+    codes <- xscodes(x, baseOnly=baseOnly)
     ans <- .Call2("XStringSet_letter_frequency",
-                 x, collapse, codes, TRUE,
+                 x, collapse, codes, baseOnly,
                  PACKAGE="Biostrings")
     if (as.prob) {
         if (collapse)
@@ -219,8 +217,8 @@ setMethod("alphabetFrequency", "RNAString",
 )
 
 setMethod("alphabetFrequency", "AAString",
-    function(x, as.prob=FALSE)
-        .XString.amino_acid_frequency(x, as.prob)
+    function(x, as.prob=FALSE, baseOnly=FALSE)
+        .XString.amino_acid_frequency(x, as.prob, baseOnly)
 )
 
 setMethod("alphabetFrequency", "XStringSet",
@@ -239,8 +237,8 @@ setMethod("alphabetFrequency", "RNAStringSet",
 )
 
 setMethod("alphabetFrequency", "AAStringSet",
-    function(x, as.prob=FALSE, collapse=FALSE)
-        .XStringSet.amino_acid_frequency(x, as.prob, collapse)
+    function(x, as.prob=FALSE, collapse=FALSE, baseOnly=FALSE)
+        .XStringSet.amino_acid_frequency(x, as.prob, collapse, baseOnly)
 )
 
 ### library(drosophila2probe)
@@ -281,6 +279,11 @@ setMethod("hasOnlyBaseLetters", "RNAString",
     function(x) hasOnlyBaseLetters(DNAString(x))
 )
 
+setMethod("hasOnlyBaseLetters", "AAString",
+    function(x)
+        alphabetFrequency(x, baseOnly=TRUE)[["other"]] == 0L
+)
+
 setMethod("hasOnlyBaseLetters", "DNAStringSet",
     function(x)
         alphabetFrequency(x, collapse=TRUE, baseOnly=TRUE)[["other"]] == 0L
@@ -290,6 +293,11 @@ setMethod("hasOnlyBaseLetters", "RNAStringSet",
     function(x) hasOnlyBaseLetters(DNAStringSet(x))
 )
 
+setMethod("hasOnlyBaseLetters", "AAStringSet",
+    function(x)
+        alphabetFrequency(x, collapse=TRUE, baseOnly=TRUE)[["other"]] == 0L
+)
+
 setMethod("hasOnlyBaseLetters", "XStringViews",
     function(x)
     {

diff --git a/R/seqtype.R b/R/seqtype.R
@@ -11,7 +11,7 @@
 ###   "B"       | general purpose string(s) | bytes 0-255  | no
 ###   "DNA"     | DNA sequence(s)           | DNA_ALPHABET | yes
 ###   "RNA"     | RNA sequence(s)           | RNA_ALPHABET | yes
-###   "AA"      | amino acid sequence(s)    | AA_ALPHABET  | no
+###   "AA"      | amino acid sequence(s)    | AA_ALPHABET  | yes
 ###
 ### seqtype() returns that sequence type. For example 'seqtype(AAString())'
 ### returns "AA".
@@ -25,7 +25,7 @@
 ### string containers: XString (single sequence), XStringSet (multiple
 ### sequences), XStringViews (multiple sequences) and MaskedXString (single
 ### sequence).
-###   
+###
 
 ### Exported.
 setGeneric("seqtype", function(x) standardGeneric("seqtype"))
@@ -35,11 +35,10 @@ setGeneric("seqtype<-", signature="x",
     function(x, value) standardGeneric("seqtype<-")
 )
 
-
 ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 ### Helper functions for which the returned value depends on 'seqtype(x)',
 ### not on what particular data are in 'x'. Not exported.
-### 
+###
 
 xsbaseclass <- function(x) paste(seqtype(x), "String", sep="")
 
@@ -56,6 +55,7 @@ setMethod("xscodes", "ANY",
         switch(seqtype,
                DNA=DNAcodes(baseOnly),
                RNA=RNAcodes(baseOnly),
+               AA=AAcodes(baseOnly),
                0:255
         )
     }
@@ -66,6 +66,7 @@ xscodec <- function(x)
     switch(seqtype(x),
         DNA=DNA_STRING_CODEC,
         RNA=RNA_STRING_CODEC,
+        AA=AA_STRING_CODEC,
         NULL
     )
 }
@@ -109,18 +110,22 @@ get_seqtype_conversion_lookup <- function(from_seqtype, to_seqtype)
     if (!compatible_seqtypes(from_seqtype, to_seqtype))
         stop("incompatible sequence types \"",
              from_seqtype, "\" and \"", to_seqtype, "\"")
-    from_nucleo <- from_seqtype %in% c("DNA", "RNA")
-    to_nucleo <- to_seqtype %in% c("DNA", "RNA")
+    from_nucleo <- from_seqtype %in% c("DNA", "RNA", "AA")
+    to_nucleo <- to_seqtype %in% c("DNA", "RNA", "AA")
     if (from_nucleo == to_nucleo)
         return(NULL)
     if (to_seqtype == "DNA")
         return(DNA_STRING_CODEC@enc_lkup)
     if (to_seqtype == "RNA")
         return(RNA_STRING_CODEC@enc_lkup)
+    if (to_seqtype == "AA")
+        return(AA_STRING_CODEC@enc_lkup)
     if (from_seqtype == "DNA")
         return(DNA_STRING_CODEC@dec_lkup)
     if (from_seqtype == "RNA")
         return(RNA_STRING_CODEC@dec_lkup)
+    if (from_seqtype == "AA")
+        return(AA_STRING_CODEC@dec_lkup)
     stop("Biostrings internal error, please report") # should never happen
 }
 
@@ -155,4 +160,3 @@ setMethod("alphabet", "ANY",
         )
     }
 )
-
diff --git a/R/zzz.R b/R/zzz.R
@@ -8,6 +8,9 @@
     .Call2("init_RNAlkups",
           RNA_STRING_CODEC@enc_lkup, RNA_STRING_CODEC@dec_lkup,
           PACKAGE=pkgname)
+    .Call2("init_AAlkups",
+          AA_STRING_CODEC@enc_lkup, AA_STRING_CODEC@dec_lkup,
+          PACKAGE=pkgname)
     DNA_AND_RNA_COLORED_LETTERS <<- make_DNA_AND_RNA_COLORED_LETTERS()
     AA_COLORED_LETTERS <<- make_AA_COLORED_LETTERS()
     option_name <- "Biostrings.coloring"

diff --git a/inst/include/Biostrings_interface.h b/inst/include/Biostrings_interface.h
@@ -128,6 +128,10 @@ char RNAencode(char c);
 
 char RNAdecode(char code);
 
+char AAencode(char c);
+
+char AAdecode(char code);
+
 /*
  * Low-level manipulation of XStringSet objects.
  * (see XStringSet_class.c)

diff --git a/inst/include/_Biostrings_stubs.c b/inst/include/_Biostrings_stubs.c
@@ -51,6 +51,16 @@ DEFINE_CCALLABLE_STUB(char, RNAdecode,
 	(     code)
 )
 
+DEFINE_CCALLABLE_STUB(char, AAencode,
+	(char c),
+	(     c)
+)
+
+DEFINE_CCALLABLE_STUB(char, AAdecode,
+	(char code),
+	(     code)
+)
+
 /*
  * Stubs for callables defined in XStringSet_class.c
  */

diff --git a/man/XStringSet-io.Rd b/man/XStringSet-io.Rd
@@ -137,9 +137,8 @@ saveXStringSet(x, objname, dirpath=".", save.dups=FALSE, verbose=TRUE)
       \item \code{"RNA"} for RNA sequences i.e. only letters in
             \code{\link{RNA_ALPHABET}} (case ignored) are valid
             one-letter sequence codes.
-      \item \code{"AA"} for Amino Acid sequences. Currently treated as
-            \code{"B"} but this will change in the near future i.e. only
-            letters in \code{\link{AA_ALPHABET}} (case ignored) will be
+      \item \code{"AA"} for Amino Acid sequences i.e. only
+            letters in \code{\link{AA_ALPHABET}} (case ignored) are
             valid one-letter sequence codes.
     }
     Invalid one-letter sequence codes are ignored with a warning.
@@ -192,7 +191,7 @@ saveXStringSet(x, objname, dirpath=".", save.dups=FALSE, verbose=TRUE)
   \item{save.dups}{
     \code{TRUE} or \code{FALSE}.
     If \code{TRUE} then the \code{\link[IRanges:Grouping-class]{Dups}}
-    object describing 
+    object describing
     how duplicated elements in \code{x} are related to each other is
     saved too. For advanced users only.
   }
@@ -210,7 +209,7 @@ saveXStringSet(x, objname, dirpath=".", save.dups=FALSE, verbose=TRUE)
   load sequences from an input file (or multiple input files) into an
   \link{XStringSet} object. When multiple input files are specified,
   all must have the same format (i.e. FASTA or FASTQ) and files with
-  different compression types can be mixed with non-compressed files. 
+  different compression types can be mixed with non-compressed files.
   The files are read in the order they were specified and the sequences
   are stored in the returned object in the order they were read.
 

diff --git a/man/letterFrequency.Rd b/man/letterFrequency.Rd
@@ -92,7 +92,7 @@ consensusMatrix(x, as.prob=FALSE, shift=0L, width=NULL, ...)
 \S4method{consensusString}{matrix}(x, ambiguityMap="?", threshold=0.5)
 \S4method{consensusString}{DNAStringSet}(x, ambiguityMap=IUPAC_CODE_MAP,
              threshold=0.25, shift=0L, width=NULL)
-\S4method{consensusString}{RNAStringSet}(x, 
+\S4method{consensusString}{RNAStringSet}(x,
              ambiguityMap=
              structure(as.character(RNAStringSet(DNAStringSet(IUPAC_CODE_MAP))),
                        names=
@@ -210,7 +210,7 @@ consensusMatrix(x, as.prob=FALSE, shift=0L, width=NULL, ...)
     those sequences have been shifted (see the \code{shift} argument above).
     This ensures that any wider consensus matrix would be a "padded with zeros"
     version of the matrix returned when \code{width=NULL}.
-    
+
     The length of the returned sequence for the \code{consensusString}
     method for \link{XStringSet} objects.
   }
@@ -255,10 +255,12 @@ consensusMatrix(x, as.prob=FALSE, shift=0L, width=NULL, ...)
   \link{XStringSet} or \link{XStringViews} object, then it returns
   an integer matrix with \code{length(x)} rows where the
   \code{i}-th row contains the frequencies for \code{x[[i]]}.
-  If \code{x} is a DNA or RNA input, then the returned vector is named
+  If \code{x} is a DNA, RNA, or AA input, then the returned vector is named
   with the letters in the alphabet. If the \code{baseOnly} argument is
-  \code{TRUE}, then the returned vector has only 5 elements: 4 elements
-  corresponding to the 4 nucleotides + the 'other' element.
+  \code{TRUE}, then the returned vector has only 5 elements for DNA/RNA input
+  (4 elements corresponding to the 4 nucleotides + the 'other' element) and
+  21 elements for AA input (20 elements corresponding to the 20 base amino acids
+  + the 'other' element).
 
   \code{letterFrequency} returns, similarly, an integer vector or matrix,
   but restricted and/or collated according to \code{letters} and \code{OR}.
@@ -270,7 +272,8 @@ consensusMatrix(x, as.prob=FALSE, shift=0L, width=NULL, ...)
 
   \code{hasOnlyBaseLetters} returns \code{TRUE} or \code{FALSE} indicating
   whether or not \code{x} contains only base letters (i.e. As, Cs, Gs and Ts
-  for DNA input and As, Cs, Gs and Us for RNA input).
+  for DNA input, As, Cs, Gs and Us for RNA input, or any of the 20 standard
+  amino acids for AA input).
 
   \code{uniqueLetters} returns a vector of 1-letter or empty strings. The empty
   string is used to represent the nul character if \code{x} happens to contain
@@ -391,8 +394,8 @@ consensusString(sort(probes)[1:5], ambiguityMap = "N", threshold = 0.5)
 ## Consensus involving ambiguity letters in the input strings
 consensusString(DNAStringSet(c("NNNN","ACTG")))
 consensusString(DNAStringSet(c("AANN","ACTG")))
-consensusString(DNAStringSet(c("ACAG","ACAR"))) 
-consensusString(DNAStringSet(c("ACAG","ACAR", "ACAG"))) 
+consensusString(DNAStringSet(c("ACAG","ACAR")))
+consensusString(DNAStringSet(c("ACAG","ACAR", "ACAG")))
 
 ## ---------------------------------------------------------------------
 ## C. RELATIONSHIP BETWEEN consensusMatrix() AND coverage()
Original file line number	Diff line number	Diff line change
Expand Up		@@ -270,4 +270,3 @@ setMethod("showAsCell", "XStringSetList",
		###

		setMethod("nchar", "XStringSetList", IRanges:::nchar_CompressedList)