From 0985c7f19f0d4571e21e4889d11e4be9577aa146 Mon Sep 17 00:00:00 2001
From: ahl27 <ahl27@pitt.edu>
Date: Tue, 28 Mar 2023 11:41:50 -0400
Subject: [PATCH 1/9] fix: AAStrings now enforce character set

---
 DESCRIPTION             |  4 ++--
 R/XString-class.R       |  2 +-
 R/XStringCodec-class.R  | 28 +++++++++++++++++++++++++++-
 R/seqtype.R             | 18 +++++++++++-------
 R/zzz.R                 |  3 +++
 src/Biostrings.h        |  7 +++++++
 src/R_init_Biostrings.c |  3 +++
 src/XString_class.c     | 33 ++++++++++++++++++++++++++++++++-
 8 files changed, 86 insertions(+), 12 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 8ece12da..e0b7f0fc 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -19,12 +19,12 @@ Authors@R: c(
 	person("Vince", "Carey", role="ctb"),
 	person("Nicolas", "Delhomme", role="ctb"),
 	person("Felix", "Ernst", role="ctb"),
-	person("Aidan", "Lakshman", role="ctb"))
+	person("Aidan", "Lakshman", role="ctb"),
 	person("Kieran", "O'Neill", role="ctb"),
 	person("Valerie", "Obenchain", role="ctb"),
 	person("Marcel", "Ramos", role="ctb"),
 	person("Albert", "Vill", role="ctb"),
-	person("Eric", "Wright", role="ctb"),
+	person("Erik", "Wright", role="ctb"))
 Depends: R (>= 4.0.0), methods, BiocGenerics (>= 0.37.0),
 	S4Vectors (>= 0.27.12), IRanges (>= 2.31.2), XVector (>= 0.37.1),
 	GenomeInfoDb
diff --git a/R/XString-class.R b/R/XString-class.R
index c3761748..5a8e0828 100644
--- a/R/XString-class.R
+++ b/R/XString-class.R
@@ -249,7 +249,7 @@ setMethod("XString", "AsIs",
     function(seqtype, x, start=NA, end=NA, width=NA)
     {
         if (!is.character(x))
-            stop("unsuported input type")
+            stop("unsupported input type")
         class(x) <- "character" # keeps the names (unlike as.character())
         XString(seqtype, x, start=start, end=end, width=width)
     }
diff --git a/R/XStringCodec-class.R b/R/XStringCodec-class.R
index 5e4dbee1..45b2b368 100644
--- a/R/XStringCodec-class.R
+++ b/R/XStringCodec-class.R
@@ -194,7 +194,33 @@ getRNAComplementLookup <- function()
 }
 
 
+
 ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-### Add extra codecs below...
+### The AA alphabet and codec.
+###
+
+### AAStrings don't need to support DNA/RNA or complementation,
+### so the code to generate the codec can be a lot simpler
+AA_LOOKUP_CODES <- c(A=1L, R=2L, N=3L, D=4L,
+                        C=5L, Q=6L, E=7L, G=8L,
+                        H=9L, I=10L, L=11L, K=12L,
+                        M=13L, F=14L, P=15L, S=16L,
+                        T=17L, W=18L, Y=19L, V=20L,
+                        U=21L, O=22L,
+                        B=23L, J=24L, Z=25L, X=26L,
+                        `*`=27L, `-`=28L, `+`=29L, `.`=30L)
+
+### AA codec.
+.XStringCodec.AA <- function(codes)
+{
+    letters <- names(codes)
+    extra_letters <- setdiff(tolower(letters), letters)
+    extra_codes <- codes[toupper(extra_letters)]
+    new("XStringCodec", letters, codes, extra_letters, extra_codes)
+}
 
+AAcodes <- function(baseOnly) AA_LOOKUP_CODES[seq_len(ifelse(baseOnly, 20, 30))]
+AA_STRING_CODEC <- .XStringCodec.AA(AA_LOOKUP_CODES)
 
+### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+### Add extra codecs below...
diff --git a/R/seqtype.R b/R/seqtype.R
index 9958cc08..6af2c231 100644
--- a/R/seqtype.R
+++ b/R/seqtype.R
@@ -11,7 +11,7 @@
 ###   "B"       | general purpose string(s) | bytes 0-255  | no
 ###   "DNA"     | DNA sequence(s)           | DNA_ALPHABET | yes
 ###   "RNA"     | RNA sequence(s)           | RNA_ALPHABET | yes
-###   "AA"      | amino acid sequence(s)    | AA_ALPHABET  | no
+###   "AA"      | amino acid sequence(s)    | AA_ALPHABET  | yes
 ###
 ### seqtype() returns that sequence type. For example 'seqtype(AAString())'
 ### returns "AA".
@@ -25,7 +25,7 @@
 ### string containers: XString (single sequence), XStringSet (multiple
 ### sequences), XStringViews (multiple sequences) and MaskedXString (single
 ### sequence).
-###   
+###
 
 ### Exported.
 setGeneric("seqtype", function(x) standardGeneric("seqtype"))
@@ -35,11 +35,10 @@ setGeneric("seqtype<-", signature="x",
     function(x, value) standardGeneric("seqtype<-")
 )
 
-
 ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 ### Helper functions for which the returned value depends on 'seqtype(x)',
 ### not on what particular data are in 'x'. Not exported.
-### 
+###
 
 xsbaseclass <- function(x) paste(seqtype(x), "String", sep="")
 
@@ -56,6 +55,7 @@ setMethod("xscodes", "ANY",
         switch(seqtype,
                DNA=DNAcodes(baseOnly),
                RNA=RNAcodes(baseOnly),
+               AA=AAcodes(baseOnly),
                0:255
         )
     }
@@ -66,6 +66,7 @@ xscodec <- function(x)
     switch(seqtype(x),
         DNA=DNA_STRING_CODEC,
         RNA=RNA_STRING_CODEC,
+        AA=AA_STRING_CODEC,
         NULL
     )
 }
@@ -109,18 +110,22 @@ get_seqtype_conversion_lookup <- function(from_seqtype, to_seqtype)
     if (!compatible_seqtypes(from_seqtype, to_seqtype))
         stop("incompatible sequence types \"",
              from_seqtype, "\" and \"", to_seqtype, "\"")
-    from_nucleo <- from_seqtype %in% c("DNA", "RNA")
-    to_nucleo <- to_seqtype %in% c("DNA", "RNA")
+    from_nucleo <- from_seqtype %in% c("DNA", "RNA", "AA")
+    to_nucleo <- to_seqtype %in% c("DNA", "RNA", "AA")
     if (from_nucleo == to_nucleo)
         return(NULL)
     if (to_seqtype == "DNA")
         return(DNA_STRING_CODEC@enc_lkup)
     if (to_seqtype == "RNA")
         return(RNA_STRING_CODEC@enc_lkup)
+    if (to_seqtype == "AA")
+        return(AA_STRING_CODEC@enc_lkup)
     if (from_seqtype == "DNA")
         return(DNA_STRING_CODEC@dec_lkup)
     if (from_seqtype == "RNA")
         return(RNA_STRING_CODEC@dec_lkup)
+    if (from_seqtype == "AA")
+        return(AA_STRING_CODEC@dec_lkup)
     stop("Biostrings internal error, please report") # should never happen
 }
 
@@ -155,4 +160,3 @@ setMethod("alphabet", "ANY",
         )
     }
 )
-
diff --git a/R/zzz.R b/R/zzz.R
index fb91302d..65e22402 100644
--- a/R/zzz.R
+++ b/R/zzz.R
@@ -8,6 +8,9 @@
     .Call2("init_RNAlkups",
           RNA_STRING_CODEC@enc_lkup, RNA_STRING_CODEC@dec_lkup,
           PACKAGE=pkgname)
+    .Call2("init_AAlkups",
+          AA_STRING_CODEC@enc_lkup, AA_STRING_CODEC@dec_lkup,
+          PACKAGE=pkgname)
     DNA_AND_RNA_COLORED_LETTERS <<- make_DNA_AND_RNA_COLORED_LETTERS()
     AA_COLORED_LETTERS <<- make_AA_COLORED_LETTERS()
     option_name <- "Biostrings.coloring"
diff --git a/src/Biostrings.h b/src/Biostrings.h
index 313089a4..c5f211ac 100644
--- a/src/Biostrings.h
+++ b/src/Biostrings.h
@@ -78,6 +78,13 @@ char _RNAencode(char c);
 
 char _RNAdecode(char code);
 
+SEXP init_AAlkups(SEXP enc_lkup, SEXP dec_lkup);
+
+char _AAencode(char c);
+
+char _AAdecode(char code);
+
+
 void _copy_CHARSXP_to_Chars_holder(
 	Chars_holder *dest,
 	SEXP src,
diff --git a/src/R_init_Biostrings.c b/src/R_init_Biostrings.c
index 9f11703d..c0433e4b 100644
--- a/src/R_init_Biostrings.c
+++ b/src/R_init_Biostrings.c
@@ -29,6 +29,7 @@ static const R_CallMethodDef callMethods[] = {
 /* XString_class.c */
 	CALLMETHOD_DEF(init_DNAlkups, 2),
 	CALLMETHOD_DEF(init_RNAlkups, 2),
+	CALLMETHOD_DEF(init_AAlkups, 2),
 	CALLMETHOD_DEF(new_XString_from_CHARACTER, 5),
 
 /* XStringSet_class.c */
@@ -173,6 +174,8 @@ void R_init_Biostrings(DllInfo *info)
 	REGISTER_CCALLABLE(_DNAdecode);
 	REGISTER_CCALLABLE(_RNAencode);
 	REGISTER_CCALLABLE(_RNAdecode);
+	REGISTER_CCALLABLE(_AAencode);
+	REGISTER_CCALLABLE(_AAdecode);
 
 /* XStringSet_class.c */
 	REGISTER_CCALLABLE(_get_XStringSet_length);
diff --git a/src/XString_class.c b/src/XString_class.c
index bbe0124a..8bf7cf30 100644
--- a/src/XString_class.c
+++ b/src/XString_class.c
@@ -14,7 +14,8 @@
  */
 
 static ByteTrTable DNA_enc_byte2code, DNA_dec_byte2code,
-		   RNA_enc_byte2code, RNA_dec_byte2code;
+		   RNA_enc_byte2code, RNA_dec_byte2code,
+		   AA_enc_byte2code, AA_dec_byte2code;
 
 const ByteTrTable *get_enc_byte2code(const char *classname)
 {
@@ -95,6 +96,36 @@ char _RNAdecode(char code)
 }
 
 
+/* --- .Call ENTRY POINT --- */
+SEXP init_AAlkups(SEXP enc_lkup, SEXP dec_lkup)
+{
+	_init_ByteTrTable_with_lkup(&AA_enc_byte2code, enc_lkup);
+	_init_ByteTrTable_with_lkup(&AA_dec_byte2code, dec_lkup);
+	return R_NilValue;
+}
+
+char _AAencode(char c)
+{
+	int code;
+
+	code = AA_enc_byte2code.byte2code[(unsigned char) c];
+	if (code == NA_INTEGER)
+		error("_AAencode(): invalid AAString "
+		      "input character: '%c' (byte value %d)", c, (int) c);
+	return code;
+}
+
+char _AAdecode(char code)
+{
+	int c;
+
+	c = AA_dec_byte2code.byte2code[(unsigned char) code];
+	if (c == NA_INTEGER)
+		error("_AAdecode(): invalid AAString "
+		      "internal code: %d", (int) code);
+	return (char) c;
+}
+
 /****************************************************************************
  * From CHARACTER to XString and vice-versa.
  */

From 8eb59876f25e8d08354471a4e59a31d15ea5014a Mon Sep 17 00:00:00 2001
From: ahl27 <ahl27@pitt.edu>
Date: Tue, 28 Mar 2023 11:58:01 -0400
Subject: [PATCH 2/9] fix: formatting and documentation update

---
 man/XStringSet-io.Rd | 9 ++++-----
 src/Biostrings.h     | 1 -
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/man/XStringSet-io.Rd b/man/XStringSet-io.Rd
index 71475bb3..1f389842 100644
--- a/man/XStringSet-io.Rd
+++ b/man/XStringSet-io.Rd
@@ -137,9 +137,8 @@ saveXStringSet(x, objname, dirpath=".", save.dups=FALSE, verbose=TRUE)
       \item \code{"RNA"} for RNA sequences i.e. only letters in
             \code{\link{RNA_ALPHABET}} (case ignored) are valid
             one-letter sequence codes.
-      \item \code{"AA"} for Amino Acid sequences. Currently treated as
-            \code{"B"} but this will change in the near future i.e. only
-            letters in \code{\link{AA_ALPHABET}} (case ignored) will be
+      \item \code{"AA"} for Amino Acid sequences i.e. only
+            letters in \code{\link{AA_ALPHABET}} (case ignored) are
             valid one-letter sequence codes.
     }
     Invalid one-letter sequence codes are ignored with a warning.
@@ -192,7 +191,7 @@ saveXStringSet(x, objname, dirpath=".", save.dups=FALSE, verbose=TRUE)
   \item{save.dups}{
     \code{TRUE} or \code{FALSE}.
     If \code{TRUE} then the \code{\link[IRanges:Grouping-class]{Dups}}
-    object describing 
+    object describing
     how duplicated elements in \code{x} are related to each other is
     saved too. For advanced users only.
   }
@@ -210,7 +209,7 @@ saveXStringSet(x, objname, dirpath=".", save.dups=FALSE, verbose=TRUE)
   load sequences from an input file (or multiple input files) into an
   \link{XStringSet} object. When multiple input files are specified,
   all must have the same format (i.e. FASTA or FASTQ) and files with
-  different compression types can be mixed with non-compressed files. 
+  different compression types can be mixed with non-compressed files.
   The files are read in the order they were specified and the sequences
   are stored in the returned object in the order they were read.
 
diff --git a/src/Biostrings.h b/src/Biostrings.h
index c5f211ac..0cca89e6 100644
--- a/src/Biostrings.h
+++ b/src/Biostrings.h
@@ -84,7 +84,6 @@ char _AAencode(char c);
 
 char _AAdecode(char code);
 
-
 void _copy_CHARSXP_to_Chars_holder(
 	Chars_holder *dest,
 	SEXP src,

From ac715df0313194eb1005f24c9d001498e0a3dbb5 Mon Sep 17 00:00:00 2001
From: ahl27 <ahl27@pitt.edu>
Date: Wed, 29 Mar 2023 17:22:46 -0400
Subject: [PATCH 3/9] remove encode/decode for simpler version

---
 R/XStringCodec-class.R | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/R/XStringCodec-class.R b/R/XStringCodec-class.R
index 45b2b368..636633a6 100644
--- a/R/XStringCodec-class.R
+++ b/R/XStringCodec-class.R
@@ -201,25 +201,26 @@ getRNAComplementLookup <- function()
 
 ### AAStrings don't need to support DNA/RNA or complementation,
 ### so the code to generate the codec can be a lot simpler
-AA_LOOKUP_CODES <- c(A=1L, R=2L, N=3L, D=4L,
-                        C=5L, Q=6L, E=7L, G=8L,
-                        H=9L, I=10L, L=11L, K=12L,
-                        M=13L, F=14L, P=15L, S=16L,
-                        T=17L, W=18L, Y=19L, V=20L,
-                        U=21L, O=22L,
-                        B=23L, J=24L, Z=25L, X=26L,
-                        `*`=27L, `-`=28L, `+`=29L, `.`=30L)
+AAcodes <- function(baseOnly)
+{
+    if (!isTRUEorFALSE(baseOnly))
+        stop("'baseOnly' must be TRUE or FALSE")
+    letters <- AA_ALPHABET
+    if (baseOnly)
+        letters <- head(letters, n=20L)
+    setNames(.letterAsByteVal(letters), letters)
+}
 
 ### AA codec.
 .XStringCodec.AA <- function(codes)
 {
     letters <- names(codes)
     extra_letters <- setdiff(tolower(letters), letters)
-    extra_codes <- codes[toupper(extra_letters)]
+    extra_codes <- .letterAsByteVal(toupper(extra_letters))
     new("XStringCodec", letters, codes, extra_letters, extra_codes)
 }
 
-AAcodes <- function(baseOnly) AA_LOOKUP_CODES[seq_len(ifelse(baseOnly, 20, 30))]
+AA_LOOKUP_CODES <- AAcodes(FALSE)
 AA_STRING_CODEC <- .XStringCodec.AA(AA_LOOKUP_CODES)
 
 ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

From a77e0c60e9a556667b829d2337396f508b042ee8 Mon Sep 17 00:00:00 2001
From: ahl27 <ahl27@pitt.edu>
Date: Thu, 30 Mar 2023 08:41:48 -0400
Subject: [PATCH 4/9] adds C routines to Biostrings_interface and
 _Biostrings_stubs

---
 inst/include/Biostrings_interface.h |  4 ++++
 inst/include/_Biostrings_stubs.c    | 10 ++++++++++
 2 files changed, 14 insertions(+)

diff --git a/inst/include/Biostrings_interface.h b/inst/include/Biostrings_interface.h
index dc8000c8..e119bc55 100644
--- a/inst/include/Biostrings_interface.h
+++ b/inst/include/Biostrings_interface.h
@@ -128,6 +128,10 @@ char RNAencode(char c);
 
 char RNAdecode(char code);
 
+char AAencode(char c);
+
+char AAdecode(char code);
+
 /*
  * Low-level manipulation of XStringSet objects.
  * (see XStringSet_class.c)
diff --git a/inst/include/_Biostrings_stubs.c b/inst/include/_Biostrings_stubs.c
index 3cfb9a15..86e72fb3 100644
--- a/inst/include/_Biostrings_stubs.c
+++ b/inst/include/_Biostrings_stubs.c
@@ -51,6 +51,16 @@ DEFINE_CCALLABLE_STUB(char, RNAdecode,
 	(     code)
 )
 
+DEFINE_CCALLABLE_STUB(char, AAencode,
+	(char c),
+	(     c)
+)
+
+DEFINE_CCALLABLE_STUB(char, AAdecode,
+	(char code),
+	(     code)
+)
+
 /*
  * Stubs for callables defined in XStringSet_class.c
  */

From 35d7ca9ac94330382c0db0ad42c36df0edb93d36 Mon Sep 17 00:00:00 2001
From: ahl27 <ahl27@pitt.edu>
Date: Wed, 12 Apr 2023 09:21:09 -0400
Subject: [PATCH 5/9] fix: `baseOnly` argument for AAString and AAStringSet

---
 R/letterFrequency.R | 22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/R/letterFrequency.R b/R/letterFrequency.R
index 36f2fe89..65e8cd78 100644
--- a/R/letterFrequency.R
+++ b/R/letterFrequency.R
@@ -131,14 +131,13 @@
     ans
 }
 
-.XString.amino_acid_frequency <- function(x, as.prob)
+.XString.amino_acid_frequency <- function(x, as.prob, baseOnly)
 {
     if (!isTRUEorFALSE(as.prob))
         stop("'as.prob' must be TRUE or FALSE")
-    codes <- as.integer(AAString(paste0(AA_ALPHABET, collapse="")))
-    names(codes) <- AA_ALPHABET    
+    codes <- xscodes(x, baseOnly=baseOnly)
     ans <- .Call2("XString_letter_frequency",
-                 x, codes, TRUE,
+                 x, codes, baseOnly,
                  PACKAGE="Biostrings")
     if (as.prob)
         ans <- ans / nchar(x) # nchar(x) is sum(ans) but faster
@@ -180,15 +179,14 @@
     ans
 }
 
-.XStringSet.amino_acid_frequency <- function(x, as.prob, collapse)
+.XStringSet.amino_acid_frequency <- function(x, as.prob, collapse, baseOnly)
 {
     if (!isTRUEorFALSE(as.prob))
         stop("'as.prob' must be TRUE or FALSE")
     collapse <- .normargCollapse(collapse)
-    codes <- as.integer(AAString(paste0(AA_ALPHABET, collapse="")))
-    names(codes) <- AA_ALPHABET
+    codes <- xscodes(x, baseOnly=baseOnly)
     ans <- .Call2("XStringSet_letter_frequency",
-                 x, collapse, codes, TRUE,
+                 x, collapse, codes, baseOnly,
                  PACKAGE="Biostrings")
     if (as.prob) {
         if (collapse)
@@ -219,8 +217,8 @@ setMethod("alphabetFrequency", "RNAString",
 )
 
 setMethod("alphabetFrequency", "AAString",
-    function(x, as.prob=FALSE)
-        .XString.amino_acid_frequency(x, as.prob)
+    function(x, as.prob=FALSE, baseOnly=FALSE)
+        .XString.amino_acid_frequency(x, as.prob, baseOnly)
 )
 
 setMethod("alphabetFrequency", "XStringSet",
@@ -239,8 +237,8 @@ setMethod("alphabetFrequency", "RNAStringSet",
 )
 
 setMethod("alphabetFrequency", "AAStringSet",
-    function(x, as.prob=FALSE, collapse=FALSE)
-        .XStringSet.amino_acid_frequency(x, as.prob, collapse)
+    function(x, as.prob=FALSE, collapse=FALSE, baseOnly=FALSE)
+        .XStringSet.amino_acid_frequency(x, as.prob, collapse, baseOnly)
 )
 
 ### library(drosophila2probe)

From 6f8fb36e5e15d304908e7b0e9c6367f18f8490e3 Mon Sep 17 00:00:00 2001
From: ahl27 <ahl27@pitt.edu>
Date: Wed, 12 Apr 2023 09:29:32 -0400
Subject: [PATCH 6/9] fix: adds methods for hasOnlyBaseLetters for AAString and
 AAStringSet, updates man page

---
 R/letterFrequency.R    | 10 ++++++++++
 man/letterFrequency.Rd | 19 +++++++++++--------
 2 files changed, 21 insertions(+), 8 deletions(-)

diff --git a/R/letterFrequency.R b/R/letterFrequency.R
index 65e8cd78..39d754e8 100644
--- a/R/letterFrequency.R
+++ b/R/letterFrequency.R
@@ -279,6 +279,11 @@ setMethod("hasOnlyBaseLetters", "RNAString",
     function(x) hasOnlyBaseLetters(DNAString(x))
 )
 
+setMethod("hasOnlyBaseLetters", "AAString",
+    function(x)
+        alphabetFrequency(x, baseOnly=TRUE)[["other"]] == 0L
+)
+
 setMethod("hasOnlyBaseLetters", "DNAStringSet",
     function(x)
         alphabetFrequency(x, collapse=TRUE, baseOnly=TRUE)[["other"]] == 0L
@@ -288,6 +293,11 @@ setMethod("hasOnlyBaseLetters", "RNAStringSet",
     function(x) hasOnlyBaseLetters(DNAStringSet(x))
 )
 
+setMethod("hasOnlyBaseLetters", "AAStringSet",
+    function(x)
+        alphabetFrequency(x, collapse=TRUE, baseOnly=TRUE)[["other"]] == 0L
+)
+
 setMethod("hasOnlyBaseLetters", "XStringViews",
     function(x)
     {
diff --git a/man/letterFrequency.Rd b/man/letterFrequency.Rd
index ab2cdce9..6e94befa 100644
--- a/man/letterFrequency.Rd
+++ b/man/letterFrequency.Rd
@@ -92,7 +92,7 @@ consensusMatrix(x, as.prob=FALSE, shift=0L, width=NULL, ...)
 \S4method{consensusString}{matrix}(x, ambiguityMap="?", threshold=0.5)
 \S4method{consensusString}{DNAStringSet}(x, ambiguityMap=IUPAC_CODE_MAP,
              threshold=0.25, shift=0L, width=NULL)
-\S4method{consensusString}{RNAStringSet}(x, 
+\S4method{consensusString}{RNAStringSet}(x,
              ambiguityMap=
              structure(as.character(RNAStringSet(DNAStringSet(IUPAC_CODE_MAP))),
                        names=
@@ -210,7 +210,7 @@ consensusMatrix(x, as.prob=FALSE, shift=0L, width=NULL, ...)
     those sequences have been shifted (see the \code{shift} argument above).
     This ensures that any wider consensus matrix would be a "padded with zeros"
     version of the matrix returned when \code{width=NULL}.
-    
+
     The length of the returned sequence for the \code{consensusString}
     method for \link{XStringSet} objects.
   }
@@ -255,10 +255,12 @@ consensusMatrix(x, as.prob=FALSE, shift=0L, width=NULL, ...)
   \link{XStringSet} or \link{XStringViews} object, then it returns
   an integer matrix with \code{length(x)} rows where the
   \code{i}-th row contains the frequencies for \code{x[[i]]}.
-  If \code{x} is a DNA or RNA input, then the returned vector is named
+  If \code{x} is a DNA, RNA, or AA input, then the returned vector is named
   with the letters in the alphabet. If the \code{baseOnly} argument is
-  \code{TRUE}, then the returned vector has only 5 elements: 4 elements
-  corresponding to the 4 nucleotides + the 'other' element.
+  \code{TRUE}, then the returned vector has only 5 elements for DNA/RNA input
+  (4 elements corresponding to the 4 nucleotides + the 'other' element) and
+  21 elements for AA input (20 elements corresponding to the 20 base amino acids
+  + the 'other' element).
 
   \code{letterFrequency} returns, similarly, an integer vector or matrix,
   but restricted and/or collated according to \code{letters} and \code{OR}.
@@ -270,7 +272,8 @@ consensusMatrix(x, as.prob=FALSE, shift=0L, width=NULL, ...)
 
   \code{hasOnlyBaseLetters} returns \code{TRUE} or \code{FALSE} indicating
   whether or not \code{x} contains only base letters (i.e. As, Cs, Gs and Ts
-  for DNA input and As, Cs, Gs and Us for RNA input).
+  for DNA input, As, Cs, Gs and Us for RNA input, or any of the 20 standard
+  amino acids for AA input).
 
   \code{uniqueLetters} returns a vector of 1-letter or empty strings. The empty
   string is used to represent the nul character if \code{x} happens to contain
@@ -391,8 +394,8 @@ consensusString(sort(probes)[1:5], ambiguityMap = "N", threshold = 0.5)
 ## Consensus involving ambiguity letters in the input strings
 consensusString(DNAStringSet(c("NNNN","ACTG")))
 consensusString(DNAStringSet(c("AANN","ACTG")))
-consensusString(DNAStringSet(c("ACAG","ACAR"))) 
-consensusString(DNAStringSet(c("ACAG","ACAR", "ACAG"))) 
+consensusString(DNAStringSet(c("ACAG","ACAR")))
+consensusString(DNAStringSet(c("ACAG","ACAR", "ACAG")))
 
 ## ---------------------------------------------------------------------
 ## C. RELATIONSHIP BETWEEN consensusMatrix() AND coverage()

From ef1269fa659c9d0252d3dbfd39f72d0d09c25312 Mon Sep 17 00:00:00 2001
From: ahl27 <ahl27@pitt.edu>
Date: Wed, 12 Apr 2023 10:18:00 -0400
Subject: [PATCH 7/9] adds updateObject methods for AAString and extensions

---
 R/XString-class.R        | 16 ++++++++++++++++
 R/XStringSet-class.R     | 10 ++++++++++
 R/XStringSetList-class.R | 10 ++++++++++
 3 files changed, 36 insertions(+)

diff --git a/R/XString-class.R b/R/XString-class.R
index 5a8e0828..8567ebca 100644
--- a/R/XString-class.R
+++ b/R/XString-class.R
@@ -481,3 +481,19 @@ setMethod("updateObject", "XString",
     }
 )
 
+setMethod("updateObject", "AAString",
+    function(object, ..., verbose=FALSE)
+    {
+        codec <- xscodec(AAString())
+        class(object) <- "BString"
+        mapping <- vapply(uniqueLetters(object), utf8ToInt, integer(1L))
+        missingVals <- is.na(codec@enc_lkup[mapping+1L])
+        if(any(missingVals)){
+            errorChars <- paste(names(mapping)[which(missingVals)],
+                                collapse=', ')
+            stop("Cannot decode, AAString contains invalid character(s): ",
+                  errorChars)
+        }
+        AAString(object)
+    }
+)
diff --git a/R/XStringSet-class.R b/R/XStringSet-class.R
index 6166aed9..e331e62d 100644
--- a/R/XStringSet-class.R
+++ b/R/XStringSet-class.R
@@ -624,3 +624,13 @@ setMethod("updateObject", "XStringSet",
     }
 )
 
+setMethod("updateObject", "AAStringSet",
+    function(object, ..., verbose=FALSE)
+    {
+        for(i in seq_along(object)){
+            object[[i]] <- updateObject(object[[i]])
+        }
+        object
+    }
+)
+
diff --git a/R/XStringSetList-class.R b/R/XStringSetList-class.R
index f1837780..752e2234 100644
--- a/R/XStringSetList-class.R
+++ b/R/XStringSetList-class.R
@@ -271,3 +271,13 @@ setMethod("showAsCell", "XStringSetList",
 
 setMethod("nchar", "XStringSetList", IRanges:::nchar_CompressedList)
 
+setMethod("updateObject", "AAStringSetList",
+          function(object, ..., verbose=FALSE)
+          {
+              lst <- as.list(object)
+              for(i in seq_along(lst)){
+                  lst[[i]] <- updateObject(lst[[i]])
+              }
+              AAStringSetList(lst)
+          }
+)

From 2505f1c974846f49affcccf77401e4eecd23e372 Mon Sep 17 00:00:00 2001
From: ahl27 <ahl27@pitt.edu>
Date: Wed, 12 Apr 2023 10:28:09 -0400
Subject: [PATCH 8/9] fix: comments

---
 R/XString-class.R        | 1 +
 R/XStringSet-class.R     | 2 ++
 R/XStringSetList-class.R | 7 +++++++
 3 files changed, 10 insertions(+)

diff --git a/R/XString-class.R b/R/XString-class.R
index 8567ebca..82e584b4 100644
--- a/R/XString-class.R
+++ b/R/XString-class.R
@@ -481,6 +481,7 @@ setMethod("updateObject", "XString",
     }
 )
 
+### Update AAString objects created before AA_ALPHABET was enforced
 setMethod("updateObject", "AAString",
     function(object, ..., verbose=FALSE)
     {
diff --git a/R/XStringSet-class.R b/R/XStringSet-class.R
index e331e62d..c6e6fedc 100644
--- a/R/XStringSet-class.R
+++ b/R/XStringSet-class.R
@@ -624,6 +624,8 @@ setMethod("updateObject", "XStringSet",
     }
 )
 
+### Update AAStringSet objects created before AA_ALPHABET was enforced
+### for AAString objects
 setMethod("updateObject", "AAStringSet",
     function(object, ..., verbose=FALSE)
     {
diff --git a/R/XStringSetList-class.R b/R/XStringSetList-class.R
index 752e2234..3f617a96 100644
--- a/R/XStringSetList-class.R
+++ b/R/XStringSetList-class.R
@@ -271,6 +271,13 @@ setMethod("showAsCell", "XStringSetList",
 
 setMethod("nchar", "XStringSetList", IRanges:::nchar_CompressedList)
 
+
+### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+### updateObject()
+###
+
+### Update AAStringSetList objects created before AA_ALPHABET was enforced
+### for AAString objects
 setMethod("updateObject", "AAStringSetList",
           function(object, ..., verbose=FALSE)
           {

From 0ab5eae4dc9f5f5403c3eacac875f42f39b3f223 Mon Sep 17 00:00:00 2001
From: ahl27 <ahl27@pitt.edu>
Date: Wed, 28 Jun 2023 12:22:01 -0400
Subject: [PATCH 9/9] Updates updateObject methods according to feedback from
 Herve

---
 R/XString-class.R        |  8 ++++++--
 R/XStringSet-class.R     | 24 ++++++++++++++++++++----
 R/XStringSetList-class.R | 18 ------------------
 3 files changed, 26 insertions(+), 24 deletions(-)

diff --git a/R/XString-class.R b/R/XString-class.R
index 82e584b4..cec49645 100644
--- a/R/XString-class.R
+++ b/R/XString-class.R
@@ -474,10 +474,11 @@ setMethod("updateObject", "XString",
         ans_shared <- new("SharedRaw")
         ans_shared@xp <- xdata@xp
         ans_shared@.link_to_cached_object=xdata@.link_to_cached_object
-        new(class(object),
+        new2(class(object),
             shared=ans_shared,
             offset=object@offset,
-            length=object@length)
+            length=object@length,
+            check=FALSE)
     }
 )
 
@@ -485,6 +486,9 @@ setMethod("updateObject", "XString",
 setMethod("updateObject", "AAString",
     function(object, ..., verbose=FALSE)
     {
+        # Start by calling general XString update function
+        object <- callNextMethod()
+
         codec <- xscodec(AAString())
         class(object) <- "BString"
         mapping <- vapply(uniqueLetters(object), utf8ToInt, integer(1L))
diff --git a/R/XStringSet-class.R b/R/XStringSet-class.R
index c6e6fedc..9e328074 100644
--- a/R/XStringSet-class.R
+++ b/R/XStringSet-class.R
@@ -624,15 +624,31 @@ setMethod("updateObject", "XStringSet",
     }
 )
 
+### Helper method to update general XStringSet objects efficiently
+.updateObject_XStringSet <- function(object, ..., verbose=FALSE){
+    baseclass <- xsbaseclass(object)
+    ### Update SharedRaw elements directly
+    ### Significantly fewer SharedRaw objects than XStrings,
+    ### So as long as we don't modify the order of the letters
+    ### this will be significantly faster
+    for(i in seq_along(object@pool)){
+        shared <- object@pool[[i]] # SharedRaw object
+        xs <- new2(baseclass, shared=shared, length=length(shared), check=FALSE)
+        xs <- updateObject(xs, verbose=verbose)
+        object@pool[[i]] <- xs@shared
+    }
+    object
+}
+
 ### Update AAStringSet objects created before AA_ALPHABET was enforced
 ### for AAString objects
 setMethod("updateObject", "AAStringSet",
     function(object, ..., verbose=FALSE)
     {
-        for(i in seq_along(object)){
-            object[[i]] <- updateObject(object[[i]])
-        }
-        object
+        # Start by calling general XStringSet update function
+        object <- callNextMethod()
+        object <- compact(object)
+        .updateObject_XStringSet(object, ..., verbose=verbose)
     }
 )
 
diff --git a/R/XStringSetList-class.R b/R/XStringSetList-class.R
index 3f617a96..5aa65cc2 100644
--- a/R/XStringSetList-class.R
+++ b/R/XStringSetList-class.R
@@ -270,21 +270,3 @@ setMethod("showAsCell", "XStringSetList",
 ###
 
 setMethod("nchar", "XStringSetList", IRanges:::nchar_CompressedList)
-
-
-### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-### updateObject()
-###
-
-### Update AAStringSetList objects created before AA_ALPHABET was enforced
-### for AAString objects
-setMethod("updateObject", "AAStringSetList",
-          function(object, ..., verbose=FALSE)
-          {
-              lst <- as.list(object)
-              for(i in seq_along(lst)){
-                  lst[[i]] <- updateObject(lst[[i]])
-              }
-              AAStringSetList(lst)
-          }
-)