Skip to content

Commit

Permalink
fix: Enforcing character set for AAString objects (#97)
Browse files Browse the repository at this point in the history
  • Loading branch information
ahl27 authored Jul 1, 2023
1 parent 03aa8df commit 1a972b4
Show file tree
Hide file tree
Showing 14 changed files with 184 additions and 38 deletions.
27 changes: 24 additions & 3 deletions R/XString-class.R
Original file line number Diff line number Diff line change
Expand Up @@ -249,7 +249,7 @@ setMethod("XString", "AsIs",
function(seqtype, x, start=NA, end=NA, width=NA)
{
if (!is.character(x))
stop("unsuported input type")
stop("unsupported input type")
class(x) <- "character" # keeps the names (unlike as.character())
XString(seqtype, x, start=start, end=end, width=width)
}
Expand Down Expand Up @@ -474,10 +474,31 @@ setMethod("updateObject", "XString",
ans_shared <- new("SharedRaw")
ans_shared@xp <- xdata@xp
ans_shared@.link_to_cached_object=xdata@.link_to_cached_object
new(class(object),
new2(class(object),
shared=ans_shared,
offset=object@offset,
length=object@length)
length=object@length,
check=FALSE)
}
)

### Update AAString objects created before AA_ALPHABET was enforced
setMethod("updateObject", "AAString",
function(object, ..., verbose=FALSE)
{
# Start by calling general XString update function
object <- callNextMethod()

codec <- xscodec(AAString())
class(object) <- "BString"
mapping <- vapply(uniqueLetters(object), utf8ToInt, integer(1L))
missingVals <- is.na(codec@enc_lkup[mapping+1L])
if(any(missingVals)){
errorChars <- paste(names(mapping)[which(missingVals)],
collapse=', ')
stop("Cannot decode, AAString contains invalid character(s): ",
errorChars)
}
AAString(object)
}
)
29 changes: 28 additions & 1 deletion R/XStringCodec-class.R
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,34 @@ getRNAComplementLookup <- function()
}



### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Add extra codecs below...
### The AA alphabet and codec.
###

### AAStrings don't need to support DNA/RNA or complementation,
### so the code to generate the codec can be a lot simpler
AAcodes <- function(baseOnly)
{
if (!isTRUEorFALSE(baseOnly))
stop("'baseOnly' must be TRUE or FALSE")
letters <- AA_ALPHABET
if (baseOnly)
letters <- head(letters, n=20L)
setNames(.letterAsByteVal(letters), letters)
}

### AA codec.
.XStringCodec.AA <- function(codes)
{
letters <- names(codes)
extra_letters <- setdiff(tolower(letters), letters)
extra_codes <- .letterAsByteVal(toupper(extra_letters))
new("XStringCodec", letters, codes, extra_letters, extra_codes)
}

AA_LOOKUP_CODES <- AAcodes(FALSE)
AA_STRING_CODEC <- .XStringCodec.AA(AA_LOOKUP_CODES)

### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Add extra codecs below...
28 changes: 28 additions & 0 deletions R/XStringSet-class.R
Original file line number Diff line number Diff line change
Expand Up @@ -624,3 +624,31 @@ setMethod("updateObject", "XStringSet",
}
)

### Helper method to update general XStringSet objects efficiently
.updateObject_XStringSet <- function(object, ..., verbose=FALSE){
baseclass <- xsbaseclass(object)
### Update SharedRaw elements directly
### Significantly fewer SharedRaw objects than XStrings,
### So as long as we don't modify the order of the letters
### this will be significantly faster
for(i in seq_along(object@pool)){
shared <- object@pool[[i]] # SharedRaw object
xs <- new2(baseclass, shared=shared, length=length(shared), check=FALSE)
xs <- updateObject(xs, verbose=verbose)
object@pool[[i]] <- xs@shared
}
object
}

### Update AAStringSet objects created before AA_ALPHABET was enforced
### for AAString objects
setMethod("updateObject", "AAStringSet",
function(object, ..., verbose=FALSE)
{
# Start by calling general XStringSet update function
object <- callNextMethod()
object <- compact(object)
.updateObject_XStringSet(object, ..., verbose=verbose)
}
)

1 change: 0 additions & 1 deletion R/XStringSetList-class.R
Original file line number Diff line number Diff line change
Expand Up @@ -270,4 +270,3 @@ setMethod("showAsCell", "XStringSetList",
###

setMethod("nchar", "XStringSetList", IRanges:::nchar_CompressedList)

32 changes: 20 additions & 12 deletions R/letterFrequency.R
Original file line number Diff line number Diff line change
Expand Up @@ -131,14 +131,13 @@
ans
}

.XString.amino_acid_frequency <- function(x, as.prob)
.XString.amino_acid_frequency <- function(x, as.prob, baseOnly)
{
if (!isTRUEorFALSE(as.prob))
stop("'as.prob' must be TRUE or FALSE")
codes <- as.integer(AAString(paste0(AA_ALPHABET, collapse="")))
names(codes) <- AA_ALPHABET
codes <- xscodes(x, baseOnly=baseOnly)
ans <- .Call2("XString_letter_frequency",
x, codes, TRUE,
x, codes, baseOnly,
PACKAGE="Biostrings")
if (as.prob)
ans <- ans / nchar(x) # nchar(x) is sum(ans) but faster
Expand Down Expand Up @@ -180,15 +179,14 @@
ans
}

.XStringSet.amino_acid_frequency <- function(x, as.prob, collapse)
.XStringSet.amino_acid_frequency <- function(x, as.prob, collapse, baseOnly)
{
if (!isTRUEorFALSE(as.prob))
stop("'as.prob' must be TRUE or FALSE")
collapse <- .normargCollapse(collapse)
codes <- as.integer(AAString(paste0(AA_ALPHABET, collapse="")))
names(codes) <- AA_ALPHABET
codes <- xscodes(x, baseOnly=baseOnly)
ans <- .Call2("XStringSet_letter_frequency",
x, collapse, codes, TRUE,
x, collapse, codes, baseOnly,
PACKAGE="Biostrings")
if (as.prob) {
if (collapse)
Expand Down Expand Up @@ -219,8 +217,8 @@ setMethod("alphabetFrequency", "RNAString",
)

setMethod("alphabetFrequency", "AAString",
function(x, as.prob=FALSE)
.XString.amino_acid_frequency(x, as.prob)
function(x, as.prob=FALSE, baseOnly=FALSE)
.XString.amino_acid_frequency(x, as.prob, baseOnly)
)

setMethod("alphabetFrequency", "XStringSet",
Expand All @@ -239,8 +237,8 @@ setMethod("alphabetFrequency", "RNAStringSet",
)

setMethod("alphabetFrequency", "AAStringSet",
function(x, as.prob=FALSE, collapse=FALSE)
.XStringSet.amino_acid_frequency(x, as.prob, collapse)
function(x, as.prob=FALSE, collapse=FALSE, baseOnly=FALSE)
.XStringSet.amino_acid_frequency(x, as.prob, collapse, baseOnly)
)

### library(drosophila2probe)
Expand Down Expand Up @@ -281,6 +279,11 @@ setMethod("hasOnlyBaseLetters", "RNAString",
function(x) hasOnlyBaseLetters(DNAString(x))
)

setMethod("hasOnlyBaseLetters", "AAString",
function(x)
alphabetFrequency(x, baseOnly=TRUE)[["other"]] == 0L
)

setMethod("hasOnlyBaseLetters", "DNAStringSet",
function(x)
alphabetFrequency(x, collapse=TRUE, baseOnly=TRUE)[["other"]] == 0L
Expand All @@ -290,6 +293,11 @@ setMethod("hasOnlyBaseLetters", "RNAStringSet",
function(x) hasOnlyBaseLetters(DNAStringSet(x))
)

setMethod("hasOnlyBaseLetters", "AAStringSet",
function(x)
alphabetFrequency(x, collapse=TRUE, baseOnly=TRUE)[["other"]] == 0L
)

setMethod("hasOnlyBaseLetters", "XStringViews",
function(x)
{
Expand Down
18 changes: 11 additions & 7 deletions R/seqtype.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
### "B" | general purpose string(s) | bytes 0-255 | no
### "DNA" | DNA sequence(s) | DNA_ALPHABET | yes
### "RNA" | RNA sequence(s) | RNA_ALPHABET | yes
### "AA" | amino acid sequence(s) | AA_ALPHABET | no
### "AA" | amino acid sequence(s) | AA_ALPHABET | yes
###
### seqtype() returns that sequence type. For example 'seqtype(AAString())'
### returns "AA".
Expand All @@ -25,7 +25,7 @@
### string containers: XString (single sequence), XStringSet (multiple
### sequences), XStringViews (multiple sequences) and MaskedXString (single
### sequence).
###
###

### Exported.
setGeneric("seqtype", function(x) standardGeneric("seqtype"))
Expand All @@ -35,11 +35,10 @@ setGeneric("seqtype<-", signature="x",
function(x, value) standardGeneric("seqtype<-")
)


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Helper functions for which the returned value depends on 'seqtype(x)',
### not on what particular data are in 'x'. Not exported.
###
###

xsbaseclass <- function(x) paste(seqtype(x), "String", sep="")

Expand All @@ -56,6 +55,7 @@ setMethod("xscodes", "ANY",
switch(seqtype,
DNA=DNAcodes(baseOnly),
RNA=RNAcodes(baseOnly),
AA=AAcodes(baseOnly),
0:255
)
}
Expand All @@ -66,6 +66,7 @@ xscodec <- function(x)
switch(seqtype(x),
DNA=DNA_STRING_CODEC,
RNA=RNA_STRING_CODEC,
AA=AA_STRING_CODEC,
NULL
)
}
Expand Down Expand Up @@ -109,18 +110,22 @@ get_seqtype_conversion_lookup <- function(from_seqtype, to_seqtype)
if (!compatible_seqtypes(from_seqtype, to_seqtype))
stop("incompatible sequence types \"",
from_seqtype, "\" and \"", to_seqtype, "\"")
from_nucleo <- from_seqtype %in% c("DNA", "RNA")
to_nucleo <- to_seqtype %in% c("DNA", "RNA")
from_nucleo <- from_seqtype %in% c("DNA", "RNA", "AA")
to_nucleo <- to_seqtype %in% c("DNA", "RNA", "AA")
if (from_nucleo == to_nucleo)
return(NULL)
if (to_seqtype == "DNA")
return(DNA_STRING_CODEC@enc_lkup)
if (to_seqtype == "RNA")
return(RNA_STRING_CODEC@enc_lkup)
if (to_seqtype == "AA")
return(AA_STRING_CODEC@enc_lkup)
if (from_seqtype == "DNA")
return(DNA_STRING_CODEC@dec_lkup)
if (from_seqtype == "RNA")
return(RNA_STRING_CODEC@dec_lkup)
if (from_seqtype == "AA")
return(AA_STRING_CODEC@dec_lkup)
stop("Biostrings internal error, please report") # should never happen
}

Expand Down Expand Up @@ -155,4 +160,3 @@ setMethod("alphabet", "ANY",
)
}
)

3 changes: 3 additions & 0 deletions R/zzz.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@
.Call2("init_RNAlkups",
RNA_STRING_CODEC@enc_lkup, RNA_STRING_CODEC@dec_lkup,
PACKAGE=pkgname)
.Call2("init_AAlkups",
AA_STRING_CODEC@enc_lkup, AA_STRING_CODEC@dec_lkup,
PACKAGE=pkgname)
DNA_AND_RNA_COLORED_LETTERS <<- make_DNA_AND_RNA_COLORED_LETTERS()
AA_COLORED_LETTERS <<- make_AA_COLORED_LETTERS()
option_name <- "Biostrings.coloring"
Expand Down
4 changes: 4 additions & 0 deletions inst/include/Biostrings_interface.h
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,10 @@ char RNAencode(char c);

char RNAdecode(char code);

char AAencode(char c);

char AAdecode(char code);

/*
* Low-level manipulation of XStringSet objects.
* (see XStringSet_class.c)
Expand Down
10 changes: 10 additions & 0 deletions inst/include/_Biostrings_stubs.c
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,16 @@ DEFINE_CCALLABLE_STUB(char, RNAdecode,
( code)
)

DEFINE_CCALLABLE_STUB(char, AAencode,
(char c),
( c)
)

DEFINE_CCALLABLE_STUB(char, AAdecode,
(char code),
( code)
)

/*
* Stubs for callables defined in XStringSet_class.c
*/
Expand Down
9 changes: 4 additions & 5 deletions man/XStringSet-io.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -137,9 +137,8 @@ saveXStringSet(x, objname, dirpath=".", save.dups=FALSE, verbose=TRUE)
\item \code{"RNA"} for RNA sequences i.e. only letters in
\code{\link{RNA_ALPHABET}} (case ignored) are valid
one-letter sequence codes.
\item \code{"AA"} for Amino Acid sequences. Currently treated as
\code{"B"} but this will change in the near future i.e. only
letters in \code{\link{AA_ALPHABET}} (case ignored) will be
\item \code{"AA"} for Amino Acid sequences i.e. only
letters in \code{\link{AA_ALPHABET}} (case ignored) are
valid one-letter sequence codes.
}
Invalid one-letter sequence codes are ignored with a warning.
Expand Down Expand Up @@ -192,7 +191,7 @@ saveXStringSet(x, objname, dirpath=".", save.dups=FALSE, verbose=TRUE)
\item{save.dups}{
\code{TRUE} or \code{FALSE}.
If \code{TRUE} then the \code{\link[IRanges:Grouping-class]{Dups}}
object describing
object describing
how duplicated elements in \code{x} are related to each other is
saved too. For advanced users only.
}
Expand All @@ -210,7 +209,7 @@ saveXStringSet(x, objname, dirpath=".", save.dups=FALSE, verbose=TRUE)
load sequences from an input file (or multiple input files) into an
\link{XStringSet} object. When multiple input files are specified,
all must have the same format (i.e. FASTA or FASTQ) and files with
different compression types can be mixed with non-compressed files.
different compression types can be mixed with non-compressed files.
The files are read in the order they were specified and the sequences
are stored in the returned object in the order they were read.

Expand Down
19 changes: 11 additions & 8 deletions man/letterFrequency.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ consensusMatrix(x, as.prob=FALSE, shift=0L, width=NULL, ...)
\S4method{consensusString}{matrix}(x, ambiguityMap="?", threshold=0.5)
\S4method{consensusString}{DNAStringSet}(x, ambiguityMap=IUPAC_CODE_MAP,
threshold=0.25, shift=0L, width=NULL)
\S4method{consensusString}{RNAStringSet}(x,
\S4method{consensusString}{RNAStringSet}(x,
ambiguityMap=
structure(as.character(RNAStringSet(DNAStringSet(IUPAC_CODE_MAP))),
names=
Expand Down Expand Up @@ -210,7 +210,7 @@ consensusMatrix(x, as.prob=FALSE, shift=0L, width=NULL, ...)
those sequences have been shifted (see the \code{shift} argument above).
This ensures that any wider consensus matrix would be a "padded with zeros"
version of the matrix returned when \code{width=NULL}.
The length of the returned sequence for the \code{consensusString}
method for \link{XStringSet} objects.
}
Expand Down Expand Up @@ -255,10 +255,12 @@ consensusMatrix(x, as.prob=FALSE, shift=0L, width=NULL, ...)
\link{XStringSet} or \link{XStringViews} object, then it returns
an integer matrix with \code{length(x)} rows where the
\code{i}-th row contains the frequencies for \code{x[[i]]}.
If \code{x} is a DNA or RNA input, then the returned vector is named
If \code{x} is a DNA, RNA, or AA input, then the returned vector is named
with the letters in the alphabet. If the \code{baseOnly} argument is
\code{TRUE}, then the returned vector has only 5 elements: 4 elements
corresponding to the 4 nucleotides + the 'other' element.
\code{TRUE}, then the returned vector has only 5 elements for DNA/RNA input
(4 elements corresponding to the 4 nucleotides + the 'other' element) and
21 elements for AA input (20 elements corresponding to the 20 base amino acids
+ the 'other' element).

\code{letterFrequency} returns, similarly, an integer vector or matrix,
but restricted and/or collated according to \code{letters} and \code{OR}.
Expand All @@ -270,7 +272,8 @@ consensusMatrix(x, as.prob=FALSE, shift=0L, width=NULL, ...)

\code{hasOnlyBaseLetters} returns \code{TRUE} or \code{FALSE} indicating
whether or not \code{x} contains only base letters (i.e. As, Cs, Gs and Ts
for DNA input and As, Cs, Gs and Us for RNA input).
for DNA input, As, Cs, Gs and Us for RNA input, or any of the 20 standard
amino acids for AA input).

\code{uniqueLetters} returns a vector of 1-letter or empty strings. The empty
string is used to represent the nul character if \code{x} happens to contain
Expand Down Expand Up @@ -391,8 +394,8 @@ consensusString(sort(probes)[1:5], ambiguityMap = "N", threshold = 0.5)
## Consensus involving ambiguity letters in the input strings
consensusString(DNAStringSet(c("NNNN","ACTG")))
consensusString(DNAStringSet(c("AANN","ACTG")))
consensusString(DNAStringSet(c("ACAG","ACAR")))
consensusString(DNAStringSet(c("ACAG","ACAR", "ACAG")))
consensusString(DNAStringSet(c("ACAG","ACAR")))
consensusString(DNAStringSet(c("ACAG","ACAR", "ACAG")))

## ---------------------------------------------------------------------
## C. RELATIONSHIP BETWEEN consensusMatrix() AND coverage()
Expand Down
Loading

0 comments on commit 1a972b4

Please sign in to comment.