Skip to content

Commit

Permalink
Add guessparse function
Browse files Browse the repository at this point in the history
This function is a quick-and-dirty parser function from `AbstractString` to
`LongSequence`, with autodetection of the alphabet.
It's meant to be used in ephemeral REPL work, and very clearly documented to
be unstable and subject to change.

See BioJulia#268
  • Loading branch information
jakobnissen committed Feb 27, 2023
1 parent 4a31474 commit fdbb494
Show file tree
Hide file tree
Showing 3 changed files with 89 additions and 1 deletion.
1 change: 1 addition & 0 deletions src/BioSequences.jl
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@ export
###
### LongSequence
###
guessparse,

# Type & aliases
LongSequence,
Expand Down
34 changes: 34 additions & 0 deletions src/alphabet.jl
Original file line number Diff line number Diff line change
Expand Up @@ -308,3 +308,37 @@ for (anum, atype) in enumerate((DNAAlphabet{4}, DNAAlphabet{2}, RNAAlphabet{4},
ascii_encode(::$(atype), x::UInt8) = @inbounds $(tablename)[x + 1]
end
end

const GUESS_LUT = let
v = zeros(UInt8, 128)
for (off, A) in [
(0, DNAAlphabet{4}()),
(1, RNAAlphabet{4}()),
(2, DNAAlphabet{2}()), (2, RNAAlphabet{2}()),
(3, AminoAcidAlphabet())
]
for i in A
for b in (UInt8(lowercase(Char(i))), UInt8(uppercase(Char(i))))
v[b + 0x01] |= 0x01 << off
end
end
end
NTuple{128, UInt8}(v)
end

"""
possible_encodings(b::UInt8)::UInt8
Returns a `UInt8` with any of the 4 lower bits set:
* Bit 0: Valid `RNA`
* Bit 1: Valid `RNA`
* Bit 2: Valid 2-bit nucleotide
* Bit 3: Valid `AminoAcidAlphabet`
"""
function possible_encodings(b::UInt8)
mask = @inbounds GUESS_LUT[(b & 0x7f) + 0x01]
# This is just a way to set the result to 0x00 if b > 0x7f
# which compiles efficiently
mask >> (ifelse(b > 0x7f, 0xff, 0x00) & 0x07)
end

55 changes: 54 additions & 1 deletion src/longsequences/constructors.jl
Original file line number Diff line number Diff line change
Expand Up @@ -85,4 +85,57 @@ function LongSequence{A}(
return copyto!(seq, 1, src, first(part), len)
end

Base.parse(::Type{LongSequence{A}}, seq::AbstractString) where A = LongSequence{A}(seq)
Base.parse(::Type{LongSequence{A}}, seq::AbstractString) where A = LongSequence{A}(seq)

guess_alphabet(s::Union{String, SubString{String}}) = guess_alphabet(codeunits(s))
function guess_alphabet(v::AbstractVector{UInt8})
mask = mapreduce(possible_encodings, &, v; init=0x0f)
dna = isodd(mask >> 0x00)
rna = isodd(mask >> 0x01)
unambiguous = isodd(mask >> 0x02)
aa = isodd(mask >> 0x03)
if dna & rna
error("Sequences is both valid DNA and RNA")
elseif dna
unambiguous ? DNAAlphabet{2} : DNAAlphabet{4}
elseif rna
unambiguous ? RNAAlphabet{2} : RNAAlphabet{4}
elseif aa
AminoAcidAlphabet
else
error("Sequence is not valid DNA, RNA or amino acid.")
end
end

"""
guessparse(s::AbstractString)::BioSequence
Parse `s` into a `BioSequence`, and tries to guess which kind of biosequence.
The precise guessing algorithm is an implementation detail and not to be relied on.
This function is meant to be used in ephemeral REPL work, not in package code.
Its precise behaviour is subject to change in minor versions.
# Current behaviour (subject to change)
Currently, `guessparse` will error on sequences that can be either DNA or RNA sequences,
and on sequences that are neither DNA, RNA or aminoacid sequences.
It will return a `LongSequence{A}`, where `A` is determined in the following priority:
2-bit DNA/RNAAlphabet -> 4-bit DNA/RNAAlphabet -> AminoAcidAlphabet
# Examples:
```
julia> typeof(guessparse("AGTGCA"))
LongDNA{2}
julia> typeof(guessparse("AGCGAWSN"))
Error:
[...]
julia> typeof(guessparse("UGAUCSSDDC"))
LongRNA{4}
julia> typeof(guessparse("KLEWSNYKHACQQV"))
LongAA
```
"""
guessparse(v::Union{SubString{String}, String}) = LongSequence{guess_alphabet(v)}(v)
guessparse(s::AbstractString) = guessparse(String(s))

0 comments on commit fdbb494

Please sign in to comment.