diff --git a/src/BioSequences.jl b/src/BioSequences.jl index 111d33b1..4fd9889c 100644 --- a/src/BioSequences.jl +++ b/src/BioSequences.jl @@ -133,6 +133,7 @@ export ### ### LongSequence ### + guessparse, # Type & aliases LongSequence, diff --git a/src/alphabet.jl b/src/alphabet.jl index 6d273779..615e4d20 100644 --- a/src/alphabet.jl +++ b/src/alphabet.jl @@ -308,3 +308,37 @@ for (anum, atype) in enumerate((DNAAlphabet{4}, DNAAlphabet{2}, RNAAlphabet{4}, ascii_encode(::$(atype), x::UInt8) = @inbounds $(tablename)[x + 1] end end + +const GUESS_LUT = let + v = zeros(UInt8, 128) + for (off, A) in [ + (0, DNAAlphabet{4}()), + (1, RNAAlphabet{4}()), + (2, DNAAlphabet{2}()), (2, RNAAlphabet{2}()), + (3, AminoAcidAlphabet()) + ] + for i in A + for b in (UInt8(lowercase(Char(i))), UInt8(uppercase(Char(i)))) + v[b + 0x01] |= 0x01 << off + end + end + end + NTuple{128, UInt8}(v) +end + +""" + possible_encodings(b::UInt8)::UInt8 + +Returns a `UInt8` with any of the 4 lower bits set: +* Bit 0: Valid `RNA` +* Bit 1: Valid `RNA` +* Bit 2: Valid 2-bit nucleotide +* Bit 3: Valid `AminoAcidAlphabet` +""" +function possible_encodings(b::UInt8) + mask = @inbounds GUESS_LUT[(b & 0x7f) + 0x01] + # This is just a way to set the result to 0x00 if b > 0x7f + # which compiles efficiently + mask >> (ifelse(b > 0x7f, 0xff, 0x00) & 0x07) +end + diff --git a/src/longsequences/constructors.jl b/src/longsequences/constructors.jl index abdd2f94..9d2b6011 100644 --- a/src/longsequences/constructors.jl +++ b/src/longsequences/constructors.jl @@ -85,4 +85,57 @@ function LongSequence{A}( return copyto!(seq, 1, src, first(part), len) end -Base.parse(::Type{LongSequence{A}}, seq::AbstractString) where A = LongSequence{A}(seq) \ No newline at end of file +Base.parse(::Type{LongSequence{A}}, seq::AbstractString) where A = LongSequence{A}(seq) + +guess_alphabet(s::Union{String, SubString{String}}) = guess_alphabet(codeunits(s)) +function guess_alphabet(v::AbstractVector{UInt8}) + mask = mapreduce(possible_encodings, &, v; init=0x0f) + dna = isodd(mask >> 0x00) + rna = isodd(mask >> 0x01) + unambiguous = isodd(mask >> 0x02) + aa = isodd(mask >> 0x03) + if dna & rna + error("Sequences is both valid DNA and RNA") + elseif dna + unambiguous ? DNAAlphabet{2} : DNAAlphabet{4} + elseif rna + unambiguous ? RNAAlphabet{2} : RNAAlphabet{4} + elseif aa + AminoAcidAlphabet + else + error("Sequence is not valid DNA, RNA or amino acid.") + end +end + +""" + guessparse(s::AbstractString)::BioSequence + +Parse `s` into a `BioSequence`, and tries to guess which kind of biosequence. +The precise guessing algorithm is an implementation detail and not to be relied on. +This function is meant to be used in ephemeral REPL work, not in package code. +Its precise behaviour is subject to change in minor versions. + +# Current behaviour (subject to change) +Currently, `guessparse` will error on sequences that can be either DNA or RNA sequences, +and on sequences that are neither DNA, RNA or aminoacid sequences. +It will return a `LongSequence{A}`, where `A` is determined in the following priority: +2-bit DNA/RNAAlphabet -> 4-bit DNA/RNAAlphabet -> AminoAcidAlphabet + +# Examples: +``` +julia> typeof(guessparse("AGTGCA")) +LongDNA{2} + +julia> typeof(guessparse("AGCGAWSN")) +Error: +[...] + +julia> typeof(guessparse("UGAUCSSDDC")) +LongRNA{4} + +julia> typeof(guessparse("KLEWSNYKHACQQV")) +LongAA +``` +""" +guessparse(v::Union{SubString{String}, String}) = LongSequence{guess_alphabet(v)}(v) +guessparse(s::AbstractString) = guessparse(String(s)) \ No newline at end of file