Add guessparse function

This function is a quick-and-dirty parser function from `AbstractString` to `LongSequence`, with autodetection of the alphabet. It's meant to be used in ephemeral REPL work, and very clearly documented to be unstable and subject to change. See BioJulia#268
jakobnissen · Feb 27, 2023 · fdbb494 · fdbb494
1 parent 4a31474
commit fdbb494
Show file tree

Hide file tree

Showing 3 changed files with 89 additions and 1 deletion.
diff --git a/src/BioSequences.jl b/src/BioSequences.jl
@@ -133,6 +133,7 @@ export
     ###
     ### LongSequence
     ###
+    guessparse,
 
     # Type & aliases
     LongSequence,

diff --git a/src/alphabet.jl b/src/alphabet.jl
@@ -308,3 +308,37 @@ for (anum, atype) in enumerate((DNAAlphabet{4}, DNAAlphabet{2}, RNAAlphabet{4},
         ascii_encode(::$(atype), x::UInt8) = @inbounds $(tablename)[x + 1]
     end
 end
+
+const GUESS_LUT = let
+    v = zeros(UInt8, 128)
+    for (off, A) in [
+        (0, DNAAlphabet{4}()),
+        (1, RNAAlphabet{4}()),
+        (2, DNAAlphabet{2}()), (2, RNAAlphabet{2}()),
+        (3, AminoAcidAlphabet())
+    ]
+        for i in A
+            for b in (UInt8(lowercase(Char(i))), UInt8(uppercase(Char(i))))
+                v[b + 0x01] |= 0x01 << off
+            end
+        end
+    end
+    NTuple{128, UInt8}(v)
+end
+
+"""
+    possible_encodings(b::UInt8)::UInt8
+
+Returns a `UInt8` with any of the 4 lower bits set:
+* Bit 0: Valid `RNA`
+* Bit 1: Valid `RNA`
+* Bit 2: Valid 2-bit nucleotide
+* Bit 3: Valid `AminoAcidAlphabet`
+"""
+function possible_encodings(b::UInt8)
+    mask = @inbounds GUESS_LUT[(b & 0x7f) + 0x01]
+    # This is just a way to set the result to 0x00 if b > 0x7f
+    # which compiles efficiently
+    mask >> (ifelse(b > 0x7f, 0xff, 0x00) & 0x07)
+end
+
diff --git a/src/longsequences/constructors.jl b/src/longsequences/constructors.jl
@@ -85,4 +85,57 @@ function LongSequence{A}(
     return copyto!(seq, 1, src, first(part), len)
 end
 
-Base.parse(::Type{LongSequence{A}}, seq::AbstractString) where A = LongSequence{A}(seq)
+Base.parse(::Type{LongSequence{A}}, seq::AbstractString) where A = LongSequence{A}(seq)
+
+guess_alphabet(s::Union{String, SubString{String}}) = guess_alphabet(codeunits(s))
+function guess_alphabet(v::AbstractVector{UInt8})
+    mask = mapreduce(possible_encodings, &, v; init=0x0f)
+    dna = isodd(mask >> 0x00)
+    rna = isodd(mask >> 0x01)
+    unambiguous = isodd(mask >> 0x02)
+    aa = isodd(mask >> 0x03)
+    if dna & rna
+        error("Sequences is both valid DNA and RNA")
+    elseif dna
+        unambiguous ? DNAAlphabet{2} : DNAAlphabet{4}
+    elseif rna
+        unambiguous ? RNAAlphabet{2} : RNAAlphabet{4}
+    elseif aa
+        AminoAcidAlphabet
+    else
+        error("Sequence is not valid DNA, RNA or amino acid.")
+    end
+end
+
+"""
+    guessparse(s::AbstractString)::BioSequence
+
+Parse `s` into a `BioSequence`, and tries to guess which kind of biosequence.
+The precise guessing algorithm is an implementation detail and not to be relied on.
+This function is meant to be used in ephemeral REPL work, not in package code.
+Its precise behaviour is subject to change in minor versions.
+
+# Current behaviour (subject to change)
+Currently, `guessparse` will error on sequences that can be either DNA or RNA sequences,
+and on sequences that are neither DNA, RNA or aminoacid sequences.
+It will return a `LongSequence{A}`, where `A` is determined in the following priority:
+2-bit DNA/RNAAlphabet -> 4-bit DNA/RNAAlphabet -> AminoAcidAlphabet
+
+# Examples:
+```
+julia> typeof(guessparse("AGTGCA"))
+LongDNA{2}
+
+julia> typeof(guessparse("AGCGAWSN"))
+Error:
+[...]
+
+julia> typeof(guessparse("UGAUCSSDDC"))
+LongRNA{4}
+
+julia> typeof(guessparse("KLEWSNYKHACQQV"))
+LongAA
+```
+"""
+guessparse(v::Union{SubString{String}, String}) = LongSequence{guess_alphabet(v)}(v)
+guessparse(s::AbstractString) = guessparse(String(s))