Various internal code cleanup (#323)

* Remove some unused, internal functions * Use `trailing_zeros` instead of LUT for 4->2 bit transcoding * Remove validity check when encoding, since invalid biosymbols should never exist * Remove some small source code files, and move their content to other files * Optimise a commonly used bitindex method * Optimise equality for SeqOrView of identical alphabets
BioJulia · Oct 24, 2024 · 8fa5616 · 8fa5616
1 parent 0047647
commit 8fa5616
Show file tree

Hide file tree

Showing 12 changed files with 61 additions and 125 deletions.
diff --git a/src/alphabet.jl b/src/alphabet.jl
@@ -171,24 +171,17 @@ BitsPerSymbol(::A) where A <: NucleicAcidAlphabet{4} = BitsPerSymbol{4}()
 
 
 ## Encoding and decoding DNA and RNA alphabet symbols
-
-# A nucleotide with bitvalue B has kmer-bitvalue kmerbits[B+1].
-# ambiguous nucleotides have no kmervalue, here set to 0xff
-const twobitnucs = (0xff, 0x00, 0x01, 0xff,
-                    0x02, 0xff, 0xff, 0xff,
-                    0x03, 0xff, 0xff, 0xff,
-                    0xff, 0xff, 0xff, 0xff)
-
 for A in (DNAAlphabet, RNAAlphabet)
     T = eltype(A)
     @eval begin
 
 	    # 2-bit encoding
         @inline function encode(::$(A){2}, nt::$(T))
-            if count_ones(nt) != 1 || !isvalid(nt)
+            u = reinterpret(UInt8, nt)
+            if count_ones(u) != 1
                 throw(EncodeError($(A){2}(), nt))
             end
-            return convert(UInt, @inbounds twobitnucs[reinterpret(UInt8, nt) + 0x01])
+            trailing_zeros(u) % UInt
         end
 
         @inline function decode(::$(A){2}, x::UInt)
@@ -199,9 +192,6 @@ for A in (DNAAlphabet, RNAAlphabet)
 
         # 4-bit encoding
         @inline function encode(::$(A){4}, nt::$(T))
-            if !isvalid(nt)
-                throw(EncodeError($(A){4}(), nt))
-            end
             return convert(UInt, reinterpret(UInt8, nt))
         end
 

diff --git a/src/biosequence/biosequence.jl b/src/biosequence/biosequence.jl
@@ -41,6 +41,20 @@ For compatibility with existing `Alphabet`s, the encoded data eltype must be `UI
 """
 abstract type BioSequence{A<:Alphabet} end
 
+function (::Type{S})(seq::BioSequence) where {S <: AbstractString}
+    _string(S, seq, codetype(Alphabet(seq)))
+end
+
+Base.LazyString(seq::BioSequence) = LazyString(string(seq))
+
+function _string(::Type{S}, seq::BioSequence, ::AlphabetCode) where {S<:AbstractString}
+    return S([Char(x) for x in seq])
+end
+
+function _string(::Type{String}, seq::BioSequence, ::AsciiAlphabet)
+    String([stringbyte(s) for s in seq])
+end
+
 """
     has_interface(::Type{BioSequence}, ::T, syms::Vector, mutable::Bool, compat::Bool=true)
 
@@ -230,7 +244,6 @@ const AASeq = BioSequence{AminoAcidAlphabet}
 
 # The generic functions for any BioSequence...
 include("indexing.jl")
-include("conversion.jl")
 include("predicates.jl")
 include("find.jl")
 include("printing.jl")

diff --git a/src/biosequence/conversion.jl b/src/biosequence/conversion.jl
diff --git a/src/bit-manipulation/bit-manipulation.jl b/src/bit-manipulation/bit-manipulation.jl
@@ -38,27 +38,6 @@ end
     return count_ones((c | g) & ~(a | t))
 end
 
-@inline a_bitcount(x::Unsigned, ::NucleicAcidAlphabet{2}) = count_00_bitpairs(x)
-@inline c_bitcount(x::Unsigned, ::NucleicAcidAlphabet{2}) = count_01_bitpairs(x)
-@inline g_bitcount(x::Unsigned, ::NucleicAcidAlphabet{2}) = count_10_bitpairs(x)
-@inline t_bitcount(x::Unsigned, ::NucleicAcidAlphabet{2}) = count_11_bitpairs(x)
-
-@inline function mismatch_bitcount(a::UInt64, b::UInt64, ::T) where {T<:NucleicAcidAlphabet{4}}
-    return count_nonzero_nibbles(a ⊻ b)
-end
-
-@inline function mismatch_bitcount(a::UInt64, b::UInt64, ::T) where {T<:NucleicAcidAlphabet{2}}
-    return count_nonzero_bitpairs(a ⊻ b)
-end
-
-@inline function match_bitcount(a::UInt64, b::UInt64, ::T) where {T<:NucleicAcidAlphabet{4}}
-    return count_0000_nibbles(a ⊻ b)
-end
-
-@inline function match_bitcount(a::UInt64, b::UInt64, ::T) where {T<:NucleicAcidAlphabet{2}}
-    return count_00_bitpairs(a ⊻ b)
-end
-
 @inline function ambiguous_bitcount(x::UInt64, ::T) where {T<:NucleicAcidAlphabet{4}}
     return count_nonzero_nibbles(enumerate_nibbles(x) & 0xEEEEEEEEEEEEEEEE)
 end
@@ -67,19 +46,6 @@ end
     return count_compared_bytes(i -> (i > 0x15) & (i < 0x1a), x)
 end
 
-@inline function ambiguous_bitcount(a::UInt64, b::UInt64, ::T) where {T<:NucleicAcidAlphabet{4}}
-    return count_nonzero_nibbles((enumerate_nibbles(a) | enumerate_nibbles(b)) & 0xEEEEEEEEEEEEEEEE)
-end
-
-@inline function gap_bitcount(x::UInt64, ::T) where {T<:NucleicAcidAlphabet{4}}
-    return count_0000_nibbles(x)
-end
-
-@inline function gap_bitcount(a::UInt64, b::UInt64, ::T) where {T<:NucleicAcidAlphabet{4}}
-    # Count the gaps in a, count the gaps in b, subtract the number of shared gaps.
-    return count_0000_nibbles(a) + count_0000_nibbles(b) - count_0000_nibbles(a | b)
-end
-
 @inline function count_compared_bytes(f::F, x::UInt64) where F
     z = @inline reinterpret(NTuple{8, VecElement{UInt8}}, x)
     @inline sum(map(i -> f(i.value), z))
@@ -91,12 +57,6 @@ end
     return count_0000_nibbles(x)
 end
 
-@inline function certain_bitcount(a::UInt64, b::UInt64, ::T) where {T<:NucleicAcidAlphabet{4}}
-    x = enumerate_nibbles(a) ⊻ 0x1111111111111111
-    y = enumerate_nibbles(b) ⊻ 0x1111111111111111
-    return count_0000_nibbles(x | y)
-end
-
 @inline function four_to_two_bits(x::UInt64)::UInt32
     m1 = 0x1111111111111111
     m2 = m1 << 1

diff --git a/src/bit-manipulation/bitindex.jl b/src/bit-manipulation/bitindex.jl
@@ -32,8 +32,10 @@ end
 BitsPerSymbol(::BitIndex{N, W}) where {N,W} = BitsPerSymbol{N}()
 bits_per_symbol(::BitIndex{N, W}) where {N,W} = N
 
-@inline function bitindex(::BitsPerSymbol{N}, ::Type{W}, i) where {N,W}
-    return BitIndex{N, W}((i - 1) << trailing_zeros(N))
+bitindex(B::BitsPerSymbol, ::Type{W}, i::Integer) where W = bitindex(B, W, Int(i)::Int)
+
+@inline function bitindex(::BitsPerSymbol{N}, ::Type{W}, i::Union{Int, UInt}) where {N,W}
+    return BitIndex{N, W}(((i - 1) % UInt) << trailing_zeros(N))
 end
 
 bitwidth(::Type{W}) where W = 8*sizeof(W)

diff --git a/src/geneticcode.jl b/src/geneticcode.jl
@@ -9,13 +9,10 @@
 
 const XNA = Union{DNA, RNA}
 function unambiguous_codon(a::XNA, b::XNA, c::XNA)
-    @inbounds begin
-    bits = twobitnucs[reinterpret(UInt8, a) + 0x01] << 4 |
-    twobitnucs[reinterpret(UInt8, b) + 0x01] << 2 |
-    twobitnucs[reinterpret(UInt8, c) + 0x01]
-    end
-    #reinterpret(RNACodon, bits % UInt64)
-    return bits % UInt64
+    bits = (trailing_zeros(reinterpret(UInt8, a)) << 4) |
+        (trailing_zeros(reinterpret(UInt8, b)) << 2) |
+        (trailing_zeros(reinterpret(UInt8, c)) << 0)
+    bits % UInt64
 end
 
 # A genetic code is a table mapping RNA 3-mers (i.e. RNAKmer{3}) to AminoAcids.

diff --git a/src/longsequences/constructors.jl b/src/longsequences/constructors.jl
@@ -7,6 +7,27 @@
 ### This file is a part of BioJulia.
 ### License is MIT: https://github.com/BioJulia/BioSequences.jl/blob/master/LICENSE.md
 
+###
+### Promotion
+###
+for alph in (DNAAlphabet, RNAAlphabet)
+    @eval function Base.promote_rule(::Type{LongSequence{A}}, ::Type{LongSequence{B}}) where {A<:$alph,B<:$alph}
+        return LongSequence{promote_rule(A, B)}
+    end
+end
+
+###
+### Conversion
+###
+Base.convert(::Type{T}, seq::T) where {T <: LongSequence} = seq
+Base.convert(::Type{T}, seq::T) where {T <: LongSequence{<:NucleicAcidAlphabet}} = seq
+
+function Base.convert(::Type{T}, seq::LongSequence{<:NucleicAcidAlphabet}) where
+         {T<:LongSequence{<:NucleicAcidAlphabet}}
+    return T(seq)
+end
+
+
 @inline seq_data_len(s::LongSequence{A}) where A = seq_data_len(A, length(s))
 
 @inline function seq_data_len(::Type{A}, len::Integer) where A <: Alphabet

diff --git a/src/longsequences/conversion.jl b/src/longsequences/conversion.jl
diff --git a/src/longsequences/longsequence.jl b/src/longsequences/longsequence.jl
@@ -116,7 +116,6 @@ include("seqview.jl")
 include("chunk_iterator.jl")
 include("indexing.jl")
 include("constructors.jl")
-include("conversion.jl")
 include("copying.jl")
 include("stringliterals.jl")
 include("transformations.jl")

diff --git a/src/longsequences/operators.jl b/src/longsequences/operators.jl
@@ -201,15 +201,20 @@ function Base.:(==)(seq1::SeqOrView{A}, seq2::SeqOrView{A}) where {A <: Alphabet
     length(seq1) == length(seq2) || return false
 
     # If they share the same data
+    isempty(seq1) && return true
     if seq1.data === seq2.data && firstbitindex(seq1) == firstbitindex(seq2)
         return true
     end
 
-    # Fallback
-    for (i, j) in zip(seq1, seq2)
+    # Check all filled UInts
+    (it, (ch1, ch2, rem)) = iter_chunks(seq1, seq2)
+    for (i, j) in it
         i == j || return false
     end
-    return true
+
+    # Check last coding UInt (or compare two zeros, if none)
+    mask = UInt64(1) << (rem & 63) - 1
+    return (ch1 & mask) == (ch2 & mask)
 end
 
 function Base.:(==)(seq1::LongSequence{A}, seq2::LongSequence{A}) where {A <: Alphabet}

diff --git a/src/longsequences/transformations.jl b/src/longsequences/transformations.jl
@@ -3,10 +3,14 @@
 ###
 
 """
-    resize!(seq, size, [force::Bool])
+    resize!(seq, size, [force::Bool=false])
 
 Resize a biological sequence `seq`, to a given `size`. Does not resize the underlying data
 array unless the new size does not fit. If `force`, always resize underlying data array.
+
+Note that resizing to a larger size, and then loading from uninitialized positions
+is not allowed and may cause undefined behaviour. 
+Make sure to always fill any uninitialized biosymbols after resizing.
 """
 function Base.resize!(seq::LongSequence{A}, size::Integer, force::Bool=false) where {A}
     if size < 0
@@ -67,9 +71,8 @@ end
 # all chunks by up to 63 bits.
 # This is written so it SIMD parallelizes - careful with changes
 @inline function zero_offset!(seq::LongSequence{A}) where A <: Alphabet
-    isempty(seq) && return seq
     offs = (64 - offset(bitindex(seq, length(seq)) + bits_per_symbol(A()))) % UInt
-    zero_offset!(seq, offs) 
+    zero_offset!(seq, offs)
 end
 
 @inline function zero_offset!(seq::LongSequence{A}, offs::UInt) where A <: Alphabet
@@ -165,4 +168,4 @@ function Random.shuffle!(seq::LongSequence)
         seq[i], seq[j] = seq[j], seq[i]
     end
     return seq
-end
+end
diff --git a/test/alphabet.jl b/test/alphabet.jl
@@ -125,7 +125,6 @@ end
         for nt in BioSymbols.alphabet(DNA)
             @test encode(DNAAlphabet{4}(), nt) === UInt(reinterpret(UInt8, nt))
         end
-        @test_throws EncodeError encode(DNAAlphabet{4}(), reinterpret(DNA, 0b10000))
     end
 
     @testset "RNA" begin
@@ -142,7 +141,6 @@ end
         for nt in BioSymbols.alphabet(RNA)
             @test encode(RNAAlphabet{4}(), nt) === UInt(reinterpret(UInt8, nt))
         end
-        @test_throws EncodeError encode(RNAAlphabet{4}(), reinterpret(RNA, 0b10000))
     end
 
     @testset "AminoAcid" begin