Optimise some LongSequence constructors

Specifically, constructors of `LongSequence{<:NucleicAcidAlphabet}` from `LongSequence` or `LongSubSeq` of `NucleicAcidAlphabet`, where the alphabet bit size is different. Simple benchmarks suggests an improvement of 4-7x for long sequences.
BioJulia · Oct 22, 2024 · a5ec1b3 · a5ec1b3
1 parent 2933ffb
commit a5ec1b3
Show file tree

Hide file tree

Showing 5 changed files with 109 additions and 2 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,18 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
 
+## [3.4.0]
+* Deprecate functions `n_ambiguous`, `n_gaps` and `n_certain`. Instead, use the
+  equivalent methods `count(f, seq)` with the appropriate function `f`.
+* Deprecate method `Base.count(::Function, ::BioSequence, ::BioSequence)`, and
+  the other methods of `count` which are subtypes of this.
+* Deprecate use of functions `matches` and `mismatches` where the input seqs
+  have different lengths.
+* Optimise `count(==(biosymbol), biosequence)` and `count(==(biosymbol),
+  biosequence)`
+* Optimise contruction of `LongSequence` nucleotide sequences from sequences
+  with a different bit-number (e.g. two-bit seqs from four-bit seqs)
+
 ## [3.3.0]
 * Add functions `bioseq` and `guess_alphabet` to easily construct a biosequence
   of an unknown alphabet from e.g. a string.

diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "BioSequences"
 uuid = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"
 authors = ["Sabrina Jaye Ward <[email protected]>", "Jakob Nissen <[email protected]>"]
-version = "3.3.0"
+version = "3.4.0"
 
 [deps]
 BioSymbols = "3c28c6f8-a34d-59c4-9654-267d177fcfa9"

diff --git a/src/bit-manipulation/bit-manipulation.jl b/src/bit-manipulation/bit-manipulation.jl
@@ -96,3 +96,46 @@ end
     y = enumerate_nibbles(b) ⊻ 0x1111111111111111
     return count_0000_nibbles(x | y)
 end
+
+@inline function four_to_two_bits(x::UInt64)::UInt32
+    m1 = 0x1111111111111111
+    m2 = m1 << 1
+    m3 = m1 | m2
+    y  = (x >> 3) & m1
+    y |= (x >> 2) & m2
+    y |= (x >> 1) & m3
+    pack(y)
+end
+
+@inline function pack(x::UInt64)::UInt32
+    m1 = 0x0f0f0f0f0f0f0f0f
+    m2 = 0x00ff00ff00ff00ff
+    m3 = 0x0000ffff0000ffff
+    m4 = 0x00000000ffffffff
+    x = (x & m1) | (x & ~m1) >> 2
+    x = (x & m2) | (x & ~m2) >> 4
+    x = (x & m3) | (x & ~m3) >> 8
+    x = (x & m4) | (x & ~m4) >> 16
+    x % UInt32
+end
+
+@inline function two_to_four_bits(x::UInt32)::UInt64
+    m = 0x1111111111111111
+    y = expand(x)
+    z = (y & m) << 1 | (m & ~y)
+    m2 = y & (m << 1)
+    m2 = m2 | m2 >> 1
+    (z & m2) << 2 | (z & ~m2)
+end
+
+@inline function expand(x::UInt32)::UInt64
+    m1 = 0x000000000000ffff
+    m2 = 0x000000ff000000ff
+    m3 = 0x000f000f000f000f
+    m4 = 0x0303030303030303
+    y = x % UInt64
+    y = (y & m1) | (y & ~m1) << 16
+    y = (y & m2) | (y & ~m2) << 8
+    y = (y & m3) | (y & ~m3) << 4
+        (y & m4) | (y & ~m4) << 2
+end
diff --git a/src/longsequences/constructors.jl b/src/longsequences/constructors.jl
@@ -56,7 +56,7 @@ LongSequence{A}(seq::LongSequence{A}) where {A <: Alphabet} = copy(seq)
 
 function (::Type{T})(seq::LongSequence{<:NucleicAcidAlphabet{N}}) where
          {N, T<:LongSequence{<:NucleicAcidAlphabet{N}}}
-    return T(copy(seq.data), seq.len)
+    return T(seq.data[1:seq_data_len(seq)], seq.len)
 end
 
 # Constructors from strings

diff --git a/src/longsequences/seqview.jl b/src/longsequences/seqview.jl
@@ -72,6 +72,58 @@ LongSubSeq(seq::BioSequence{A}) where A = LongSubSeq{A}(seq)
 
 Base.view(seq::SeqOrView, part::AbstractUnitRange) = LongSubSeq(seq, part)
 
+function (::Type{T})(seq::SeqOrView{<:NucleicAcidAlphabet{2}}) where
+         {T<:LongSequence{<:NucleicAcidAlphabet{4}}}
+    res = T(undef, length(seq))
+    v = res.data
+    (it, (ch, rm)) = iter_chunks(seq)
+    i = 1
+    for chunk in it
+        @inbounds v[i] = two_to_four_bits(chunk % UInt32)
+        @inbounds v[i + 1] = two_to_four_bits((chunk >> 32) % UInt32)
+        i += 2
+    end
+    mask = UInt64(1) << (rm & 63) - 1
+    ch &= mask
+    rm = Int(rm)
+    while rm > 0
+        @inbounds v[i] = two_to_four_bits(ch % UInt32)
+        ch >>= 32
+        rm -= 32
+        i += 1
+    end
+    res
+end
+
+function (::Type{T})(seq::SeqOrView{<:NucleicAcidAlphabet{4}}) where
+         {T<:LongSequence{<:NucleicAcidAlphabet{2}}}
+    # Throw an error if the sequence contains gaps or ambiguous nucleotides.
+    if count(iscertain, seq) != length(seq)
+        throw(EncodeError(Alphabet(T), first(Iterators.filter(!iscertain, seq))))
+    end
+    res = T(undef, length(seq))
+    v = res.data
+    (it, (ch, rm)) = iter_chunks(seq)
+    i = 1
+    new_chunk = zero(UInt)
+    shift = 0
+    for chunk in it
+        new_chunk |= (four_to_two_bits(chunk) % UInt64) << (shift & 63)
+        shift ⊻= 32
+        if iszero(shift)
+            @inbounds v[i] = new_chunk
+            new_chunk ⊻= new_chunk
+            i += 1
+        end
+    end
+    if !iszero(rm) || !iszero(shift)
+        mask = UInt64(1) << (rm & 63) - 1
+        new_chunk |= (four_to_two_bits(ch & mask) % UInt64) << (shift & 63)
+        @inbounds v[i] = new_chunk
+    end
+    res
+end
+
 # Conversion
 function LongSequence(s::LongSubSeq{A}) where A
 	_copy_seqview(LongSequence{A}, s)