diff --git a/CHANGELOG.md b/CHANGELOG.md index 0be93ea0..a7c91b71 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,18 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). +## [3.4.0] +* Deprecate functions `n_ambiguous`, `n_gaps` and `n_certain`. Instead, use the + equivalent methods `count(f, seq)` with the appropriate function `f`. +* Deprecate method `Base.count(::Function, ::BioSequence, ::BioSequence)`, and + the other methods of `count` which are subtypes of this. +* Deprecate use of functions `matches` and `mismatches` where the input seqs + have different lengths. +* Optimise `count(==(biosymbol), biosequence)` and `count(==(biosymbol), + biosequence)` +* Optimise contruction of `LongSequence` nucleotide sequences from sequences + with a different bit-number (e.g. two-bit seqs from four-bit seqs) + ## [3.3.0] * Add functions `bioseq` and `guess_alphabet` to easily construct a biosequence of an unknown alphabet from e.g. a string. diff --git a/Project.toml b/Project.toml index 22a1e74f..ae562aeb 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "BioSequences" uuid = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59" authors = ["Sabrina Jaye Ward ", "Jakob Nissen "] -version = "3.3.0" +version = "3.4.0" [deps] BioSymbols = "3c28c6f8-a34d-59c4-9654-267d177fcfa9" diff --git a/src/bit-manipulation/bit-manipulation.jl b/src/bit-manipulation/bit-manipulation.jl index 12a24009..e40c6e1e 100644 --- a/src/bit-manipulation/bit-manipulation.jl +++ b/src/bit-manipulation/bit-manipulation.jl @@ -96,3 +96,46 @@ end y = enumerate_nibbles(b) ⊻ 0x1111111111111111 return count_0000_nibbles(x | y) end + +@inline function four_to_two_bits(x::UInt64)::UInt32 + m1 = 0x1111111111111111 + m2 = m1 << 1 + m3 = m1 | m2 + y = (x >> 3) & m1 + y |= (x >> 2) & m2 + y |= (x >> 1) & m3 + pack(y) +end + +@inline function pack(x::UInt64)::UInt32 + m1 = 0x0f0f0f0f0f0f0f0f + m2 = 0x00ff00ff00ff00ff + m3 = 0x0000ffff0000ffff + m4 = 0x00000000ffffffff + x = (x & m1) | (x & ~m1) >> 2 + x = (x & m2) | (x & ~m2) >> 4 + x = (x & m3) | (x & ~m3) >> 8 + x = (x & m4) | (x & ~m4) >> 16 + x % UInt32 +end + +@inline function two_to_four_bits(x::UInt32)::UInt64 + m = 0x1111111111111111 + y = expand(x) + z = (y & m) << 1 | (m & ~y) + m2 = y & (m << 1) + m2 = m2 | m2 >> 1 + (z & m2) << 2 | (z & ~m2) +end + +@inline function expand(x::UInt32)::UInt64 + m1 = 0x000000000000ffff + m2 = 0x000000ff000000ff + m3 = 0x000f000f000f000f + m4 = 0x0303030303030303 + y = x % UInt64 + y = (y & m1) | (y & ~m1) << 16 + y = (y & m2) | (y & ~m2) << 8 + y = (y & m3) | (y & ~m3) << 4 + (y & m4) | (y & ~m4) << 2 +end diff --git a/src/longsequences/constructors.jl b/src/longsequences/constructors.jl index 03c1b15c..f3cbde9e 100644 --- a/src/longsequences/constructors.jl +++ b/src/longsequences/constructors.jl @@ -56,7 +56,7 @@ LongSequence{A}(seq::LongSequence{A}) where {A <: Alphabet} = copy(seq) function (::Type{T})(seq::LongSequence{<:NucleicAcidAlphabet{N}}) where {N, T<:LongSequence{<:NucleicAcidAlphabet{N}}} - return T(copy(seq.data), seq.len) + return T(seq.data[1:seq_data_len(seq)], seq.len) end # Constructors from strings diff --git a/src/longsequences/seqview.jl b/src/longsequences/seqview.jl index 6f17dc31..014bcda5 100644 --- a/src/longsequences/seqview.jl +++ b/src/longsequences/seqview.jl @@ -72,6 +72,58 @@ LongSubSeq(seq::BioSequence{A}) where A = LongSubSeq{A}(seq) Base.view(seq::SeqOrView, part::AbstractUnitRange) = LongSubSeq(seq, part) +function (::Type{T})(seq::SeqOrView{<:NucleicAcidAlphabet{2}}) where + {T<:LongSequence{<:NucleicAcidAlphabet{4}}} + res = T(undef, length(seq)) + v = res.data + (it, (ch, rm)) = iter_chunks(seq) + i = 1 + for chunk in it + @inbounds v[i] = two_to_four_bits(chunk % UInt32) + @inbounds v[i + 1] = two_to_four_bits((chunk >> 32) % UInt32) + i += 2 + end + mask = UInt64(1) << (rm & 63) - 1 + ch &= mask + rm = Int(rm) + while rm > 0 + @inbounds v[i] = two_to_four_bits(ch % UInt32) + ch >>= 32 + rm -= 32 + i += 1 + end + res +end + +function (::Type{T})(seq::SeqOrView{<:NucleicAcidAlphabet{4}}) where + {T<:LongSequence{<:NucleicAcidAlphabet{2}}} + # Throw an error if the sequence contains gaps or ambiguous nucleotides. + if count(iscertain, seq) != length(seq) + throw(EncodeError(Alphabet(T), first(Iterators.filter(!iscertain, seq)))) + end + res = T(undef, length(seq)) + v = res.data + (it, (ch, rm)) = iter_chunks(seq) + i = 1 + new_chunk = zero(UInt) + shift = 0 + for chunk in it + new_chunk |= (four_to_two_bits(chunk) % UInt64) << (shift & 63) + shift ⊻= 32 + if iszero(shift) + @inbounds v[i] = new_chunk + new_chunk ⊻= new_chunk + i += 1 + end + end + if !iszero(rm) || !iszero(shift) + mask = UInt64(1) << (rm & 63) - 1 + new_chunk |= (four_to_two_bits(ch & mask) % UInt64) << (shift & 63) + @inbounds v[i] = new_chunk + end + res +end + # Conversion function LongSequence(s::LongSubSeq{A}) where A _copy_seqview(LongSequence{A}, s)