Skip to content

Commit

Permalink
Optimise some LongSequence constructors
Browse files Browse the repository at this point in the history
Specifically, constructors of `LongSequence{<:NucleicAcidAlphabet}` from
`LongSequence` or `LongSubSeq` of `NucleicAcidAlphabet`, where the alphabet
bit size is different.
Simple benchmarks suggests an improvement of 4-7x for long sequences.
  • Loading branch information
jakobnissen committed Oct 22, 2024
1 parent 2933ffb commit a5ec1b3
Show file tree
Hide file tree
Showing 5 changed files with 109 additions and 2 deletions.
12 changes: 12 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,18 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).

## [3.4.0]
* Deprecate functions `n_ambiguous`, `n_gaps` and `n_certain`. Instead, use the
equivalent methods `count(f, seq)` with the appropriate function `f`.
* Deprecate method `Base.count(::Function, ::BioSequence, ::BioSequence)`, and
the other methods of `count` which are subtypes of this.
* Deprecate use of functions `matches` and `mismatches` where the input seqs
have different lengths.
* Optimise `count(==(biosymbol), biosequence)` and `count(==(biosymbol),
biosequence)`
* Optimise contruction of `LongSequence` nucleotide sequences from sequences
with a different bit-number (e.g. two-bit seqs from four-bit seqs)

## [3.3.0]
* Add functions `bioseq` and `guess_alphabet` to easily construct a biosequence
of an unknown alphabet from e.g. a string.
Expand Down
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "BioSequences"
uuid = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"
authors = ["Sabrina Jaye Ward <[email protected]>", "Jakob Nissen <[email protected]>"]
version = "3.3.0"
version = "3.4.0"

[deps]
BioSymbols = "3c28c6f8-a34d-59c4-9654-267d177fcfa9"
Expand Down
43 changes: 43 additions & 0 deletions src/bit-manipulation/bit-manipulation.jl
Original file line number Diff line number Diff line change
Expand Up @@ -96,3 +96,46 @@ end
y = enumerate_nibbles(b) 0x1111111111111111
return count_0000_nibbles(x | y)
end

@inline function four_to_two_bits(x::UInt64)::UInt32
m1 = 0x1111111111111111
m2 = m1 << 1
m3 = m1 | m2
y = (x >> 3) & m1
y |= (x >> 2) & m2
y |= (x >> 1) & m3
pack(y)
end

@inline function pack(x::UInt64)::UInt32
m1 = 0x0f0f0f0f0f0f0f0f
m2 = 0x00ff00ff00ff00ff
m3 = 0x0000ffff0000ffff
m4 = 0x00000000ffffffff
x = (x & m1) | (x & ~m1) >> 2
x = (x & m2) | (x & ~m2) >> 4
x = (x & m3) | (x & ~m3) >> 8
x = (x & m4) | (x & ~m4) >> 16
x % UInt32
end

@inline function two_to_four_bits(x::UInt32)::UInt64
m = 0x1111111111111111
y = expand(x)
z = (y & m) << 1 | (m & ~y)
m2 = y & (m << 1)
m2 = m2 | m2 >> 1
(z & m2) << 2 | (z & ~m2)
end

@inline function expand(x::UInt32)::UInt64
m1 = 0x000000000000ffff
m2 = 0x000000ff000000ff
m3 = 0x000f000f000f000f
m4 = 0x0303030303030303
y = x % UInt64
y = (y & m1) | (y & ~m1) << 16
y = (y & m2) | (y & ~m2) << 8
y = (y & m3) | (y & ~m3) << 4
(y & m4) | (y & ~m4) << 2
end
2 changes: 1 addition & 1 deletion src/longsequences/constructors.jl
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ LongSequence{A}(seq::LongSequence{A}) where {A <: Alphabet} = copy(seq)

function (::Type{T})(seq::LongSequence{<:NucleicAcidAlphabet{N}}) where
{N, T<:LongSequence{<:NucleicAcidAlphabet{N}}}
return T(copy(seq.data), seq.len)
return T(seq.data[1:seq_data_len(seq)], seq.len)
end

# Constructors from strings
Expand Down
52 changes: 52 additions & 0 deletions src/longsequences/seqview.jl
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,58 @@ LongSubSeq(seq::BioSequence{A}) where A = LongSubSeq{A}(seq)

Base.view(seq::SeqOrView, part::AbstractUnitRange) = LongSubSeq(seq, part)

function (::Type{T})(seq::SeqOrView{<:NucleicAcidAlphabet{2}}) where
{T<:LongSequence{<:NucleicAcidAlphabet{4}}}
res = T(undef, length(seq))
v = res.data
(it, (ch, rm)) = iter_chunks(seq)
i = 1
for chunk in it
@inbounds v[i] = two_to_four_bits(chunk % UInt32)
@inbounds v[i + 1] = two_to_four_bits((chunk >> 32) % UInt32)
i += 2
end
mask = UInt64(1) << (rm & 63) - 1
ch &= mask
rm = Int(rm)
while rm > 0
@inbounds v[i] = two_to_four_bits(ch % UInt32)
ch >>= 32
rm -= 32
i += 1
end
res

Check warning on line 95 in src/longsequences/seqview.jl

View check run for this annotation

Codecov / codecov/patch

src/longsequences/seqview.jl#L95

Added line #L95 was not covered by tests
end

function (::Type{T})(seq::SeqOrView{<:NucleicAcidAlphabet{4}}) where
{T<:LongSequence{<:NucleicAcidAlphabet{2}}}
# Throw an error if the sequence contains gaps or ambiguous nucleotides.
if count(iscertain, seq) != length(seq)
throw(EncodeError(Alphabet(T), first(Iterators.filter(!iscertain, seq))))
end
res = T(undef, length(seq))
v = res.data
(it, (ch, rm)) = iter_chunks(seq)
i = 1
new_chunk = zero(UInt)
shift = 0
for chunk in it
new_chunk |= (four_to_two_bits(chunk) % UInt64) << (shift & 63)
shift ⊻= 32
if iszero(shift)
@inbounds v[i] = new_chunk
new_chunk ⊻= new_chunk
i += 1
end
end
if !iszero(rm) || !iszero(shift)
mask = UInt64(1) << (rm & 63) - 1
new_chunk |= (four_to_two_bits(ch & mask) % UInt64) << (shift & 63)
@inbounds v[i] = new_chunk
end
res

Check warning on line 124 in src/longsequences/seqview.jl

View check run for this annotation

Codecov / codecov/patch

src/longsequences/seqview.jl#L124

Added line #L124 was not covered by tests
end

# Conversion
function LongSequence(s::LongSubSeq{A}) where A
_copy_seqview(LongSequence{A}, s)
Expand Down

0 comments on commit a5ec1b3

Please sign in to comment.