Skip to content

Commit

Permalink
Optimise some LongSequence constructors (#320)
Browse files Browse the repository at this point in the history
Specifically, constructors of `LongSequence{<:NucleicAcidAlphabet}` from
`LongSequence` or `LongSubSeq` of `NucleicAcidAlphabet`, where the alphabet
bit size is different.
Simple benchmarks suggests an improvement of 4-7x for long sequences.
  • Loading branch information
jakobnissen authored Oct 22, 2024
1 parent 2933ffb commit 0047647
Show file tree
Hide file tree
Showing 5 changed files with 109 additions and 2 deletions.
12 changes: 12 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,18 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).

## [3.4.0]
* Deprecate functions `n_ambiguous`, `n_gaps` and `n_certain`. Instead, use the
equivalent methods `count(f, seq)` with the appropriate function `f`.
* Deprecate method `Base.count(::Function, ::BioSequence, ::BioSequence)`, and
the other methods of `count` which are subtypes of this.
* Deprecate use of functions `matches` and `mismatches` where the input seqs
have different lengths.
* Optimise `count(==(biosymbol), biosequence)` and `count(==(biosymbol),
biosequence)`
* Optimise contruction of `LongSequence` nucleotide sequences from sequences
with a different bit-number (e.g. two-bit seqs from four-bit seqs)

## [3.3.0]
* Add functions `bioseq` and `guess_alphabet` to easily construct a biosequence
of an unknown alphabet from e.g. a string.
Expand Down
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "BioSequences"
uuid = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"
authors = ["Sabrina Jaye Ward <[email protected]>", "Jakob Nissen <[email protected]>"]
version = "3.3.0"
version = "3.4.0"

[deps]
BioSymbols = "3c28c6f8-a34d-59c4-9654-267d177fcfa9"
Expand Down
43 changes: 43 additions & 0 deletions src/bit-manipulation/bit-manipulation.jl
Original file line number Diff line number Diff line change
Expand Up @@ -96,3 +96,46 @@ end
y = enumerate_nibbles(b) 0x1111111111111111
return count_0000_nibbles(x | y)
end

@inline function four_to_two_bits(x::UInt64)::UInt32
m1 = 0x1111111111111111
m2 = m1 << 1
m3 = m1 | m2
y = (x >> 3) & m1
y |= (x >> 2) & m2
y |= (x >> 1) & m3
pack(y)
end

@inline function pack(x::UInt64)::UInt32
m1 = 0x0f0f0f0f0f0f0f0f
m2 = 0x00ff00ff00ff00ff
m3 = 0x0000ffff0000ffff
m4 = 0x00000000ffffffff
x = (x & m1) | (x & ~m1) >> 2
x = (x & m2) | (x & ~m2) >> 4
x = (x & m3) | (x & ~m3) >> 8
x = (x & m4) | (x & ~m4) >> 16
x % UInt32
end

@inline function two_to_four_bits(x::UInt32)::UInt64
m = 0x1111111111111111
y = expand(x)
z = (y & m) << 1 | (m & ~y)
m2 = y & (m << 1)
m2 = m2 | m2 >> 1
(z & m2) << 2 | (z & ~m2)
end

@inline function expand(x::UInt32)::UInt64
m1 = 0x000000000000ffff
m2 = 0x000000ff000000ff
m3 = 0x000f000f000f000f
m4 = 0x0303030303030303
y = x % UInt64
y = (y & m1) | (y & ~m1) << 16
y = (y & m2) | (y & ~m2) << 8
y = (y & m3) | (y & ~m3) << 4
(y & m4) | (y & ~m4) << 2
end
2 changes: 1 addition & 1 deletion src/longsequences/constructors.jl
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ LongSequence{A}(seq::LongSequence{A}) where {A <: Alphabet} = copy(seq)

function (::Type{T})(seq::LongSequence{<:NucleicAcidAlphabet{N}}) where
{N, T<:LongSequence{<:NucleicAcidAlphabet{N}}}
return T(copy(seq.data), seq.len)
return T(seq.data[1:seq_data_len(seq)], seq.len)
end

# Constructors from strings
Expand Down
52 changes: 52 additions & 0 deletions src/longsequences/seqview.jl
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,58 @@ LongSubSeq(seq::BioSequence{A}) where A = LongSubSeq{A}(seq)

Base.view(seq::SeqOrView, part::AbstractUnitRange) = LongSubSeq(seq, part)

function (::Type{T})(seq::SeqOrView{<:NucleicAcidAlphabet{2}}) where
{T<:LongSequence{<:NucleicAcidAlphabet{4}}}
res = T(undef, length(seq))
v = res.data
(it, (ch, rm)) = iter_chunks(seq)
i = 1
for chunk in it
@inbounds v[i] = two_to_four_bits(chunk % UInt32)
@inbounds v[i + 1] = two_to_four_bits((chunk >> 32) % UInt32)
i += 2
end
mask = UInt64(1) << (rm & 63) - 1
ch &= mask
rm = Int(rm)
while rm > 0
@inbounds v[i] = two_to_four_bits(ch % UInt32)
ch >>= 32
rm -= 32
i += 1
end
res
end

function (::Type{T})(seq::SeqOrView{<:NucleicAcidAlphabet{4}}) where
{T<:LongSequence{<:NucleicAcidAlphabet{2}}}
# Throw an error if the sequence contains gaps or ambiguous nucleotides.
if count(iscertain, seq) != length(seq)
throw(EncodeError(Alphabet(T), first(Iterators.filter(!iscertain, seq))))
end
res = T(undef, length(seq))
v = res.data
(it, (ch, rm)) = iter_chunks(seq)
i = 1
new_chunk = zero(UInt)
shift = 0
for chunk in it
new_chunk |= (four_to_two_bits(chunk) % UInt64) << (shift & 63)
shift ⊻= 32
if iszero(shift)
@inbounds v[i] = new_chunk
new_chunk ⊻= new_chunk
i += 1
end
end
if !iszero(rm) || !iszero(shift)
mask = UInt64(1) << (rm & 63) - 1
new_chunk |= (four_to_two_bits(ch & mask) % UInt64) << (shift & 63)
@inbounds v[i] = new_chunk
end
res
end

# Conversion
function LongSequence(s::LongSubSeq{A}) where A
_copy_seqview(LongSequence{A}, s)
Expand Down

2 comments on commit 0047647

@jakobnissen
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/117837

Tip: Release Notes

Did you know you can add release notes too? Just add markdown formatted text underneath the comment after the text
"Release notes:" and it will be added to the registry PR, and if TagBot is installed it will also be added to the
release that TagBot creates. i.e.

@JuliaRegistrator register

Release notes:

## Breaking changes

- blah

To add them here just re-invoke and the PR will be updated.

Tagging

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v3.4.0 -m "<description of version>" 00476478478bbfc64b09bf0305997820107eefcd
git push origin v3.4.0

Please sign in to comment.