Skip to content

Commit

Permalink
Kmers.jl compatibility (#282)
Browse files Browse the repository at this point in the history
Add various small improvements to BioSequences to make it compatible with the
upcoming Kmers.jl release.
All of these changes are improved internals.

* Improve showerror for EncodeError
* Improve test in has_interface
* Add generic BioSequence methods for bits_per_symbol, firstbitindex and
  lastbitindex
* Add some more reversebits methods for different bits per symbol sizes
  • Loading branch information
jakobnissen authored Dec 22, 2024
1 parent 9a3f893 commit e1918ce
Show file tree
Hide file tree
Showing 9 changed files with 57 additions and 13 deletions.
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "BioSequences"
uuid = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"
authors = ["Sabrina Jaye Ward <[email protected]>", "Jakob Nissen <[email protected]>"]
version = "3.4.0"
version = "3.4.1"

[deps]
BioSymbols = "3c28c6f8-a34d-59c4-9654-267d177fcfa9"
Expand Down
2 changes: 1 addition & 1 deletion docs/src/construction.md
Original file line number Diff line number Diff line change
Expand Up @@ -317,7 +317,7 @@ If the input cannot be encoded by any of the built-in alphabets, an error is thr

```jldoctest
julia> bioseq("0!(CC!;#&&%")
ERROR: cannot encode 0x30 in AminoAcidAlphabet
ERROR: cannot encode 0x30 (Char '0') in AminoAcidAlphabet
[...]
```

Expand Down
10 changes: 9 additions & 1 deletion src/alphabet.jl
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,15 @@ end
EncodeError(::A, val::T) where {A,T} = EncodeError{A,T}(val)

function Base.showerror(io::IO, err::EncodeError{A}) where {A}
print(io, "cannot encode ", repr(err.val), " in ", A)
val = err.val
char_repr = if val isa Integer && val < 0x80
repr(val) * " (Char '" * Char(val) * "')"
elseif val isa Union{AbstractString, AbstractChar}
repr(val)
else
string(err.val)
end
print(io, "cannot encode " * char_repr * " in ", A)
end

"""
Expand Down
5 changes: 3 additions & 2 deletions src/biosequence/biosequence.jl
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ function has_interface(
isempty(syms) && error("Vector syms must not be empty")
first(syms) isa eltype(T) || error("Vector is of wrong element type")
seq = T((i for i in syms))
length(seq) > 0 || return false
length(seq) == length(syms) || return false
eachindex(seq) === Base.OneTo(length(seq)) || return false
E = encoded_data_eltype(T)
e = extract_encoded_element(seq, 1)
Expand All @@ -101,13 +101,14 @@ Base.nextind(::BioSequence, i::Integer) = Int(i) + 1
Base.prevind(::BioSequence, i::Integer) = Int(i) - 1
Base.size(x::BioSequence) = (length(x),)
Base.eltype(::Type{<:BioSequence{A}}) where {A <: Alphabet} = eltype(A)
Base.eltype(x::BioSequence) = eltype(typeof(x))
Alphabet(::Type{<:BioSequence{A}}) where {A <: Alphabet} = A()
Alphabet(x::BioSequence) = Alphabet(typeof(x))
Base.isempty(x::BioSequence) = iszero(length(x))
Base.empty(::Type{T}) where {T <: BioSequence} = T(eltype(T)[])
Base.empty(x::BioSequence) = empty(typeof(x))
BitsPerSymbol(x::BioSequence) = BitsPerSymbol(Alphabet(typeof(x)))
bits_per_symbol(::Type{T}) where {T <: BioSequence} = bits_per_symbol(Alphabet(T))
bits_per_symbol(x::BioSequence) = bits_per_symbol(typeof(x))
Base.hash(s::BioSequence, x::UInt) = foldl((a, b) -> hash(b, a), s, init=x)

function Base.similar(seq::BioSequence, len::Integer=length(seq))
Expand Down
3 changes: 3 additions & 0 deletions src/biosequence/indexing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@
(i % UInt) - 1 < (lastindex(seq) % UInt) ? (@inbounds seq[i], i + 1) : nothing
end

lastbitindex(x::BioSequence) = bitindex(x, lastindex(x))
firstbitindex(x::BioSequence) = bitindex(x, firstindex(x))

## Bounds checking
function Base.checkbounds(x::BioSequence, i::Integer)
firstindex(x) i lastindex(x) || throw(BoundsError(x, i))
Expand Down
26 changes: 22 additions & 4 deletions src/bit-manipulation/bit-manipulation.jl
Original file line number Diff line number Diff line change
@@ -1,16 +1,34 @@
@inline function reversebits(x::T, ::BitsPerSymbol{2}) where T <: Base.BitUnsigned
const BitUnsigned = Union{UInt8, UInt16, UInt32, UInt64, UInt128}

@inline function reversebits(x::T, ::BitsPerSymbol{2}) where T <: BitUnsigned
mask = 0x33333333333333333333333333333333 % T
x = ((x >> 2) & mask) | ((x & mask) << 2)
return reversebits(x, BitsPerSymbol{4}())
end

@inline function reversebits(x::T, ::BitsPerSymbol{4}) where T <: Base.BitUnsigned
@inline function reversebits(x::T, ::BitsPerSymbol{4}) where T <: BitUnsigned
mask = 0x0F0F0F0F0F0F0F0F0F0F0F0F0F0F0F0F % T
x = ((x >> 4) & mask) | ((x & mask) << 4)
return bswap(x)
return reversebits(x, BitsPerSymbol{8}())
end

@inline reversebits(x::T, ::BitsPerSymbol{8}) where T <: BitUnsigned = bswap(x)

@inline reversebits(x::UInt16, ::BitsPerSymbol{16}) = x
@inline function reversebits(x::T, ::BitsPerSymbol{16}) where T <: Union{UInt32, UInt64}
mask = 0x0000FFFF0000FFFF0000FFFF0000FFFF % T
x = ((x >> 16) & mask) | ((x & mask) << 16)
reversebits(x, BitsPerSymbol{32}())
end

@inline reversebits(x::UInt32, ::BitsPerSymbol{32}) = x
@inline function reversebits(x::T, ::BitsPerSymbol{32}) where T <: Union{UInt64}
mask = 0x00000000FFFFFFF00000000FFFFFFFF % T
x = ((x >> 32) & mask) | ((x & mask) << 32)
reversebits(x, BitsPerSymbol{64}())
end

reversebits(x::T, ::BitsPerSymbol{8}) where T <: Base.BitUnsigned = bswap(x)
@inline reversebits(x::UInt64, ::BitsPerSymbol{64}) = x

@inline function complement_bitpar(x::Unsigned, ::T) where {T<:NucleicAcidAlphabet{2}}
return ~x
Expand Down
2 changes: 1 addition & 1 deletion src/longsequences/constructors.jl
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ julia> bioseq("UAUGCUGUAGG")
UAUGCUGUAGG
julia> bioseq("PKMW#3>>0;kL")
ERROR: cannot encode 0x23 in AminoAcidAlphabet
ERROR: cannot encode 0x23 (Char '#') in AminoAcidAlphabet
[...]
```
"""
Expand Down
3 changes: 0 additions & 3 deletions src/longsequences/indexing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,6 @@
bitindex(N, encoded_data_eltype(typeof(x)), i)
end

firstbitindex(s::SeqOrView) = bitindex(s, firstindex(s) % UInt)
lastbitindex(s::SeqOrView) = bitindex(s, lastindex(s) % UInt)

@inline function extract_encoded_element(x::SeqOrView, i::Integer)
bi = bitindex(x, i % UInt)
extract_encoded_element(bi, x.data)
Expand Down
17 changes: 17 additions & 0 deletions test/biosequences/misc.jl
Original file line number Diff line number Diff line change
Expand Up @@ -193,4 +193,21 @@ end
@test ungap(seq) == seq
cp = copy(seq)
@test ungap!(seq) == cp
end

# Ideally, we'd not test this internal function, but instead the code that
# relies on this, but this is only called if you have custom alphabets, and
# creating these alphabets of different sizes is a hassle
@testset "bitreverse" begin
bps8 = BioSequences.BitsPerSymbol{8}()
bps16 = BioSequences.BitsPerSymbol{16}()
bps32 = BioSequences.BitsPerSymbol{32}()
bps64 = BioSequences.BitsPerSymbol{64}()
reversebits = BioSequences.reversebits
@test reversebits(0x0102, bps16) === 0x0102
@test reversebits(0x01020304, bps16) === 0x03040102
@test reversebits(0x0102030405060708, bps16) === 0x0708050603040102
@test reversebits(0x01020304, bps32) === 0x01020304
@test reversebits(0x0102030405060708, bps32) === 0x0506070801020304
@test reversebits(0x0102030405060708, bps64) === 0x0102030405060708
end

2 comments on commit e1918ce

@jakobnissen
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/121844

Tip: Release Notes

Did you know you can add release notes too? Just add markdown formatted text underneath the comment after the text
"Release notes:" and it will be added to the registry PR, and if TagBot is installed it will also be added to the
release that TagBot creates. i.e.

@JuliaRegistrator register

Release notes:

## Breaking changes

- blah

To add them here just re-invoke and the PR will be updated.

Tagging

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v3.4.1 -m "<description of version>" e1918ce3ed8fc8ac7fbc8f81dd9072e764f40644
git push origin v3.4.1

Please sign in to comment.