Skip to content

Commit

Permalink
Kmers.jl compatibility (BioJulia#282)
Browse files Browse the repository at this point in the history
Add various small improvements to BioSequences to make it compatible with the
upcoming Kmers.jl release.
All of these changes are improved internals.

* Improve showerror for EncodeError
* Improve test in has_interface
* Add generic BioSequence methods for bits_per_symbol, firstbitindex and
  lastbitindex
* Add some more reversebits methods for different bits per symbol sizes
  • Loading branch information
jakobnissen authored Dec 22, 2024
1 parent 9a3f893 commit e1918ce
Show file tree
Hide file tree
Showing 9 changed files with 57 additions and 13 deletions.
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "BioSequences"
uuid = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"
authors = ["Sabrina Jaye Ward <[email protected]>", "Jakob Nissen <[email protected]>"]
version = "3.4.0"
version = "3.4.1"

[deps]
BioSymbols = "3c28c6f8-a34d-59c4-9654-267d177fcfa9"
Expand Down
2 changes: 1 addition & 1 deletion docs/src/construction.md
Original file line number Diff line number Diff line change
Expand Up @@ -317,7 +317,7 @@ If the input cannot be encoded by any of the built-in alphabets, an error is thr

```jldoctest
julia> bioseq("0!(CC!;#&&%")
ERROR: cannot encode 0x30 in AminoAcidAlphabet
ERROR: cannot encode 0x30 (Char '0') in AminoAcidAlphabet
[...]
```

Expand Down
10 changes: 9 additions & 1 deletion src/alphabet.jl
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,15 @@ end
EncodeError(::A, val::T) where {A,T} = EncodeError{A,T}(val)

function Base.showerror(io::IO, err::EncodeError{A}) where {A}
print(io, "cannot encode ", repr(err.val), " in ", A)
val = err.val
char_repr = if val isa Integer && val < 0x80
repr(val) * " (Char '" * Char(val) * "')"
elseif val isa Union{AbstractString, AbstractChar}
repr(val)
else
string(err.val)
end
print(io, "cannot encode " * char_repr * " in ", A)
end

"""
Expand Down
5 changes: 3 additions & 2 deletions src/biosequence/biosequence.jl
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ function has_interface(
isempty(syms) && error("Vector syms must not be empty")
first(syms) isa eltype(T) || error("Vector is of wrong element type")
seq = T((i for i in syms))
length(seq) > 0 || return false
length(seq) == length(syms) || return false
eachindex(seq) === Base.OneTo(length(seq)) || return false
E = encoded_data_eltype(T)
e = extract_encoded_element(seq, 1)
Expand All @@ -101,13 +101,14 @@ Base.nextind(::BioSequence, i::Integer) = Int(i) + 1
Base.prevind(::BioSequence, i::Integer) = Int(i) - 1
Base.size(x::BioSequence) = (length(x),)
Base.eltype(::Type{<:BioSequence{A}}) where {A <: Alphabet} = eltype(A)
Base.eltype(x::BioSequence) = eltype(typeof(x))
Alphabet(::Type{<:BioSequence{A}}) where {A <: Alphabet} = A()
Alphabet(x::BioSequence) = Alphabet(typeof(x))
Base.isempty(x::BioSequence) = iszero(length(x))
Base.empty(::Type{T}) where {T <: BioSequence} = T(eltype(T)[])
Base.empty(x::BioSequence) = empty(typeof(x))
BitsPerSymbol(x::BioSequence) = BitsPerSymbol(Alphabet(typeof(x)))
bits_per_symbol(::Type{T}) where {T <: BioSequence} = bits_per_symbol(Alphabet(T))
bits_per_symbol(x::BioSequence) = bits_per_symbol(typeof(x))
Base.hash(s::BioSequence, x::UInt) = foldl((a, b) -> hash(b, a), s, init=x)

function Base.similar(seq::BioSequence, len::Integer=length(seq))
Expand Down
3 changes: 3 additions & 0 deletions src/biosequence/indexing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@
(i % UInt) - 1 < (lastindex(seq) % UInt) ? (@inbounds seq[i], i + 1) : nothing
end

lastbitindex(x::BioSequence) = bitindex(x, lastindex(x))
firstbitindex(x::BioSequence) = bitindex(x, firstindex(x))

## Bounds checking
function Base.checkbounds(x::BioSequence, i::Integer)
firstindex(x) i lastindex(x) || throw(BoundsError(x, i))
Expand Down
26 changes: 22 additions & 4 deletions src/bit-manipulation/bit-manipulation.jl
Original file line number Diff line number Diff line change
@@ -1,16 +1,34 @@
@inline function reversebits(x::T, ::BitsPerSymbol{2}) where T <: Base.BitUnsigned
const BitUnsigned = Union{UInt8, UInt16, UInt32, UInt64, UInt128}

@inline function reversebits(x::T, ::BitsPerSymbol{2}) where T <: BitUnsigned
mask = 0x33333333333333333333333333333333 % T
x = ((x >> 2) & mask) | ((x & mask) << 2)
return reversebits(x, BitsPerSymbol{4}())
end

@inline function reversebits(x::T, ::BitsPerSymbol{4}) where T <: Base.BitUnsigned
@inline function reversebits(x::T, ::BitsPerSymbol{4}) where T <: BitUnsigned
mask = 0x0F0F0F0F0F0F0F0F0F0F0F0F0F0F0F0F % T
x = ((x >> 4) & mask) | ((x & mask) << 4)
return bswap(x)
return reversebits(x, BitsPerSymbol{8}())
end

@inline reversebits(x::T, ::BitsPerSymbol{8}) where T <: BitUnsigned = bswap(x)

@inline reversebits(x::UInt16, ::BitsPerSymbol{16}) = x
@inline function reversebits(x::T, ::BitsPerSymbol{16}) where T <: Union{UInt32, UInt64}
mask = 0x0000FFFF0000FFFF0000FFFF0000FFFF % T
x = ((x >> 16) & mask) | ((x & mask) << 16)
reversebits(x, BitsPerSymbol{32}())
end

@inline reversebits(x::UInt32, ::BitsPerSymbol{32}) = x
@inline function reversebits(x::T, ::BitsPerSymbol{32}) where T <: Union{UInt64}
mask = 0x00000000FFFFFFF00000000FFFFFFFF % T
x = ((x >> 32) & mask) | ((x & mask) << 32)
reversebits(x, BitsPerSymbol{64}())
end

reversebits(x::T, ::BitsPerSymbol{8}) where T <: Base.BitUnsigned = bswap(x)
@inline reversebits(x::UInt64, ::BitsPerSymbol{64}) = x

@inline function complement_bitpar(x::Unsigned, ::T) where {T<:NucleicAcidAlphabet{2}}
return ~x
Expand Down
2 changes: 1 addition & 1 deletion src/longsequences/constructors.jl
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ julia> bioseq("UAUGCUGUAGG")
UAUGCUGUAGG
julia> bioseq("PKMW#3>>0;kL")
ERROR: cannot encode 0x23 in AminoAcidAlphabet
ERROR: cannot encode 0x23 (Char '#') in AminoAcidAlphabet
[...]
```
"""
Expand Down
3 changes: 0 additions & 3 deletions src/longsequences/indexing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,6 @@
bitindex(N, encoded_data_eltype(typeof(x)), i)
end

firstbitindex(s::SeqOrView) = bitindex(s, firstindex(s) % UInt)
lastbitindex(s::SeqOrView) = bitindex(s, lastindex(s) % UInt)

@inline function extract_encoded_element(x::SeqOrView, i::Integer)
bi = bitindex(x, i % UInt)
extract_encoded_element(bi, x.data)
Expand Down
17 changes: 17 additions & 0 deletions test/biosequences/misc.jl
Original file line number Diff line number Diff line change
Expand Up @@ -193,4 +193,21 @@ end
@test ungap(seq) == seq
cp = copy(seq)
@test ungap!(seq) == cp
end

# Ideally, we'd not test this internal function, but instead the code that
# relies on this, but this is only called if you have custom alphabets, and
# creating these alphabets of different sizes is a hassle
@testset "bitreverse" begin
bps8 = BioSequences.BitsPerSymbol{8}()
bps16 = BioSequences.BitsPerSymbol{16}()
bps32 = BioSequences.BitsPerSymbol{32}()
bps64 = BioSequences.BitsPerSymbol{64}()
reversebits = BioSequences.reversebits
@test reversebits(0x0102, bps16) === 0x0102
@test reversebits(0x01020304, bps16) === 0x03040102
@test reversebits(0x0102030405060708, bps16) === 0x0708050603040102
@test reversebits(0x01020304, bps32) === 0x01020304
@test reversebits(0x0102030405060708, bps32) === 0x0506070801020304
@test reversebits(0x0102030405060708, bps64) === 0x0102030405060708
end

0 comments on commit e1918ce

Please sign in to comment.