Skip to content

Commit

Permalink
Various internal code cleanup (#323)
Browse files Browse the repository at this point in the history
* Remove some unused, internal functions
* Use `trailing_zeros` instead of LUT for 4->2 bit transcoding
* Remove validity check when encoding, since invalid biosymbols should never
  exist
* Remove some small source code files, and move their content to other files
* Optimise a commonly used bitindex method
* Optimise equality for SeqOrView of identical alphabets
  • Loading branch information
jakobnissen authored Oct 24, 2024
1 parent 0047647 commit 8fa5616
Show file tree
Hide file tree
Showing 12 changed files with 61 additions and 125 deletions.
16 changes: 3 additions & 13 deletions src/alphabet.jl
Original file line number Diff line number Diff line change
Expand Up @@ -171,24 +171,17 @@ BitsPerSymbol(::A) where A <: NucleicAcidAlphabet{4} = BitsPerSymbol{4}()


## Encoding and decoding DNA and RNA alphabet symbols

# A nucleotide with bitvalue B has kmer-bitvalue kmerbits[B+1].
# ambiguous nucleotides have no kmervalue, here set to 0xff
const twobitnucs = (0xff, 0x00, 0x01, 0xff,
0x02, 0xff, 0xff, 0xff,
0x03, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff)

for A in (DNAAlphabet, RNAAlphabet)
T = eltype(A)
@eval begin

# 2-bit encoding
@inline function encode(::$(A){2}, nt::$(T))
if count_ones(nt) != 1 || !isvalid(nt)
u = reinterpret(UInt8, nt)
if count_ones(u) != 1
throw(EncodeError($(A){2}(), nt))
end
return convert(UInt, @inbounds twobitnucs[reinterpret(UInt8, nt) + 0x01])
trailing_zeros(u) % UInt
end

@inline function decode(::$(A){2}, x::UInt)
Expand All @@ -199,9 +192,6 @@ for A in (DNAAlphabet, RNAAlphabet)

# 4-bit encoding
@inline function encode(::$(A){4}, nt::$(T))
if !isvalid(nt)
throw(EncodeError($(A){4}(), nt))
end
return convert(UInt, reinterpret(UInt8, nt))
end

Expand Down
15 changes: 14 additions & 1 deletion src/biosequence/biosequence.jl
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,20 @@ For compatibility with existing `Alphabet`s, the encoded data eltype must be `UI
"""
abstract type BioSequence{A<:Alphabet} end

function (::Type{S})(seq::BioSequence) where {S <: AbstractString}
_string(S, seq, codetype(Alphabet(seq)))
end

Base.LazyString(seq::BioSequence) = LazyString(string(seq))

function _string(::Type{S}, seq::BioSequence, ::AlphabetCode) where {S<:AbstractString}
return S([Char(x) for x in seq])
end

function _string(::Type{String}, seq::BioSequence, ::AsciiAlphabet)
String([stringbyte(s) for s in seq])
end

"""
has_interface(::Type{BioSequence}, ::T, syms::Vector, mutable::Bool, compat::Bool=true)
Expand Down Expand Up @@ -230,7 +244,6 @@ const AASeq = BioSequence{AminoAcidAlphabet}

# The generic functions for any BioSequence...
include("indexing.jl")
include("conversion.jl")
include("predicates.jl")
include("find.jl")
include("printing.jl")
Expand Down
24 changes: 0 additions & 24 deletions src/biosequence/conversion.jl

This file was deleted.

40 changes: 0 additions & 40 deletions src/bit-manipulation/bit-manipulation.jl
Original file line number Diff line number Diff line change
Expand Up @@ -38,27 +38,6 @@ end
return count_ones((c | g) & ~(a | t))
end

@inline a_bitcount(x::Unsigned, ::NucleicAcidAlphabet{2}) = count_00_bitpairs(x)
@inline c_bitcount(x::Unsigned, ::NucleicAcidAlphabet{2}) = count_01_bitpairs(x)
@inline g_bitcount(x::Unsigned, ::NucleicAcidAlphabet{2}) = count_10_bitpairs(x)
@inline t_bitcount(x::Unsigned, ::NucleicAcidAlphabet{2}) = count_11_bitpairs(x)

@inline function mismatch_bitcount(a::UInt64, b::UInt64, ::T) where {T<:NucleicAcidAlphabet{4}}
return count_nonzero_nibbles(a b)
end

@inline function mismatch_bitcount(a::UInt64, b::UInt64, ::T) where {T<:NucleicAcidAlphabet{2}}
return count_nonzero_bitpairs(a b)
end

@inline function match_bitcount(a::UInt64, b::UInt64, ::T) where {T<:NucleicAcidAlphabet{4}}
return count_0000_nibbles(a b)
end

@inline function match_bitcount(a::UInt64, b::UInt64, ::T) where {T<:NucleicAcidAlphabet{2}}
return count_00_bitpairs(a b)
end

@inline function ambiguous_bitcount(x::UInt64, ::T) where {T<:NucleicAcidAlphabet{4}}
return count_nonzero_nibbles(enumerate_nibbles(x) & 0xEEEEEEEEEEEEEEEE)
end
Expand All @@ -67,19 +46,6 @@ end
return count_compared_bytes(i -> (i > 0x15) & (i < 0x1a), x)
end

@inline function ambiguous_bitcount(a::UInt64, b::UInt64, ::T) where {T<:NucleicAcidAlphabet{4}}
return count_nonzero_nibbles((enumerate_nibbles(a) | enumerate_nibbles(b)) & 0xEEEEEEEEEEEEEEEE)
end

@inline function gap_bitcount(x::UInt64, ::T) where {T<:NucleicAcidAlphabet{4}}
return count_0000_nibbles(x)
end

@inline function gap_bitcount(a::UInt64, b::UInt64, ::T) where {T<:NucleicAcidAlphabet{4}}
# Count the gaps in a, count the gaps in b, subtract the number of shared gaps.
return count_0000_nibbles(a) + count_0000_nibbles(b) - count_0000_nibbles(a | b)
end

@inline function count_compared_bytes(f::F, x::UInt64) where F
z = @inline reinterpret(NTuple{8, VecElement{UInt8}}, x)
@inline sum(map(i -> f(i.value), z))
Expand All @@ -91,12 +57,6 @@ end
return count_0000_nibbles(x)
end

@inline function certain_bitcount(a::UInt64, b::UInt64, ::T) where {T<:NucleicAcidAlphabet{4}}
x = enumerate_nibbles(a) 0x1111111111111111
y = enumerate_nibbles(b) 0x1111111111111111
return count_0000_nibbles(x | y)
end

@inline function four_to_two_bits(x::UInt64)::UInt32
m1 = 0x1111111111111111
m2 = m1 << 1
Expand Down
6 changes: 4 additions & 2 deletions src/bit-manipulation/bitindex.jl
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,10 @@ end
BitsPerSymbol(::BitIndex{N, W}) where {N,W} = BitsPerSymbol{N}()
bits_per_symbol(::BitIndex{N, W}) where {N,W} = N

@inline function bitindex(::BitsPerSymbol{N}, ::Type{W}, i) where {N,W}
return BitIndex{N, W}((i - 1) << trailing_zeros(N))
bitindex(B::BitsPerSymbol, ::Type{W}, i::Integer) where W = bitindex(B, W, Int(i)::Int)

@inline function bitindex(::BitsPerSymbol{N}, ::Type{W}, i::Union{Int, UInt}) where {N,W}
return BitIndex{N, W}(((i - 1) % UInt) << trailing_zeros(N))
end

bitwidth(::Type{W}) where W = 8*sizeof(W)
Expand Down
11 changes: 4 additions & 7 deletions src/geneticcode.jl
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,10 @@

const XNA = Union{DNA, RNA}
function unambiguous_codon(a::XNA, b::XNA, c::XNA)
@inbounds begin
bits = twobitnucs[reinterpret(UInt8, a) + 0x01] << 4 |
twobitnucs[reinterpret(UInt8, b) + 0x01] << 2 |
twobitnucs[reinterpret(UInt8, c) + 0x01]
end
#reinterpret(RNACodon, bits % UInt64)
return bits % UInt64
bits = (trailing_zeros(reinterpret(UInt8, a)) << 4) |
(trailing_zeros(reinterpret(UInt8, b)) << 2) |
(trailing_zeros(reinterpret(UInt8, c)) << 0)
bits % UInt64
end

# A genetic code is a table mapping RNA 3-mers (i.e. RNAKmer{3}) to AminoAcids.
Expand Down
21 changes: 21 additions & 0 deletions src/longsequences/constructors.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,27 @@
### This file is a part of BioJulia.
### License is MIT: https://github.com/BioJulia/BioSequences.jl/blob/master/LICENSE.md

###
### Promotion
###
for alph in (DNAAlphabet, RNAAlphabet)
@eval function Base.promote_rule(::Type{LongSequence{A}}, ::Type{LongSequence{B}}) where {A<:$alph,B<:$alph}
return LongSequence{promote_rule(A, B)}
end
end

###
### Conversion
###
Base.convert(::Type{T}, seq::T) where {T <: LongSequence} = seq
Base.convert(::Type{T}, seq::T) where {T <: LongSequence{<:NucleicAcidAlphabet}} = seq

function Base.convert(::Type{T}, seq::LongSequence{<:NucleicAcidAlphabet}) where
{T<:LongSequence{<:NucleicAcidAlphabet}}
return T(seq)
end


@inline seq_data_len(s::LongSequence{A}) where A = seq_data_len(A, length(s))

@inline function seq_data_len(::Type{A}, len::Integer) where A <: Alphabet
Expand Down
28 changes: 0 additions & 28 deletions src/longsequences/conversion.jl

This file was deleted.

1 change: 0 additions & 1 deletion src/longsequences/longsequence.jl
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,6 @@ include("seqview.jl")
include("chunk_iterator.jl")
include("indexing.jl")
include("constructors.jl")
include("conversion.jl")
include("copying.jl")
include("stringliterals.jl")
include("transformations.jl")
Expand Down
11 changes: 8 additions & 3 deletions src/longsequences/operators.jl
Original file line number Diff line number Diff line change
Expand Up @@ -201,15 +201,20 @@ function Base.:(==)(seq1::SeqOrView{A}, seq2::SeqOrView{A}) where {A <: Alphabet
length(seq1) == length(seq2) || return false

# If they share the same data
isempty(seq1) && return true
if seq1.data === seq2.data && firstbitindex(seq1) == firstbitindex(seq2)
return true
end

# Fallback
for (i, j) in zip(seq1, seq2)
# Check all filled UInts
(it, (ch1, ch2, rem)) = iter_chunks(seq1, seq2)
for (i, j) in it
i == j || return false
end
return true

# Check last coding UInt (or compare two zeros, if none)
mask = UInt64(1) << (rem & 63) - 1
return (ch1 & mask) == (ch2 & mask)
end

function Base.:(==)(seq1::LongSequence{A}, seq2::LongSequence{A}) where {A <: Alphabet}
Expand Down
11 changes: 7 additions & 4 deletions src/longsequences/transformations.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,14 @@
###

"""
resize!(seq, size, [force::Bool])
resize!(seq, size, [force::Bool=false])
Resize a biological sequence `seq`, to a given `size`. Does not resize the underlying data
array unless the new size does not fit. If `force`, always resize underlying data array.
Note that resizing to a larger size, and then loading from uninitialized positions
is not allowed and may cause undefined behaviour.
Make sure to always fill any uninitialized biosymbols after resizing.
"""
function Base.resize!(seq::LongSequence{A}, size::Integer, force::Bool=false) where {A}
if size < 0
Expand Down Expand Up @@ -67,9 +71,8 @@ end
# all chunks by up to 63 bits.
# This is written so it SIMD parallelizes - careful with changes
@inline function zero_offset!(seq::LongSequence{A}) where A <: Alphabet
isempty(seq) && return seq
offs = (64 - offset(bitindex(seq, length(seq)) + bits_per_symbol(A()))) % UInt
zero_offset!(seq, offs)
zero_offset!(seq, offs)
end

@inline function zero_offset!(seq::LongSequence{A}, offs::UInt) where A <: Alphabet
Expand Down Expand Up @@ -165,4 +168,4 @@ function Random.shuffle!(seq::LongSequence)
seq[i], seq[j] = seq[j], seq[i]
end
return seq
end
end
2 changes: 0 additions & 2 deletions test/alphabet.jl
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,6 @@ end
for nt in BioSymbols.alphabet(DNA)
@test encode(DNAAlphabet{4}(), nt) === UInt(reinterpret(UInt8, nt))
end
@test_throws EncodeError encode(DNAAlphabet{4}(), reinterpret(DNA, 0b10000))
end

@testset "RNA" begin
Expand All @@ -142,7 +141,6 @@ end
for nt in BioSymbols.alphabet(RNA)
@test encode(RNAAlphabet{4}(), nt) === UInt(reinterpret(UInt8, nt))
end
@test_throws EncodeError encode(RNAAlphabet{4}(), reinterpret(RNA, 0b10000))
end

@testset "AminoAcid" begin
Expand Down

0 comments on commit 8fa5616

Please sign in to comment.