Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Various internal code cleanup #323

Merged
merged 1 commit into from
Oct 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 3 additions & 13 deletions src/alphabet.jl
Original file line number Diff line number Diff line change
Expand Up @@ -171,24 +171,17 @@ BitsPerSymbol(::A) where A <: NucleicAcidAlphabet{4} = BitsPerSymbol{4}()


## Encoding and decoding DNA and RNA alphabet symbols

# A nucleotide with bitvalue B has kmer-bitvalue kmerbits[B+1].
# ambiguous nucleotides have no kmervalue, here set to 0xff
const twobitnucs = (0xff, 0x00, 0x01, 0xff,
0x02, 0xff, 0xff, 0xff,
0x03, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff)

for A in (DNAAlphabet, RNAAlphabet)
T = eltype(A)
@eval begin

# 2-bit encoding
@inline function encode(::$(A){2}, nt::$(T))
if count_ones(nt) != 1 || !isvalid(nt)
u = reinterpret(UInt8, nt)
if count_ones(u) != 1
throw(EncodeError($(A){2}(), nt))
end
return convert(UInt, @inbounds twobitnucs[reinterpret(UInt8, nt) + 0x01])
trailing_zeros(u) % UInt
end

@inline function decode(::$(A){2}, x::UInt)
Expand All @@ -199,9 +192,6 @@ for A in (DNAAlphabet, RNAAlphabet)

# 4-bit encoding
@inline function encode(::$(A){4}, nt::$(T))
if !isvalid(nt)
throw(EncodeError($(A){4}(), nt))
end
return convert(UInt, reinterpret(UInt8, nt))
end

Expand Down
15 changes: 14 additions & 1 deletion src/biosequence/biosequence.jl
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,20 @@ For compatibility with existing `Alphabet`s, the encoded data eltype must be `UI
"""
abstract type BioSequence{A<:Alphabet} end

function (::Type{S})(seq::BioSequence) where {S <: AbstractString}
_string(S, seq, codetype(Alphabet(seq)))
end

Base.LazyString(seq::BioSequence) = LazyString(string(seq))

function _string(::Type{S}, seq::BioSequence, ::AlphabetCode) where {S<:AbstractString}
return S([Char(x) for x in seq])
end

function _string(::Type{String}, seq::BioSequence, ::AsciiAlphabet)
String([stringbyte(s) for s in seq])
end

"""
has_interface(::Type{BioSequence}, ::T, syms::Vector, mutable::Bool, compat::Bool=true)

Expand Down Expand Up @@ -230,7 +244,6 @@ const AASeq = BioSequence{AminoAcidAlphabet}

# The generic functions for any BioSequence...
include("indexing.jl")
include("conversion.jl")
include("predicates.jl")
include("find.jl")
include("printing.jl")
Expand Down
24 changes: 0 additions & 24 deletions src/biosequence/conversion.jl

This file was deleted.

40 changes: 0 additions & 40 deletions src/bit-manipulation/bit-manipulation.jl
Original file line number Diff line number Diff line change
Expand Up @@ -38,27 +38,6 @@ end
return count_ones((c | g) & ~(a | t))
end

@inline a_bitcount(x::Unsigned, ::NucleicAcidAlphabet{2}) = count_00_bitpairs(x)
@inline c_bitcount(x::Unsigned, ::NucleicAcidAlphabet{2}) = count_01_bitpairs(x)
@inline g_bitcount(x::Unsigned, ::NucleicAcidAlphabet{2}) = count_10_bitpairs(x)
@inline t_bitcount(x::Unsigned, ::NucleicAcidAlphabet{2}) = count_11_bitpairs(x)

@inline function mismatch_bitcount(a::UInt64, b::UInt64, ::T) where {T<:NucleicAcidAlphabet{4}}
return count_nonzero_nibbles(a ⊻ b)
end

@inline function mismatch_bitcount(a::UInt64, b::UInt64, ::T) where {T<:NucleicAcidAlphabet{2}}
return count_nonzero_bitpairs(a ⊻ b)
end

@inline function match_bitcount(a::UInt64, b::UInt64, ::T) where {T<:NucleicAcidAlphabet{4}}
return count_0000_nibbles(a ⊻ b)
end

@inline function match_bitcount(a::UInt64, b::UInt64, ::T) where {T<:NucleicAcidAlphabet{2}}
return count_00_bitpairs(a ⊻ b)
end

@inline function ambiguous_bitcount(x::UInt64, ::T) where {T<:NucleicAcidAlphabet{4}}
return count_nonzero_nibbles(enumerate_nibbles(x) & 0xEEEEEEEEEEEEEEEE)
end
Expand All @@ -67,19 +46,6 @@ end
return count_compared_bytes(i -> (i > 0x15) & (i < 0x1a), x)
end

@inline function ambiguous_bitcount(a::UInt64, b::UInt64, ::T) where {T<:NucleicAcidAlphabet{4}}
return count_nonzero_nibbles((enumerate_nibbles(a) | enumerate_nibbles(b)) & 0xEEEEEEEEEEEEEEEE)
end

@inline function gap_bitcount(x::UInt64, ::T) where {T<:NucleicAcidAlphabet{4}}
return count_0000_nibbles(x)
end

@inline function gap_bitcount(a::UInt64, b::UInt64, ::T) where {T<:NucleicAcidAlphabet{4}}
# Count the gaps in a, count the gaps in b, subtract the number of shared gaps.
return count_0000_nibbles(a) + count_0000_nibbles(b) - count_0000_nibbles(a | b)
end

@inline function count_compared_bytes(f::F, x::UInt64) where F
z = @inline reinterpret(NTuple{8, VecElement{UInt8}}, x)
@inline sum(map(i -> f(i.value), z))
Expand All @@ -91,12 +57,6 @@ end
return count_0000_nibbles(x)
end

@inline function certain_bitcount(a::UInt64, b::UInt64, ::T) where {T<:NucleicAcidAlphabet{4}}
x = enumerate_nibbles(a) ⊻ 0x1111111111111111
y = enumerate_nibbles(b) ⊻ 0x1111111111111111
return count_0000_nibbles(x | y)
end

@inline function four_to_two_bits(x::UInt64)::UInt32
m1 = 0x1111111111111111
m2 = m1 << 1
Expand Down
6 changes: 4 additions & 2 deletions src/bit-manipulation/bitindex.jl
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,10 @@
BitsPerSymbol(::BitIndex{N, W}) where {N,W} = BitsPerSymbol{N}()
bits_per_symbol(::BitIndex{N, W}) where {N,W} = N

@inline function bitindex(::BitsPerSymbol{N}, ::Type{W}, i) where {N,W}
return BitIndex{N, W}((i - 1) << trailing_zeros(N))
bitindex(B::BitsPerSymbol, ::Type{W}, i::Integer) where W = bitindex(B, W, Int(i)::Int)

Check warning on line 35 in src/bit-manipulation/bitindex.jl

View check run for this annotation

Codecov / codecov/patch

src/bit-manipulation/bitindex.jl#L35

Added line #L35 was not covered by tests

@inline function bitindex(::BitsPerSymbol{N}, ::Type{W}, i::Union{Int, UInt}) where {N,W}
return BitIndex{N, W}(((i - 1) % UInt) << trailing_zeros(N))
end

bitwidth(::Type{W}) where W = 8*sizeof(W)
Expand Down
11 changes: 4 additions & 7 deletions src/geneticcode.jl
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,10 @@

const XNA = Union{DNA, RNA}
function unambiguous_codon(a::XNA, b::XNA, c::XNA)
@inbounds begin
bits = twobitnucs[reinterpret(UInt8, a) + 0x01] << 4 |
twobitnucs[reinterpret(UInt8, b) + 0x01] << 2 |
twobitnucs[reinterpret(UInt8, c) + 0x01]
end
#reinterpret(RNACodon, bits % UInt64)
return bits % UInt64
bits = (trailing_zeros(reinterpret(UInt8, a)) << 4) |
(trailing_zeros(reinterpret(UInt8, b)) << 2) |
(trailing_zeros(reinterpret(UInt8, c)) << 0)
bits % UInt64
end

# A genetic code is a table mapping RNA 3-mers (i.e. RNAKmer{3}) to AminoAcids.
Expand Down
21 changes: 21 additions & 0 deletions src/longsequences/constructors.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,27 @@
### This file is a part of BioJulia.
### License is MIT: https://github.com/BioJulia/BioSequences.jl/blob/master/LICENSE.md

###
### Promotion
###
for alph in (DNAAlphabet, RNAAlphabet)
@eval function Base.promote_rule(::Type{LongSequence{A}}, ::Type{LongSequence{B}}) where {A<:$alph,B<:$alph}
return LongSequence{promote_rule(A, B)}
end
end

###
### Conversion
###
Base.convert(::Type{T}, seq::T) where {T <: LongSequence} = seq
Base.convert(::Type{T}, seq::T) where {T <: LongSequence{<:NucleicAcidAlphabet}} = seq

function Base.convert(::Type{T}, seq::LongSequence{<:NucleicAcidAlphabet}) where
{T<:LongSequence{<:NucleicAcidAlphabet}}
return T(seq)
end


@inline seq_data_len(s::LongSequence{A}) where A = seq_data_len(A, length(s))

@inline function seq_data_len(::Type{A}, len::Integer) where A <: Alphabet
Expand Down
28 changes: 0 additions & 28 deletions src/longsequences/conversion.jl

This file was deleted.

1 change: 0 additions & 1 deletion src/longsequences/longsequence.jl
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,6 @@ include("seqview.jl")
include("chunk_iterator.jl")
include("indexing.jl")
include("constructors.jl")
include("conversion.jl")
include("copying.jl")
include("stringliterals.jl")
include("transformations.jl")
Expand Down
11 changes: 8 additions & 3 deletions src/longsequences/operators.jl
Original file line number Diff line number Diff line change
Expand Up @@ -201,15 +201,20 @@ function Base.:(==)(seq1::SeqOrView{A}, seq2::SeqOrView{A}) where {A <: Alphabet
length(seq1) == length(seq2) || return false

# If they share the same data
isempty(seq1) && return true
if seq1.data === seq2.data && firstbitindex(seq1) == firstbitindex(seq2)
return true
end

# Fallback
for (i, j) in zip(seq1, seq2)
# Check all filled UInts
(it, (ch1, ch2, rem)) = iter_chunks(seq1, seq2)
for (i, j) in it
i == j || return false
end
return true

# Check last coding UInt (or compare two zeros, if none)
mask = UInt64(1) << (rem & 63) - 1
return (ch1 & mask) == (ch2 & mask)
end

function Base.:(==)(seq1::LongSequence{A}, seq2::LongSequence{A}) where {A <: Alphabet}
Expand Down
11 changes: 7 additions & 4 deletions src/longsequences/transformations.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,14 @@
###

"""
resize!(seq, size, [force::Bool])
resize!(seq, size, [force::Bool=false])

Resize a biological sequence `seq`, to a given `size`. Does not resize the underlying data
array unless the new size does not fit. If `force`, always resize underlying data array.

Note that resizing to a larger size, and then loading from uninitialized positions
is not allowed and may cause undefined behaviour.
Make sure to always fill any uninitialized biosymbols after resizing.
"""
function Base.resize!(seq::LongSequence{A}, size::Integer, force::Bool=false) where {A}
if size < 0
Expand Down Expand Up @@ -67,9 +71,8 @@ end
# all chunks by up to 63 bits.
# This is written so it SIMD parallelizes - careful with changes
@inline function zero_offset!(seq::LongSequence{A}) where A <: Alphabet
isempty(seq) && return seq
offs = (64 - offset(bitindex(seq, length(seq)) + bits_per_symbol(A()))) % UInt
zero_offset!(seq, offs)
zero_offset!(seq, offs)
end

@inline function zero_offset!(seq::LongSequence{A}, offs::UInt) where A <: Alphabet
Expand Down Expand Up @@ -165,4 +168,4 @@ function Random.shuffle!(seq::LongSequence)
seq[i], seq[j] = seq[j], seq[i]
end
return seq
end
end
2 changes: 0 additions & 2 deletions test/alphabet.jl
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,6 @@ end
for nt in BioSymbols.alphabet(DNA)
@test encode(DNAAlphabet{4}(), nt) === UInt(reinterpret(UInt8, nt))
end
@test_throws EncodeError encode(DNAAlphabet{4}(), reinterpret(DNA, 0b10000))
end

@testset "RNA" begin
Expand All @@ -142,7 +141,6 @@ end
for nt in BioSymbols.alphabet(RNA)
@test encode(RNAAlphabet{4}(), nt) === UInt(reinterpret(UInt8, nt))
end
@test_throws EncodeError encode(RNAAlphabet{4}(), reinterpret(RNA, 0b10000))
end

@testset "AminoAcid" begin
Expand Down
Loading