diff --git a/src/alphabet.jl b/src/alphabet.jl index 790a822f..d14c4804 100644 --- a/src/alphabet.jl +++ b/src/alphabet.jl @@ -171,24 +171,17 @@ BitsPerSymbol(::A) where A <: NucleicAcidAlphabet{4} = BitsPerSymbol{4}() ## Encoding and decoding DNA and RNA alphabet symbols - -# A nucleotide with bitvalue B has kmer-bitvalue kmerbits[B+1]. -# ambiguous nucleotides have no kmervalue, here set to 0xff -const twobitnucs = (0xff, 0x00, 0x01, 0xff, - 0x02, 0xff, 0xff, 0xff, - 0x03, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff) - for A in (DNAAlphabet, RNAAlphabet) T = eltype(A) @eval begin # 2-bit encoding @inline function encode(::$(A){2}, nt::$(T)) - if count_ones(nt) != 1 || !isvalid(nt) + u = reinterpret(UInt8, nt) + if count_ones(u) != 1 throw(EncodeError($(A){2}(), nt)) end - return convert(UInt, @inbounds twobitnucs[reinterpret(UInt8, nt) + 0x01]) + trailing_zeros(u) % UInt end @inline function decode(::$(A){2}, x::UInt) @@ -199,9 +192,6 @@ for A in (DNAAlphabet, RNAAlphabet) # 4-bit encoding @inline function encode(::$(A){4}, nt::$(T)) - if !isvalid(nt) - throw(EncodeError($(A){4}(), nt)) - end return convert(UInt, reinterpret(UInt8, nt)) end diff --git a/src/biosequence/biosequence.jl b/src/biosequence/biosequence.jl index 93fd4963..ba1fa375 100644 --- a/src/biosequence/biosequence.jl +++ b/src/biosequence/biosequence.jl @@ -41,6 +41,20 @@ For compatibility with existing `Alphabet`s, the encoded data eltype must be `UI """ abstract type BioSequence{A<:Alphabet} end +function (::Type{S})(seq::BioSequence) where {S <: AbstractString} + _string(S, seq, codetype(Alphabet(seq))) +end + +Base.LazyString(seq::BioSequence) = LazyString(string(seq)) + +function _string(::Type{S}, seq::BioSequence, ::AlphabetCode) where {S<:AbstractString} + return S([Char(x) for x in seq]) +end + +function _string(::Type{String}, seq::BioSequence, ::AsciiAlphabet) + String([stringbyte(s) for s in seq]) +end + """ has_interface(::Type{BioSequence}, ::T, syms::Vector, mutable::Bool, compat::Bool=true) @@ -230,7 +244,6 @@ const AASeq = BioSequence{AminoAcidAlphabet} # The generic functions for any BioSequence... include("indexing.jl") -include("conversion.jl") include("predicates.jl") include("find.jl") include("printing.jl") diff --git a/src/biosequence/conversion.jl b/src/biosequence/conversion.jl deleted file mode 100644 index 7b4a3386..00000000 --- a/src/biosequence/conversion.jl +++ /dev/null @@ -1,24 +0,0 @@ -### -### Conversion & Promotion -### -### -### Conversion methods for biological sequences. -### -### This file is a part of BioJulia. -### License is MIT: https://github.com/BioJulia/BioSequences.jl/blob/master/LICENSE.md - -function (::Type{S})(seq::BioSequence) where {S <: AbstractString} - _string(S, seq, codetype(Alphabet(seq))) -end - -@static if VERSION >= v"1.8" - Base.LazyString(seq::BioSequence) = LazyString(string(seq)) -end - -function _string(::Type{S}, seq::BioSequence, ::AlphabetCode) where {S<:AbstractString} - return S([Char(x) for x in seq]) -end - -function _string(::Type{String}, seq::BioSequence, ::AsciiAlphabet) - String([stringbyte(s) for s in seq]) -end diff --git a/src/bit-manipulation/bit-manipulation.jl b/src/bit-manipulation/bit-manipulation.jl index e40c6e1e..ca3ae7d2 100644 --- a/src/bit-manipulation/bit-manipulation.jl +++ b/src/bit-manipulation/bit-manipulation.jl @@ -38,27 +38,6 @@ end return count_ones((c | g) & ~(a | t)) end -@inline a_bitcount(x::Unsigned, ::NucleicAcidAlphabet{2}) = count_00_bitpairs(x) -@inline c_bitcount(x::Unsigned, ::NucleicAcidAlphabet{2}) = count_01_bitpairs(x) -@inline g_bitcount(x::Unsigned, ::NucleicAcidAlphabet{2}) = count_10_bitpairs(x) -@inline t_bitcount(x::Unsigned, ::NucleicAcidAlphabet{2}) = count_11_bitpairs(x) - -@inline function mismatch_bitcount(a::UInt64, b::UInt64, ::T) where {T<:NucleicAcidAlphabet{4}} - return count_nonzero_nibbles(a ⊻ b) -end - -@inline function mismatch_bitcount(a::UInt64, b::UInt64, ::T) where {T<:NucleicAcidAlphabet{2}} - return count_nonzero_bitpairs(a ⊻ b) -end - -@inline function match_bitcount(a::UInt64, b::UInt64, ::T) where {T<:NucleicAcidAlphabet{4}} - return count_0000_nibbles(a ⊻ b) -end - -@inline function match_bitcount(a::UInt64, b::UInt64, ::T) where {T<:NucleicAcidAlphabet{2}} - return count_00_bitpairs(a ⊻ b) -end - @inline function ambiguous_bitcount(x::UInt64, ::T) where {T<:NucleicAcidAlphabet{4}} return count_nonzero_nibbles(enumerate_nibbles(x) & 0xEEEEEEEEEEEEEEEE) end @@ -67,19 +46,6 @@ end return count_compared_bytes(i -> (i > 0x15) & (i < 0x1a), x) end -@inline function ambiguous_bitcount(a::UInt64, b::UInt64, ::T) where {T<:NucleicAcidAlphabet{4}} - return count_nonzero_nibbles((enumerate_nibbles(a) | enumerate_nibbles(b)) & 0xEEEEEEEEEEEEEEEE) -end - -@inline function gap_bitcount(x::UInt64, ::T) where {T<:NucleicAcidAlphabet{4}} - return count_0000_nibbles(x) -end - -@inline function gap_bitcount(a::UInt64, b::UInt64, ::T) where {T<:NucleicAcidAlphabet{4}} - # Count the gaps in a, count the gaps in b, subtract the number of shared gaps. - return count_0000_nibbles(a) + count_0000_nibbles(b) - count_0000_nibbles(a | b) -end - @inline function count_compared_bytes(f::F, x::UInt64) where F z = @inline reinterpret(NTuple{8, VecElement{UInt8}}, x) @inline sum(map(i -> f(i.value), z)) @@ -91,12 +57,6 @@ end return count_0000_nibbles(x) end -@inline function certain_bitcount(a::UInt64, b::UInt64, ::T) where {T<:NucleicAcidAlphabet{4}} - x = enumerate_nibbles(a) ⊻ 0x1111111111111111 - y = enumerate_nibbles(b) ⊻ 0x1111111111111111 - return count_0000_nibbles(x | y) -end - @inline function four_to_two_bits(x::UInt64)::UInt32 m1 = 0x1111111111111111 m2 = m1 << 1 diff --git a/src/bit-manipulation/bitindex.jl b/src/bit-manipulation/bitindex.jl index 0b8cfe31..406de72b 100644 --- a/src/bit-manipulation/bitindex.jl +++ b/src/bit-manipulation/bitindex.jl @@ -32,8 +32,10 @@ end BitsPerSymbol(::BitIndex{N, W}) where {N,W} = BitsPerSymbol{N}() bits_per_symbol(::BitIndex{N, W}) where {N,W} = N -@inline function bitindex(::BitsPerSymbol{N}, ::Type{W}, i) where {N,W} - return BitIndex{N, W}((i - 1) << trailing_zeros(N)) +bitindex(B::BitsPerSymbol, ::Type{W}, i::Integer) where W = bitindex(B, W, Int(i)::Int) + +@inline function bitindex(::BitsPerSymbol{N}, ::Type{W}, i::Union{Int, UInt}) where {N,W} + return BitIndex{N, W}(((i - 1) % UInt) << trailing_zeros(N)) end bitwidth(::Type{W}) where W = 8*sizeof(W) diff --git a/src/geneticcode.jl b/src/geneticcode.jl index 444db4b1..20333126 100644 --- a/src/geneticcode.jl +++ b/src/geneticcode.jl @@ -9,13 +9,10 @@ const XNA = Union{DNA, RNA} function unambiguous_codon(a::XNA, b::XNA, c::XNA) - @inbounds begin - bits = twobitnucs[reinterpret(UInt8, a) + 0x01] << 4 | - twobitnucs[reinterpret(UInt8, b) + 0x01] << 2 | - twobitnucs[reinterpret(UInt8, c) + 0x01] - end - #reinterpret(RNACodon, bits % UInt64) - return bits % UInt64 + bits = (trailing_zeros(reinterpret(UInt8, a)) << 4) | + (trailing_zeros(reinterpret(UInt8, b)) << 2) | + (trailing_zeros(reinterpret(UInt8, c)) << 0) + bits % UInt64 end # A genetic code is a table mapping RNA 3-mers (i.e. RNAKmer{3}) to AminoAcids. diff --git a/src/longsequences/constructors.jl b/src/longsequences/constructors.jl index f3cbde9e..2054ce6f 100644 --- a/src/longsequences/constructors.jl +++ b/src/longsequences/constructors.jl @@ -7,6 +7,27 @@ ### This file is a part of BioJulia. ### License is MIT: https://github.com/BioJulia/BioSequences.jl/blob/master/LICENSE.md +### +### Promotion +### +for alph in (DNAAlphabet, RNAAlphabet) + @eval function Base.promote_rule(::Type{LongSequence{A}}, ::Type{LongSequence{B}}) where {A<:$alph,B<:$alph} + return LongSequence{promote_rule(A, B)} + end +end + +### +### Conversion +### +Base.convert(::Type{T}, seq::T) where {T <: LongSequence} = seq +Base.convert(::Type{T}, seq::T) where {T <: LongSequence{<:NucleicAcidAlphabet}} = seq + +function Base.convert(::Type{T}, seq::LongSequence{<:NucleicAcidAlphabet}) where + {T<:LongSequence{<:NucleicAcidAlphabet}} + return T(seq) +end + + @inline seq_data_len(s::LongSequence{A}) where A = seq_data_len(A, length(s)) @inline function seq_data_len(::Type{A}, len::Integer) where A <: Alphabet diff --git a/src/longsequences/conversion.jl b/src/longsequences/conversion.jl deleted file mode 100644 index 34c80e7c..00000000 --- a/src/longsequences/conversion.jl +++ /dev/null @@ -1,28 +0,0 @@ -### -### Conversion & Promotion -### -### -### Conversion methods for LongSequences. -### -### This file is a part of BioJulia. -### License is MIT: https://github.com/BioJulia/BioSequences.jl/blob/master/LICENSE.md - -### -### Promotion -### -for alph in (DNAAlphabet, RNAAlphabet) - @eval function Base.promote_rule(::Type{LongSequence{A}}, ::Type{LongSequence{B}}) where {A<:$alph,B<:$alph} - return LongSequence{promote_rule(A, B)} - end -end - -### -### Conversion -### -Base.convert(::Type{T}, seq::T) where {T <: LongSequence} = seq -Base.convert(::Type{T}, seq::T) where {T <: LongSequence{<:NucleicAcidAlphabet}} = seq - -function Base.convert(::Type{T}, seq::LongSequence{<:NucleicAcidAlphabet}) where - {T<:LongSequence{<:NucleicAcidAlphabet}} - return T(seq) -end diff --git a/src/longsequences/longsequence.jl b/src/longsequences/longsequence.jl index 5b0d9fcc..fa4377bc 100644 --- a/src/longsequences/longsequence.jl +++ b/src/longsequences/longsequence.jl @@ -116,7 +116,6 @@ include("seqview.jl") include("chunk_iterator.jl") include("indexing.jl") include("constructors.jl") -include("conversion.jl") include("copying.jl") include("stringliterals.jl") include("transformations.jl") diff --git a/src/longsequences/operators.jl b/src/longsequences/operators.jl index f1b5d05e..e53da895 100644 --- a/src/longsequences/operators.jl +++ b/src/longsequences/operators.jl @@ -201,15 +201,20 @@ function Base.:(==)(seq1::SeqOrView{A}, seq2::SeqOrView{A}) where {A <: Alphabet length(seq1) == length(seq2) || return false # If they share the same data + isempty(seq1) && return true if seq1.data === seq2.data && firstbitindex(seq1) == firstbitindex(seq2) return true end - # Fallback - for (i, j) in zip(seq1, seq2) + # Check all filled UInts + (it, (ch1, ch2, rem)) = iter_chunks(seq1, seq2) + for (i, j) in it i == j || return false end - return true + + # Check last coding UInt (or compare two zeros, if none) + mask = UInt64(1) << (rem & 63) - 1 + return (ch1 & mask) == (ch2 & mask) end function Base.:(==)(seq1::LongSequence{A}, seq2::LongSequence{A}) where {A <: Alphabet} diff --git a/src/longsequences/transformations.jl b/src/longsequences/transformations.jl index f1e78178..e2ede26d 100644 --- a/src/longsequences/transformations.jl +++ b/src/longsequences/transformations.jl @@ -3,10 +3,14 @@ ### """ - resize!(seq, size, [force::Bool]) + resize!(seq, size, [force::Bool=false]) Resize a biological sequence `seq`, to a given `size`. Does not resize the underlying data array unless the new size does not fit. If `force`, always resize underlying data array. + +Note that resizing to a larger size, and then loading from uninitialized positions +is not allowed and may cause undefined behaviour. +Make sure to always fill any uninitialized biosymbols after resizing. """ function Base.resize!(seq::LongSequence{A}, size::Integer, force::Bool=false) where {A} if size < 0 @@ -67,9 +71,8 @@ end # all chunks by up to 63 bits. # This is written so it SIMD parallelizes - careful with changes @inline function zero_offset!(seq::LongSequence{A}) where A <: Alphabet - isempty(seq) && return seq offs = (64 - offset(bitindex(seq, length(seq)) + bits_per_symbol(A()))) % UInt - zero_offset!(seq, offs) + zero_offset!(seq, offs) end @inline function zero_offset!(seq::LongSequence{A}, offs::UInt) where A <: Alphabet @@ -165,4 +168,4 @@ function Random.shuffle!(seq::LongSequence) seq[i], seq[j] = seq[j], seq[i] end return seq -end \ No newline at end of file +end diff --git a/test/alphabet.jl b/test/alphabet.jl index 8fa24a21..cb27fa0d 100644 --- a/test/alphabet.jl +++ b/test/alphabet.jl @@ -125,7 +125,6 @@ end for nt in BioSymbols.alphabet(DNA) @test encode(DNAAlphabet{4}(), nt) === UInt(reinterpret(UInt8, nt)) end - @test_throws EncodeError encode(DNAAlphabet{4}(), reinterpret(DNA, 0b10000)) end @testset "RNA" begin @@ -142,7 +141,6 @@ end for nt in BioSymbols.alphabet(RNA) @test encode(RNAAlphabet{4}(), nt) === UInt(reinterpret(UInt8, nt)) end - @test_throws EncodeError encode(RNAAlphabet{4}(), reinterpret(RNA, 0b10000)) end @testset "AminoAcid" begin