diff --git a/.github/workflows/UnitTests.yml b/.github/workflows/UnitTests.yml index d8ee9086..e4162cea 100644 --- a/.github/workflows/UnitTests.yml +++ b/.github/workflows/UnitTests.yml @@ -11,7 +11,7 @@ jobs: strategy: fail-fast: false matrix: - julia-version: ['1', '1.10'] + julia-version: ['1', '1.11'] os: [ubuntu-latest, macOS-latest, windows-latest] experimental: [false] include: diff --git a/Project.toml b/Project.toml index 42d23282..81a933a4 100644 --- a/Project.toml +++ b/Project.toml @@ -15,7 +15,7 @@ PrecompileTools = "1" Random = "1.5" StableRNGs = "0.1, 1.0" Twiddle = "1.1.1" -julia = "1.10" +julia = "1.11" [extras] Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" diff --git a/src/longsequences/constructors.jl b/src/longsequences/constructors.jl index abdd2f94..ecab51c9 100644 --- a/src/longsequences/constructors.jl +++ b/src/longsequences/constructors.jl @@ -9,22 +9,22 @@ @inline seq_data_len(s::LongSequence{A}) where A = seq_data_len(A, length(s)) -@inline function seq_data_len(::Type{A}, len::Integer) where A <: Alphabet +@inline function seq_data_len(::Type{A}, len::Integer)::Int where A <: Alphabet iszero(bits_per_symbol(A())) && return 0 - return cld(len, div(64, bits_per_symbol(A()))) + return cld(len % UInt, div(64, bits_per_symbol(A())) % UInt) % Int end function LongSequence{A}(::UndefInitializer, len::Integer) where {A<:Alphabet} if len < 0 throw(ArgumentError("len must be non-negative")) end - return LongSequence{A}(Vector{UInt64}(undef, seq_data_len(A, len)), UInt(len)) + return LongSequence{A}(Memory{UInt64}(undef, seq_data_len(A, len)), UInt(len)) end # Generic constructor function LongSequence{A}(it) where {A <: Alphabet} len = length(it) - data = Vector{UInt64}(undef, seq_data_len(A, len)) + data = Memory{UInt64}(undef, seq_data_len(A, len)) bits = zero(UInt) bitind = bitindex(BitsPerSymbol(A()), encoded_data_eltype(LongSequence{A}), 1) @inbounds for x in it @@ -41,7 +41,7 @@ function LongSequence{A}(it) where {A <: Alphabet} LongSequence{A}(data, len % UInt) end -Base.empty(::Type{T}) where {T <: LongSequence} = T(UInt[], UInt(0)) +Base.empty(::Type{T}) where {T <: LongSequence} = T(Memory{UInt64}(), UInt(0)) (::Type{T})() where {T <: LongSequence} = empty(T) # Constructors from other sequences diff --git a/src/longsequences/copying.jl b/src/longsequences/copying.jl index b98e0360..45e6ebf0 100644 --- a/src/longsequences/copying.jl +++ b/src/longsequences/copying.jl @@ -34,8 +34,12 @@ function Base.copy!(dst::SeqOrView{<:NucleicAcidAlphabet{N}}, end function _copy!(dst::LongSequence, src::LongSequence) - resize!(dst.data, length(src.data)) - copyto!(dst.data, src.data) + src_data_len = seq_data_len(src) + if length(dst.data) ≥ src_data_len + unsafe_copyto!(dst.data, 1, src.data, 1, src_data_len) + else + dst.data = copy(src.data) + end dst.len = src.len return dst end @@ -48,10 +52,11 @@ function _copy!(dst::SeqOrView{A}, src::SeqOrView) where {A <: Alphabet} end if dst.data === src.data longseq = LongSequence{A}(src) - src_ = LongSubSeq{A}(longseq.data, 1:length(longseq)) + src_ = src isa LongSequence ? longseq : LongSubSeq{A}(longseq.data, 1:length(longseq)) else src_ = src end + typeof(src) == typeof(src_) || error() # unreachable return copyto!(dst, 1, src_, 1, length(src)) end diff --git a/src/longsequences/longsequence.jl b/src/longsequences/longsequence.jl index e18f997b..1909cf1a 100644 --- a/src/longsequences/longsequence.jl +++ b/src/longsequences/longsequence.jl @@ -84,10 +84,10 @@ The same applies with `LongSequence{RNAAlphabet{4}}`, simply replace the alphabe parameter with `RNAAlphabet{2}` in order to benefit. """ mutable struct LongSequence{A <: Alphabet} <: BioSequence{A} - const data::Vector{UInt64} # encoded character sequence data + data::Memory{UInt64} # encoded character sequence data len::UInt - function LongSequence{A}(data::Vector{UInt64}, len::UInt) where {A <: Alphabet} + function LongSequence{A}(data::Memory{UInt64}, len::UInt) where {A <: Alphabet} new{A}(data, len) end end diff --git a/src/longsequences/randseq.jl b/src/longsequences/randseq.jl index 8938508b..388181d7 100644 --- a/src/longsequences/randseq.jl +++ b/src/longsequences/randseq.jl @@ -165,8 +165,9 @@ end # the non-ambiguous ones function Random.rand!(rng::AbstractRNG, seq::LongSequence{<:NucleicAcidAlphabet{4}}) data = seq.data - rand!(rng, data) - @inbounds for i in eachindex(data) + len = seq_data_len(seq) + rand!(rng, view(data, 1:len)) + @inbounds for i in 1:len nuc = 0x1111111111111111 mask = data[i] nuc = ((nuc & mask) << 1) | (nuc & ~mask) diff --git a/src/longsequences/seqview.jl b/src/longsequences/seqview.jl index 6f17dc31..b182b8ec 100644 --- a/src/longsequences/seqview.jl +++ b/src/longsequences/seqview.jl @@ -20,11 +20,11 @@ AG ``` """ struct LongSubSeq{A<:Alphabet} <: BioSequence{A} - data::Vector{UInt64} + data::Memory{UInt64} part::UnitRange{Int} # Added to reduce method ambiguities - LongSubSeq{A}(data::Vector{UInt64}, part::UnitRange{Int}) where A = new{A}(data, part) + LongSubSeq{A}(data::Memory{UInt64}, part::UnitRange{Int}) where A = new{A}(data, part) end # These unions are significant because LongSubSeq and LongSequence have the same diff --git a/src/longsequences/transformations.jl b/src/longsequences/transformations.jl index f1e78178..ba16235e 100644 --- a/src/longsequences/transformations.jl +++ b/src/longsequences/transformations.jl @@ -2,6 +2,15 @@ ### LongSequence specific specializations of src/biosequence/transformations.jl ### +@noinline function resize_memory!(seq::LongSequence, n_chunks::UInt) + oldmem = seq.data + newmem = Memory{UInt64}(undef, n_chunks % Int) + unsafe_copyto!(newmem, 1, oldmem, 1, min(seq_data_len(seq), n_chunks)) + seq.data = newmem + seq +end + +# TODO for new breaking version: Do not allow this API, since we can have invalid symbols in encoding? """ resize!(seq, size, [force::Bool]) @@ -9,15 +18,13 @@ Resize a biological sequence `seq`, to a given `size`. Does not resize the under array unless the new size does not fit. If `force`, always resize underlying data array. """ function Base.resize!(seq::LongSequence{A}, size::Integer, force::Bool=false) where {A} - if size < 0 - throw(ArgumentError("size must be non-negative")) - else - if force | (seq_data_len(A, size) > seq_data_len(A, length(seq))) - resize!(seq.data, seq_data_len(A, size)) - end - seq.len = size - return seq + size < 0 && throw(ArgumentError("size must be non-negative")) + usize = UInt(size)::UInt + if force || (seq_data_len(A, usize) > seq_data_len(A, length(seq) % UInt)) + @noinline resize_memory!(seq, seq_data_len(A, usize) % UInt) end + seq.len = size + return seq end """ @@ -92,7 +99,7 @@ end # Reverse chunks in data vector and each symbol within a chunk. Chunks may have nonzero # offset after use, so use zero_offset! -@inline function reverse_data!(pred, data::Vector{UInt64}, len::UInt, B::BT) where { +@inline function reverse_data!(pred, data::Memory{UInt64}, len::UInt, B::BT) where { BT <: Union{BitsPerSymbol{2}, BitsPerSymbol{4}, BitsPerSymbol{8}}} @inbounds @simd ivdep for i in 1:len >>> 1 data[i], data[len-i+1] = pred(reversebits(data[len-i+1], B)), pred(reversebits(data[i], B)) @@ -102,7 +109,7 @@ end end end -@inline function reverse_data_copy!(pred, dst::Vector{UInt64}, src::Vector{UInt64}, len::UInt, +@inline function reverse_data_copy!(pred, dst::Memory{UInt64}, src::Memory{UInt64}, len::UInt, B::BT) where {BT <: Union{BitsPerSymbol{2}, BitsPerSymbol{4}, BitsPerSymbol{8}}} @inbounds @simd for i in eachindex(dst) dst[i] = pred(reversebits(src[len - i + 1], B)) @@ -116,7 +123,7 @@ Make a complement sequence of `seq` in place. """ function complement!(seq::LongSequence{A}) where {A<:NucleicAcidAlphabet} seqdata = seq.data - @inbounds for i in eachindex(seqdata) + @inbounds for i in 1:seq_data_len(seq) seqdata[i] = complement_bitpar(seqdata[i], Alphabet(seq)) end return seq