diff --git a/Project.toml b/Project.toml index 42d23282..22a1e74f 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "BioSequences" uuid = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59" authors = ["Sabrina Jaye Ward ", "Jakob Nissen "] -version = "3.2.0" +version = "3.3.0" [deps] BioSymbols = "3c28c6f8-a34d-59c4-9654-267d177fcfa9" diff --git a/docs/src/counting.md b/docs/src/counting.md index 34c9bff2..429188c1 100644 --- a/docs/src/counting.md +++ b/docs/src/counting.md @@ -6,66 +6,73 @@ end ``` # Counting +`BioSequences` contain functionality to efficiently count biosymbols in a biosequence +that satisfies some predicate. + +Consider a naive counting function like this: +```julia +function count_Ns(seq::BioSequence{<:DNAAlphabet}) + ns = 0 + for i in seq + ns += (i == DNA_N)::Bool + end + ns +end +``` -BioSequences extends the `Base.count` method to provide some useful utilities for -counting the number of sites in biological sequences. - -Most generically you can count the number of sites that satisfy some condition -i.e. cause some function to return `true`: +This function can be more efficient implemented by exploiting the internal +data encoding of certain biosequences. +Therefore, Julia provides optimised methods for `Base.count`, such that `count_Ns` +above can be more efficiently expressed `count(==(DNA_N), seq)`. -```jldoctest -julia> count(isambiguous, dna"ATCGM") -1 +!!! note + It is important to understand that this speed is achieved with custom methods of + `Base.count`, and not by a generic mechanism that improves the speed of counting + symbols in `BioSequence `in general. + Hence, while `count(==(DNA_N), seq)` may be optimised, + `count(i -> i == DNA_N, seq)` is not, as this is a different method. -``` +## Currently optimised methods +By default, only `LongSequence` and `LongSubSeq` have optimised methods. Downstream +implementers of new `BioSequence`s may not be optimised. -You can also use two sequences, for example to compute the number of matching -or mismatching symbols: +* `count(isGC, seq)` +* `count(isambiguous, seq)` +* `count(iscertain, seq)` +* `count(isgap, seq)` +* `count(==(biosymbol), seq)` and `count(isequal(biosymbol), seq)` -```jldoctest -julia> count(!=, dna"ATCGM", dna"GCCGM") -2 +## Matches and mismatches +The methods `matches` and `mismatches` take two +sequences and count the number of positions where the sequences are unequal or equal, respectively. -julia> count(==, dna"ATCGM", dna"GCCGM") -3 +They are equivalent to `matches(a, b) = count(splat(==), zip(a, b))` +(and with `!=`, respectively). +```@docs +matches +mismatches ``` -## Alias functions - -A number of functions which are aliases for various invocations of `Base.count` -are provided. - -| Alias function | Base.count call(s) | -| :------------- | :---------------------------------------------------------- | -| `n_ambiguous` | `count(isambiguous, seq)`, `count(isambiguous, seqa, seqb)` | -| `n_certain` | `count(iscertain, seq)`, `count(iscertain, seqa, seqb)` | -| `n_gap` | `count(isgap, seq)`, `count(isgap, seqa, seqb)` | -| `matches` | `count(==, seqa, seqb)` | -| `mismatches` | `count(!=, seqa, seqb)` | +## GC content +The convenience function `gc_content(seq)` is equivalent to `count(isGC, seq) / length(seq)`: -## Bit-parallel optimisations +```@docs +gc_content +``` -For the vast majority of `Base.count(f, seq)` and `Base.count(f, seqa, seqb)` -methods, a naive counting is done: the internal `count_naive` function is called, -which simply loops over each position, applies `f`, and accumulates the result. +## Deprecated aliases +Several of the optimised `count` methods have function names, which are deprecated: -However, for some functions, it is possible to implement highly efficient methods -that use bit-parallelism to check many elements at one time. -This is made possible by the succinct encoding of BioSequences. -Usually `f` is one of the functions provided by BioSymbols.jl or by BioSequences.jl +| Deprecated function | Instead use | +| :------------------- | :------------------------ | +| `n_gaps` | `count(isgap, seq)` | +| `n_certain` | `count(iscertain, seq)` | +| `n_ambiguous` | `count(isambiguous, seq)` | -For such sequence and function combinations, `Base.count(f, seq)` is overloaded -to call an internal `BioSequences.count_*_bitpar` function, which is passed the -sequence(s). If you want to force BioSequences to use naive counting for the -purposes of testing or debugging for example, then you can call -`BioSequences.count_naive` directly. ```@docs -gc_content n_gaps n_certain n_ambiguous -matches -mismatches -``` \ No newline at end of file +``` diff --git a/src/BioSequences.jl b/src/BioSequences.jl index 111d33b1..53c4ba70 100644 --- a/src/BioSequences.jl +++ b/src/BioSequences.jl @@ -205,6 +205,7 @@ using Random include("alphabet.jl") # Load the bit-twiddling internals that optimised BioSequences methods depend on. +include("bit-manipulation/bitindex.jl") include("bit-manipulation/bit-manipulation.jl") # The generic, abstract BioSequence type @@ -216,6 +217,8 @@ include("longsequences/longsequence.jl") include("longsequences/hash.jl") include("longsequences/randseq.jl") +include("counting.jl") + include("geneticcode.jl") diff --git a/src/biosequence/biosequence.jl b/src/biosequence/biosequence.jl index f71ea95f..93fd4963 100644 --- a/src/biosequence/biosequence.jl +++ b/src/biosequence/biosequence.jl @@ -235,5 +235,4 @@ include("predicates.jl") include("find.jl") include("printing.jl") include("transformations.jl") -include("counting.jl") include("copying.jl") diff --git a/src/biosequence/counting.jl b/src/biosequence/counting.jl index ab28b88e..e69de29b 100644 --- a/src/biosequence/counting.jl +++ b/src/biosequence/counting.jl @@ -1,164 +0,0 @@ -### -### Counting -### -### Counting operations on biological sequence types. -### -### This file is a part of BioJulia. -### License is MIT: https://github.com/BioJulia/BioSequences.jl/blob/master/LICENSE.md - -### -### Naive counting -### - -function count_naive(pred, seq::BioSequence) - n = 0 - @inbounds for i in eachindex(seq) - n += pred(seq[i])::Bool - end - return n -end - -function count_naive(pred, seqa::BioSequence, seqb::BioSequence) - n = 0 - @inbounds for i in 1:min(length(seqa), length(seqb)) - n += pred(seqa[i], seqb[i])::Bool - end - return n -end - -""" -Count how many positions in a sequence satisfy a condition (i.e. f(seq[i]) -> true). - -The first argument should be a function which accepts an element of the sequence -as its first parameter, additional arguments may be passed with `args...`. -""" -Base.count(pred, seq::BioSequence) = count_naive(pred, seq) -Base.count(pred, seqa::BioSequence, seqb::BioSequence) = count_naive(pred, seqa, seqb) - -# These functions are BioSequences-specific because they take two arguments -isambiguous_or(x::T, y::T) where {T<:NucleicAcid} = isambiguous(x) | isambiguous(y) -isgap_or(x::T, y::T) where {T<:NucleicAcid} = isgap(x) | isgap(y) -iscertain_and(x::T, y::T) where {T<:NucleicAcid} = iscertain(x) & iscertain(y) - -#BioSymbols.isambiguous(x::T, y::T) where {T<:NucleicAcid} = isambiguous(x) | isambiguous(y) -#BioSymbols.isgap(x::T, y::T) where {T<:NucleicAcid} = isgap(x) | isgap(y) -#BioSymbols.iscertain(x::T, y::T) where {T<:NucleicAcid} = iscertain(x) & iscertain(y) - -Base.count(::typeof(isambiguous), seqa::S, seqb::S) where {S<:BioSequence{<:NucleicAcidAlphabet{2}}} = 0 -Base.count(::typeof(isgap), seqa::S, seqb::S) where {S<:BioSequence{<:NucleicAcidAlphabet{2}}} = 0 -Base.count(::typeof(iscertain), seqa::S, seqb::S) where {S<:BioSequence{<:NucleicAcidAlphabet{2}}} = min(length(seqa), length(seqb)) - -### -### Aliases for various uses of `count`. -### - -""" - gc_content(seq::BioSequence) -> Float64 - -Calculate GC content of `seq`, i.e. the number of symbols that is `DNA_C`, `DNA_G`, -`DNA_C` or `DNA_G` divided by the length of the sequence. - -# Examples -```jldoctest -julia> gc_content(dna"AGCTA") -0.4 - -julia> gc_content(rna"UAGCGA") -0.5 -``` -""" -gc_content(seq::NucleotideSeq) = isempty(seq) ? 0.0 : count(isGC, seq) / length(seq) - -""" - n_ambiguous(a::BioSequence, [b::BioSequence]) -> Int - -Count the number of positions where `a` (or `b`, if present) have ambigious symbols. -If `b` is given, and the length of `a` and `b` differ, look only at the indices -of the shorter sequence. - -# Examples -```jldoctest -julia> n_ambiguous(dna"--TAC-WN-ACY") -3 - -julia> n_ambiguous(rna"UAYWW", rna"UAW") -1 -``` -""" -n_ambiguous(seq::BioSequence) = count(isambiguous, seq) -n_ambiguous(seqa::BioSequence, seqb::BioSequence) = count(isambiguous_or, seqa, seqb) - -""" - n_certain(a::BioSequence, [b::BioSequence]) -> Int - -Count the number of positions where `a` (and `b`, if present) have certain (i.e. non-ambigous -and non-gap) symbols. -If `b` is given, and the length of `a` and `b` differ, look only at the indices -of the shorter sequence. - -# Examples -```jldoctest -julia> n_certain(dna"--TAC-WN-ACY") -5 - -julia> n_certain(rna"UAYWW", rna"UAW") -2 -``` -""" -n_certain(seq::BioSequence) = count(iscertain, seq) -n_certain(seqa::BioSequence, seqb::BioSequence) = count(iscertain_and, seqa, seqb) - -""" - n_gaps(a::BioSequence, [b::BioSequence]) -> Int - -Count the number of positions where `a` (or `b`, if present) have gaps. -If `b` is given, and the length of `a` and `b` differ, look only at the indices -of the shorter sequence. - -# Examples -```jldoctest -julia> n_gaps(dna"--TAC-WN-ACY") -4 - -julia> n_gaps(dna"TC-AC-", dna"-CACG") -2 -``` -""" -n_gaps(seq::BioSequence) = count(isgap, seq) -n_gaps(seqa::BioSequence, seqb::BioSequence) = count(isgap_or, seqa, seqb) - -""" - mismatches(a::BioSequence, b::BioSequences) -> Int - -Count the number of positions in where `a` and `b` differ. -If `b` is given, and the length of `a` and `b` differ, look only at the indices -of the shorter sequence. - -# Examples -```jldoctest -julia> mismatches(dna"TAGCTA", dna"TACCTA") -1 - -julia> mismatches(dna"AACA", dna"AAG") -1 -``` -""" -mismatches(seqa::BioSequence, seqb::BioSequence) = count(!=, seqa, seqb) - -""" - mismatches(a::BioSequence, b::BioSequences) -> Int - -Count the number of positions in where `a` and `b` are equal. -If `b` is given, and the length of `a` and `b` differ, look only at the indices -of the shorter sequence. - -# Examples -```jldoctest -julia> matches(dna"TAGCTA", dna"TACCTA") -5 - -julia> matches(dna"AACA", dna"AAG") -2 -``` -""" -matches(seqa::BioSequence, seqb::BioSequence) = count(==, seqa, seqb) \ No newline at end of file diff --git a/src/bit-manipulation/bit-manipulation.jl b/src/bit-manipulation/bit-manipulation.jl index cb447287..12a24009 100644 --- a/src/bit-manipulation/bit-manipulation.jl +++ b/src/bit-manipulation/bit-manipulation.jl @@ -1,7 +1,3 @@ - -include("bitindex.jl") -include("bitpar-compiler.jl") - @inline function reversebits(x::T, ::BitsPerSymbol{2}) where T <: Base.BitUnsigned mask = 0x33333333333333333333333333333333 % T x = ((x >> 2) & mask) | ((x & mask) << 2) @@ -67,6 +63,10 @@ end return count_nonzero_nibbles(enumerate_nibbles(x) & 0xEEEEEEEEEEEEEEEE) end +@inline function ambiguous_bitcount(x::UInt64, ::AminoAcidAlphabet) + return count_compared_bytes(i -> (i > 0x15) & (i < 0x1a), x) +end + @inline function ambiguous_bitcount(a::UInt64, b::UInt64, ::T) where {T<:NucleicAcidAlphabet{4}} return count_nonzero_nibbles((enumerate_nibbles(a) | enumerate_nibbles(b)) & 0xEEEEEEEEEEEEEEEE) end @@ -80,6 +80,11 @@ end return count_0000_nibbles(a) + count_0000_nibbles(b) - count_0000_nibbles(a | b) end +@inline function count_compared_bytes(f::F, x::UInt64) where F + z = @inline reinterpret(NTuple{8, VecElement{UInt8}}, x) + @inline sum(map(i -> f(i.value), z)) +end + @inline function certain_bitcount(x::UInt64, ::T) where {T<:NucleicAcidAlphabet{4}} x = enumerate_nibbles(x) x = x ⊻ 0x1111111111111111 diff --git a/src/bit-manipulation/bitpar-compiler.jl b/src/bit-manipulation/bitpar-compiler.jl deleted file mode 100644 index ca078e09..00000000 --- a/src/bit-manipulation/bitpar-compiler.jl +++ /dev/null @@ -1,249 +0,0 @@ -### -### Bitparallel -### - -""" -A generator for efficient bitwise sequence operations. -""" -function compile_bitpar(funcname::Symbol; - arguments::Tuple = (), - init_code::Expr = :(), - head_code::Expr = :(), - body_code::Expr = :(), - tail_code::Expr = :(), - return_code::Expr = :()) - functioncode = :(function $(funcname)() end) - for arg in arguments - push!(functioncode.args[1].args, arg) - end - functioncode.args[2] = quote - $(init_code) - ind = bitindex(seq, 1) - stop = bitindex(seq, lastindex(seq) + 1) - data = seq.data - @inbounds begin - if !iszero(offset(ind)) & (ind < stop) - # align the bit index to the beginning of a block boundary - o = offset(ind) - mask = bitmask(stop - ind) - n_bits_masked = ifelse(index(stop) == index(ind), count_zeros(mask), o) - chunk = (data[index(ind)] >> o) & mask - $(head_code) - ind += 64 - o - end - - lastind = index(stop - bits_per_symbol(Alphabet(seq))) - lastind -= !iszero(offset(stop)) - for i in index(ind):lastind - chunk = data[i] - $(body_code) - ind += 64 - end - - if ind < stop - n_bits_masked = 64 - offset(stop) - chunk = data[index(ind)] & bitmask(64 - n_bits_masked) - $(tail_code) - end - end - $(return_code) - end - return functioncode -end - -function get_arg_name(arg) - if typeof(arg) === Expr - if arg.head === :(::) - return first(arg.args) - end - end - return arg -end - -function check_arguments(args::Tuple) - # Check all arguments are symbols or expressions - for arg in args - if !(isa(arg, Symbol) || isa(arg, Expr)) - error("Argument ", arg, " is not an expression or symbol") - end - end - - a1 = first(args) - if isa(a1, Symbol) - @assert a1 == :seqa - elseif isa(a1, Expr) - @assert a1.head === :(::) - @assert first(a1.args) === :seqa - end - - a2 = args[2] - if isa(a2, Symbol) - @assert a2 == :seqb - elseif isa(a2, Expr) - @assert a2.head === :(::) - @assert first(a2.args) === :seqb - end -end - -function add_arguments!(exp, args) - check_arguments(args) - @assert exp.head === :function - if exp.args[1].head === :call - for arg in args - push!(exp.args[1].args, arg) - end - elseif exp.args[1].head === :where - @assert exp.args[1].args[1].head === :call - for arg in args - push!(exp.args[1].args[1].args, arg) - end - else - error("Expression in function is not a :call or :where!") - end -end - -function compile_2seq_bitpar(funcname::Symbol; - arguments::Tuple = (), - parameters::Tuple = (), - init_code::Expr = :(), - head_code::Expr = :(), - body_code::Expr = :(), - tail_code::Expr = :(), - return_code::Expr = :()) - - # TODO: Check the parameters provided. - - if isempty(parameters) - functioncode = :(function $(funcname)() end) - else - functioncode = :(function $(funcname)() where {$(parameters...)} end) - end - - add_arguments!(functioncode, arguments) - - argument_names = map(get_arg_name, arguments) - argument_names = tuple(argument_names[2], argument_names[1], argument_names[3:end]...) - - functioncode.args[2] = quote - if length(seqa) > length(seqb) - return $funcname($(argument_names...)) - end - @assert length(seqa) ≤ length(seqb) - - nexta = bitindex(seqa, 1) - stopa = bitindex(seqa, (lastindex(seqa) + 1) % UInt) - nextb = bitindex(seqb, 1) - stopb = bitindex(seqb, (lastindex(seqb) + 1) % UInt) - adata = seqa.data - bdata = seqb.data - - $(init_code) - - # The first thing we need to sort out is to correctly align the head of - # sequence / subsequence `a`s data is aligned such that the offset of - # `nexta` is essentially reduced to 0. - # With sequence / subsequence `a` aligned, from there, we only need to - # worry about the alignment of sequence / subsequence `b` with respect - # to `a`. - if nexta < stopa && offset(nexta) != 0 - # Here we shift the first data chunks to the right so as the first - # nucleotide of the seq/subseq is the first nibble / pair of bits. - x = @inbounds adata[index(nexta)] >> offset(nexta) - y = @inbounds bdata[index(nextb)] >> offset(nextb) - # Here it was assumed that there is something to go and get from - # the next chunk of `b`, yet that may not be true. - # We know that if this is not true of `b`, then it is certainly not - # true of `a`. - # We check if the end of the sequence is not contained in the same - # integer like so: `64 - offset(nextb) < stopb - nextb`. - # - # This edge case was found and accounted for by Ben J. Ward @BenJWard. - # Ask this maintainer for more information. - if offset(nextb) > offset(nexta) && 64 - offset(nextb) < stopb - nextb - y |= @inbounds bdata[index(nextb) + 1] << (64 - offset(nextb)) - end - # Here we need to check something, we need to check if the - # integer of `a` we are currently aligning contains the end of - # seq/subseq `a`. Because if so it's something we need to take into - # account of when we mask x and y. - # - # In other words if `64 - offset(nexta) > stopa - nexta, we know - # seq or subseq a's data ends before the end of this data chunk, - # and so the mask used needs to be defined to account for this: - # `mask(stopa - nexta)`, otherwise the mask simply needs to be - # `mask(64 - offset(nexta))`. - # - # This edge case was found and accounted for by Ben Ward @Ward9250. - # Ask this maintainer for more information. - offs = ifelse(64 - offset(nexta) > stopa - nexta, stopa - nexta, 64 - offset(nexta)) - m = bitmask(offs) - - x &= m - y &= m - - $(head_code) - - # Here we move our current position markers by k, meaning they move - # to either, A). The next integer, or B). The end of the sequence if - # it is in the current integer. - nexta += offs - nextb += offs - end - - if offset(nextb) == 0 # data are aligned with each other - while stopa - nexta ≥ 64 # Iterate through body of data - x = @inbounds adata[index(nexta)] - y = @inbounds bdata[index(nextb)] - - $(body_code) - - nexta += 64 - nextb += 64 - end - - if nexta < stopa - x = @inbounds adata[index(nexta)] - y = @inbounds bdata[index(nextb)] - - offs = stopa - nexta - m = bitmask(offs) - x &= m - y &= m - - $(tail_code) - end - elseif nexta < stopa # Data are unaligned - y = bdata[index(nextb)] - nextb += 64 - # Note that here, updating `nextb` by 64, increases the chunk index, - # but the `offset(nextb)` will remain the same. - while stopa - nexta ≥ 64 # processing body of data - x = @inbounds adata[index(nexta)] - z = @inbounds bdata[index(nextb)] - y = y >> offset(nextb) | z << (64 - offset(nextb)) - - $(body_code) - - y = z - nexta += 64 - nextb += 64 - end - - if nexta < stopa # processing tail of data - x = @inbounds adata[index(nexta)] - y = y >> offset(nextb) - if 64 - offset(nextb) < stopa - nexta - y |= bdata[index(nextb)] << (64 - offset(nextb)) - end - offs = stopa - nexta - m = bitmask(offs) - x &= m - y &= m - - $(tail_code) - end - end - $(return_code) - end - return functioncode -end diff --git a/src/counting.jl b/src/counting.jl new file mode 100644 index 00000000..efd3fb18 --- /dev/null +++ b/src/counting.jl @@ -0,0 +1,434 @@ +# TODO: SubSeqChunks may be unnecessary for counting, since we don't care about +# the order of the chunks. +# Instead, we could emit the head and tail chunk seperately, then an iterator of +# all the full chunks loaded as single elements from the underlying vector. + +const KNOWN_ALPHABETS = Union{DNAAlphabet, RNAAlphabet, AminoAcidAlphabet} + +trunc_seq(x::LongSequence, len::Int) = typeof(x)(x.data, len % UInt) +trunc_seq(x::LongSubSeq, len::Int) = typeof(x)(x.data, first(x.part):first(x.part)+len-1) + +@inline function counter_2seq(tail::Function, body::Function, a::SeqOrView, b::SeqOrView) + # TODO: Remove these first lines at the API change + minlength = min(length(a), length(b)) + (a, b) = (trunc_seq(a, minlength), trunc_seq(b, minlength)) + (it, (ch1, ch2, rm)) = @inline iter_chunks(a, b) + y = @inline tail(ch1, ch2, rm) + for (a, b) in it + y += @inline body(a, b) + end + y +end + +@inline function counter_1seq(tail::Function, body::Function, seq::SeqOrView) + (it, (chunk, rm)) = @inline iter_chunks(seq) + y = @inline tail(chunk, rm) + for i in it + y += @inline body(i) + end + y +end + +""" + gc_content(seq::BioSequence) -> Float64 + +Calculate GC content of `seq`, i.e. the number of symbols that is `DNA_C`, `DNA_G`, +`DNA_C` or `DNA_G` divided by the length of the sequence. + +# Examples +```jldoctest +julia> gc_content(dna"AGCTA") +0.4 + +julia> gc_content(rna"UAGCGA") +0.5 +``` +""" +gc_content(seq::NucleotideSeq) = _n_gc(seq) / length(seq) +Base.count(::typeof(isGC), seq::NucleotideSeq) = _n_gc(seq) + +# Aliases +""" + matches(a::BioSequence, b::BioSequences) -> Int + +Count the number of positions in where `a` and `b` are equal. +If `b` is given, and the length of `a` and `b` differ, look only at the indices +of the shorter sequence. +This function does not provide any special handling of ambiguous symbols, +so e.g. `DNA_A` does not match `DNA_N`. + +!!! warning + Passing in two sequences with differing lengths is deprecated. In a future, + breaking release of BioSequences, this will error. + +# Examples +```jldoctest +julia> matches(dna"TAWNNA", dna"TACCTA") +3 + +julia> matches(dna"AACA", dna"AAG") +2 +``` +""" +function matches end + +@deprecate( + Base.count(::typeof(!=), a::BioSequence, b::BioSequence), + count(splat(!=), zip(a, b)), + false +) + +""" + mismatches(a::BioSequence, b::BioSequences) -> Int + +Count the number of positions in where `a` and `b` differ. +If `b` is given, and the length of `a` and `b` differ, look only at the indices +of the shorter sequence. +This function does not provide any special handling of ambiguous symbols, +so e.g. `DNA_A` does not match `DNA_N`. + +!!! warning + Passing in two sequences with differing lengths is deprecated. In a future, + breaking release of BioSequences, this will error. + +# Examples +```jldoctest +julia> mismatches(dna"TAGCTA", dna"TACNTA") +2 + +julia> mismatches(dna"AACA", dna"AAG") +1 +``` +""" +function mismatches end + +@deprecate( + Base.count(::typeof(==), a::BioSequence, b::BioSequence), + count(splat(==), zip(a, b)), + false +) + +""" + n_gaps(a::BioSequence, [b::BioSequence]) -> Int + +Count the number of positions where `a` (or `b`, if present) have gaps. +If `b` is given, and the length of `a` and `b` differ, look only at the indices +of the shorter sequence. + +!!! warning + Passing in two sequences is deprecated. In a future, breaking release of + BioSequences, this will throw a `MethodError` + +# Examples +```jldoctest +julia> n_gaps(dna"--TAC-WN-ACY") +4 + +julia> n_gaps(dna"TC-AC-", dna"-CACG") +2 +``` +""" +function n_gaps end + +Base.count(::typeof(isgap), seq::BioSequence) = _n_gaps(seq) + +@deprecate( + Base.count(::typeof(isgap), a::BioSequence, b::BioSequence), + count(((i,j),) -> isgap(i) | isgap(j), zip(a, b)), + false +) + +""" + n_ambiguous(a::BioSequence, [b::BioSequence]) -> Int + +Count the number of positions where `a` (or `b`, if present) have ambigious symbols. +If `b` is given, and the length of `a` and `b` differ, look only at the indices +of the shorter sequence. +Gaps are not ambigous. + +!!! warning + Passing in two sequences is deprecated. In a future, breaking release of + BioSequences, this will throw a `MethodError` + + +# Examples +```jldoctest +julia> n_ambiguous(dna"--TAC-WN-ACY") +3 + +julia> n_ambiguous(rna"UAYWW", rna"UAW") +1 +``` +""" +function n_ambiguous end + +Base.count(::typeof(isambiguous), seq::BioSequence) = _n_ambiguous(seq) + +@deprecate( + Base.count(::typeof(isambiguous), a::BioSequence, b::BioSequence), + count(((i,j),) -> isambiguous(i) | isambiguous(j), zip(a, b)), + false +) + +""" + n_certain(a::BioSequence, [b::BioSequence]) -> Int + +Count the number of positions where `a` (and `b`, if present) have certain (i.e. non-ambigous +and non-gap) symbols. +If `b` is given, and the length of `a` and `b` differ, look only at the indices +of the shorter sequence. +Gaps are not certain. + +!!! warning + Passing in two sequences is deprecated. In a future, breaking release of + BioSequences, this will throw a `MethodError` + + +# Examples +```jldoctest +julia> n_certain(dna"--TAC-WN-ACY") +5 + +julia> n_certain(rna"UAYWW", rna"UAW") +2 +``` +""" +function n_certain end + +Base.count(::typeof(iscertain), seq::BioSequence) = _n_certain(seq) + +@deprecate( + Base.count(::typeof(iscertain), a::BioSequence, b::BioSequence), + count(((i,j),) -> iscertain(i) & iscertain(j), zip(a, b)), + false +) + +# Fallback implementation +# (Urgh, we really shouldn't do these kinds of overloads of Base) +function count_naive(pred, seqa::BioSequence, seqb::BioSequence) + n = 0 + for (i, j) in zip(seqa, seqb) + n += pred(i, j)::Bool + end + return n +end + +function count_naive(pred, seq::BioSequence) + n = 0 + for i in seq + n += pred(i)::Bool + end + return n +end + +@deprecate( + Base.count(f::Function, a::BioSequence, b::BioSequence), + count(splat(f), a, b), + false +) + +@noinline function depwarn_lengths() + Base.depwarn( + "Calling `mismatches` with sequences of differing lengths is deprecated, + and will throw an exception in a future release of BioSequences.", + :mismatches + ) +end + +function mismatches_inner(a::BioSequence, b::BioSequence) + length(a) == length(b) || depwarn_lengths() + count_naive(!=, a, b) +end + +function _isdisjoint(a::Alphabet, b::Alphabet) + for i in a, j in b + i == j && return false + end + true +end + +Base.@assume_effects :foldable function Base.isdisjoint(a::KNOWN_ALPHABETS, b::KNOWN_ALPHABETS) + _isdisjoint(a, b) +end + +Base.isdisjoint(a::Alphabet, b::Alphabet) = _isdisjoint(a, b) + +# For the known alphabets (vast majojrity of cases), we can tell statically +# if the alphabets don't overlap, e.g. DNAAlphabet{2} and RNAAlphabet{4}. +# In this case, the result will always be zero. +function mismatches(a::BioSequence{A}, b::BioSequence{B}) where {A, B} + isdisjoint(A(), B()) ? min(length(a), length(b)) : mismatches_inner(a, b) +end + +_n_ambiguous(seq::BioSequence) = count_naive(isambiguous, seq) + +_n_certain(seq::BioSequence) = count_naive(iscertain, seq) + +_n_gc(seq::BioSequence) = count_naive(isGC, seq) + +const FourBit = SeqOrView{<:NucleicAcidAlphabet{4}} +const TwoBit = SeqOrView{<:NucleicAcidAlphabet{2}} + +const TwoBitSeq = BioSequence{<:NucleicAcidAlphabet{2}} +const FourBitSeq = BioSequence{<:NucleicAcidAlphabet{4}} + +# Trivial overloads +_n_certain(s::TwoBitSeq) = length(s) +_n_ambiguous(s::TwoBitSeq) = 0 +_n_gaps(s::TwoBitSeq) = 0 + +function matches(a::BioSequence, b::BioSequence) + min(length(a), length(b)) - mismatches(a, b) +end + +_n_gaps(s::BioSequence) = count_symbol(s, gap(eltype(Alphabet(s)))) + +# Other overloads +function mismatches_inner(a::SeqOrView{A}, b::SeqOrView{A}) where {A <: NucleicAcidAlphabet{2}} + length(a) == length(b) || depwarn_lengths() + tail = (ch1, ch2, rm) -> begin + mask = UInt64(1) << (rm & 63) - 1 + count_nonzero_bitpairs((ch1 & mask) ⊻ (ch2 & mask)) + end + body = (i, j) -> count_nonzero_bitpairs(i ⊻ j) + counter_2seq(tail, body, a, b) +end + +function mismatches_inner(a::SeqOrView{A}, b::SeqOrView{A}) where {A <: NucleicAcidAlphabet{4}} + length(a) == length(b) || depwarn_lengths() + tail = (ch1, ch2, rm) -> begin + mask = UInt64(1) << (rm & 63) - 1 + count_nonzero_nibbles((ch1 & mask) ⊻ (ch2 & mask)) + end + body = (i, j) -> count_nonzero_nibbles(i ⊻ j) + counter_2seq(tail, body, a, b) +end + +function _n_gc(seq::Union{TwoBit, FourBit}) + tail = (chunk, rm) -> begin + mask = UInt64(1) << (rm & 63) - 1 + gc_bitcount(chunk & mask, Alphabet(seq)) + end + body = i -> gc_bitcount(i, Alphabet(seq)) + counter_1seq(tail, body, seq) +end + +function _n_ambiguous(seq::FourBit) + tail = (chunk, rm) -> begin + mask = (UInt64(1) << (rm & 63) - 1) + ambiguous_bitcount(chunk & mask, Alphabet(seq)) + end + body = i -> ambiguous_bitcount(i, Alphabet(seq)) + counter_1seq(tail, body, seq) +end + +function _n_certain(seq::FourBit) + tail = (chunk, rm) -> begin + mask = (UInt64(1) << (rm & 63) - 1) + certain_bitcount(chunk & mask, Alphabet(seq)) + end + body = i -> certain_bitcount(i, Alphabet(seq)) + counter_1seq(tail, body, seq) +end + +function mismatches_inner(a::SeqOrView{AminoAcidAlphabet}, b::SeqOrView{AminoAcidAlphabet}) + length(a) == length(b) || depwarn_lengths() + tail = (ch1, ch2, rm) -> begin + mask = UInt64(1) << (rm & 63) - 1 + count_compared_bytes(!iszero, (ch1 & mask) ⊻ (ch2 & mask)) + end + body = (i, j) -> count_compared_bytes(!iszero, i ⊻ j) + counter_2seq(tail, body, a, b) +end + +function _n_ambiguous(seq::SeqOrView{AminoAcidAlphabet}) + tail = (chunk, rm) -> begin + mask = UInt64(1) << (rm & 63) - 1 + ambiguous_bitcount(chunk & mask, AminoAcidAlphabet()) + end + body = i -> ambiguous_bitcount(i, AminoAcidAlphabet()) + counter_1seq(tail, body, seq) +end + +function _n_certain(seq::SeqOrView{AminoAcidAlphabet}) + tail = (chunk, rm) -> begin + mask = ~(UInt64(1) << (rm & 63) - 1) + count_compared_bytes(<(0x16), chunk | mask) + end + body = i -> count_compared_bytes(<(0x16), i) + counter_1seq(tail, body, seq) +end + +function count_symbol(seq::BioSequence, sym::BioSymbol) + # We encode here in order to throw an exception + encode(Alphabet(seq), sym) + n = 0 + for i in seq + n += (i == sym)::Bool + end + n +end + +function Base.count( + pred::Base.Fix2{<:Union{typeof(==), typeof(isequal)}, <: BioSymbol}, + s::BioSequence +) + count_symbol(s, pred.x) +end + +function count_symbol(seq::FourBit, s::Union{RNA, DNA}) + pattern = encode(Alphabet(seq), s)::UInt64 * 0x1111111111111111 + tail = (chunk, rm) -> begin + mask = UInt64(1) << (rm & 63) - 1 + masked = iszero(pattern) ? chunk | ~mask : mask & chunk + count_0000_nibbles(masked ⊻ pattern) + end + body = i -> count_0000_nibbles(i ⊻ pattern) + counter_1seq(tail, body, seq) +end + +function count_symbol(seq::TwoBit, s::Union{RNA, DNA}) + pattern = encode(Alphabet(seq), s)::UInt64 * 0x5555555555555555 + tail = (chunk, rm) -> begin + mask = UInt64(1) << (rm & 63) - 1 + masked = iszero(pattern) ? chunk | ~mask : mask & chunk + count_00_bitpairs(masked ⊻ pattern) + end + body = i -> count_00_bitpairs(i ⊻ pattern) + counter_1seq(tail, body, seq) +end + +function count_symbol(seq::SeqOrView{AminoAcidAlphabet}, s::AminoAcid) + byte = encode(Alphabet(seq), s) % UInt8 + tail = (chunk, rm) -> begin + mask = UInt64(1) << (rm & 63) - 1 + masked = iszero(byte) ? chunk | ~mask : mask & chunk + count_compared_bytes(==(byte), masked) + end + body = i -> count_compared_bytes(==(byte), i) + counter_1seq(tail, body, seq) +end + +## Deprecate weird two-arg methods +for (sym, op, joiner) in [ + (:n_ambiguous, :isambiguous, :|), + (:n_gaps, :isgap, :|), + (:n_certain, :iscertain, :&), +] + for (A, B) in [ + (:BioSequence, :BioSequence), + (:FourBit, :FourBit), + (:FourBit, :TwoBit), + (:TwoBit, :FourBit), + (:TwoBit, :TwoBit), + ] + @eval(@deprecate( + $(sym)(a::$(A), b::$(B)), + count(((i,j),) -> $(joiner)($(op)(i), $(op)(j)), zip(a, b)), + )) + end + for T in [:BioSequence, TwoBit, FourBit, :(SeqOrView{<:AminoAcidAlphabet})] + @eval(@deprecate($(sym)(a::$(T)), count($(op), a))) + end +end + diff --git a/src/longsequences/chunk_iterator.jl b/src/longsequences/chunk_iterator.jl new file mode 100644 index 00000000..88ae0313 --- /dev/null +++ b/src/longsequences/chunk_iterator.jl @@ -0,0 +1,132 @@ +struct LongSeqChunks + # TODO: Switch to Memory when possible + # That also necessitates a first_index field + v::Vector{UInt64} + last_index::UInt +end + +first_state(::LongSeqChunks) = UInt(1) + +@inline function Base.iterate(it::LongSeqChunks, state::UInt=UInt(1)) + state > it.last_index && return nothing + iter_inbounds(it, state) +end + +@inline function iter_inbounds(it::LongSeqChunks, state::UInt) + (@inbounds(it.v[state]), state + 1) +end + +Base.eltype(::Type{LongSeqChunks}) = UInt64 +Base.length(it::LongSeqChunks) = it.last_index % Int + +function iter_chunks(seq::LongSequence)::Tuple{LongSeqChunks, Tuple{UInt64, UInt8}} + v = seq.data + seqlen = length(seq) + n_bits = (seqlen * bits_per_symbol(Alphabet(seq))) % UInt + remaining_bits = (n_bits % UInt(64)) % UInt8 + lst = index(lastbitindex(seq)) + len = !iszero(seqlen) * (lst - !iszero(remaining_bits)) + start = if iszero(remaining_bits) + (UInt64(0), 0x00) + else + (@inbounds(v[lst]), remaining_bits) + end + ( + LongSeqChunks(v, len % UInt), + start + ) +end + +struct SubSeqChunks + # TODO: Switch to Memory where possible + v::Vector{UInt64} + # First index to load from + first_index::UInt + # Last index to load from (except index + 1, if offset if not zero) + last_index::UInt + offset::UInt8 +end + +@inline function first_state(x::SubSeqChunks) + x.first_index > x.last_index && return (x.last_index + 1, zero(UInt64)) + remaining = if iszero(x.offset) + zero(UInt64) + else + @inbounds(x.v[x.first_index - 1]) >> (x.offset & 0x3f) + end + (x.first_index, remaining) +end + +Base.eltype(::Type{SubSeqChunks}) = UInt64 +Base.length(it::SubSeqChunks) = (it.last_index - it.first_index) % Int + 1 + +@inline function Base.iterate(it::SubSeqChunks, state::Tuple{UInt, UInt64}=first_state(it)) + first(state) > it.last_index && return nothing + iter_inbounds(it, state) +end + +@inline function iter_inbounds(it::SubSeqChunks, state::Tuple{UInt, UInt64}) + (index, remaining_bits) = state + new_bits = @inbounds(it.v[index]) + result = (new_bits << ((0x40 - it.offset) & 0x3f)) | remaining_bits + next_remaining = iszero(it.offset) ? zero(UInt64) : new_bits >> (it.offset & 0x3f) + (result, (index + 1, next_remaining)) +end + + +function iter_chunks(seq::LongSubSeq)::Tuple{SubSeqChunks, Tuple{UInt64, UInt8}} + len = div(length(seq) % UInt, symbols_per_data_element(seq)) + bps = bits_per_symbol(Alphabet(seq)) + fbi = firstbitindex(seq) + off = offset(fbi) % UInt8 + first_index = index(fbi) + !iszero(off) + last_index = first_index + len - 1 + n_bits = (length(seq) * bps) % UInt + remaining_bits = (n_bits % UInt(64)) % UInt8 + remaining_elems = div(remaining_bits, bps % UInt) + v = seq.data + start = if iszero(remaining_bits) + (UInt64(0), 0x00) + else + lbi = lastbitindex(seq) + bits_in_last = offset(lbi) + bps + rsh = offset(bitindex(seq, lastindex(seq) - remaining_elems + 1)) + chunk = if bits_in_last < remaining_bits + chunk = @inbounds(v[index(lbi) - 1]) >> (rsh & 63) + chunk | @inbounds(v[index(lbi)]) << ((64 - rsh) & 63) + else + @inbounds(v[index(lbi)]) >> (rsh & 63) + end + (chunk, remaining_bits) + end + it = SubSeqChunks(v, first_index % UInt, last_index % UInt, off) + (it, start) +end + +struct PairedChunkIterator{A, B} + a::A + b::B +end + +function iter_chunks(a::BioSequence, b::BioSequence) + length(a) == length(b) || throw("Sequences must have same length") + (ita, (cha, rema)) = iter_chunks(a) + (itb, (chb, _)) = iter_chunks(b) + ( + PairedChunkIterator{typeof(ita), typeof(itb)}(ita, itb), + (cha, chb, rema), + ) +end + +Base.length(it::PairedChunkIterator) = length(it.a) +Base.eltype(::Type{<:PairedChunkIterator}) = NTuple{2, UInt64} + +@inline function Base.iterate( + it::PairedChunkIterator, + state=(first_state(it.a), first_state(it.b)) +) + a = iterate(it.a, first(state)) + isnothing(a) && return nothing + b = iter_inbounds(it.b, last(state)) + ((first(a), first(b)), (last(a), last(b))) +end diff --git a/src/longsequences/counting.jl b/src/longsequences/counting.jl deleted file mode 100644 index d784113a..00000000 --- a/src/longsequences/counting.jl +++ /dev/null @@ -1,167 +0,0 @@ -### -### LongSequence specific specializations of src/biosequence/counting.jl -### -### This file is a part of BioJulia. -### License is MIT: https://github.com/BioJulia/BioSequences.jl/blob/master/LICENSE.md - -# Counting GC positions -let - counter = :(n += gc_bitcount(chunk, Alphabet(seq))) - compile_bitpar( - :count_gc_bitpar, - arguments = (:(seq::SeqOrView{<:NucleicAcidAlphabet}),), - init_code = :(n = 0), - head_code = counter, - body_code = counter, - tail_code = counter, - return_code = :(return n % Int) - ) |> eval -end - -Base.count(::typeof(isGC), seq::SeqOrView{<:NucleicAcidAlphabet}) = count_gc_bitpar(seq) - -# Counting mismatches -let - counter = :(count += mismatch_bitcount(x, y, A())) - - compile_2seq_bitpar( - :count_mismatches_bitpar, - arguments = (:(seqa::SeqOrView{A}), :(seqb::SeqOrView{A})), - parameters = (:(A<:NucleicAcidAlphabet),), - init_code = :(count = 0), - head_code = counter, - body_code = counter, - tail_code = counter, - return_code = :(return count % Int) - ) |> eval -end -Base.count(::typeof(!=), seqa::SeqOrView{A}, seqb::SeqOrView{A}) where {A<:NucleicAcidAlphabet} = count_mismatches_bitpar(seqa, seqb) -Base.count(::typeof(!=), seqa::NucleicSeqOrView, seqb::NucleicSeqOrView) = count(!=, promote(seqa, seqb)...) - -# Counting matches -let - counter = :(count += match_bitcount(x, y, A())) - - count_empty = quote - count += match_bitcount(x, y, A()) - nempty = div(64, bits_per_symbol(A())) - div(Int(offs), bits_per_symbol(A())) - count -= nempty - end - - compile_2seq_bitpar( - :count_matches_bitpar, - arguments = (:(seqa::SeqOrView{A}), :(seqb::SeqOrView{A})), - parameters = (:(A<:NucleicAcidAlphabet),), - init_code = :(count = 0), - head_code = count_empty, - body_code = counter, - tail_code = count_empty, - return_code = :(return count % Int) - ) |> eval -end -Base.count(::typeof(==), seqa::SeqOrView{A}, seqb::SeqOrView{A}) where {A<:NucleicAcidAlphabet} = count_matches_bitpar(seqa, seqb) -Base.count(::typeof(==), seqa::NucleicSeqOrView, seqb::NucleicSeqOrView) = count(==, promote(seqa, seqb)...) - -# Counting ambiguous sites -# ------------------------ -let - counter = :(count += ambiguous_bitcount(chunk, Alphabet(seq))) - - compile_bitpar( - :count_ambiguous_bitpar, - arguments = (:(seq::SeqOrView{<:NucleicAcidAlphabet}),), - init_code = :(count = 0), - head_code = counter, - body_code = counter, - tail_code = counter, - return_code = :(return count % Int) - ) |> eval - - counter = :(count += ambiguous_bitcount(x, y, A())) - - compile_2seq_bitpar( - :count_ambiguous_bitpar, - arguments = (:(seqa::SeqOrView{A}), :(seqb::SeqOrView{A})), - parameters = (:(A<:NucleicAcidAlphabet),), - init_code = :(count = 0), - head_code = counter, - body_code = counter, - tail_code = counter, - return_code = :(return count % Int) - ) |> eval -end - - -## For a single sequence. -# You can never have ambiguous bases in a 2-bit encoded nucleotide sequence. -Base.count(::typeof(isambiguous), seq::SeqOrView{<:NucleicAcidAlphabet{2}}) = 0 -Base.count(::typeof(isambiguous), seq::SeqOrView{<:NucleicAcidAlphabet{4}}) = count_ambiguous_bitpar(seq) - -## For a pair of sequences. -# A pair of 2-bit encoded sequences will never have ambiguous bases. -Base.count(::typeof(isambiguous), seqa::SeqOrView{A}, seqb::SeqOrView{A}) where {A<:NucleicAcidAlphabet{2}} = 0 -Base.count(::typeof(isambiguous), seqa::SeqOrView{A}, seqb::SeqOrView{A}) where {A<:NucleicAcidAlphabet{4}} = count_ambiguous_bitpar(seqa, seqb) -Base.count(::typeof(isambiguous), seqa::SeqOrView{<:NucleicAcidAlphabet{4}}, seqb::SeqOrView{<:NucleicAcidAlphabet{2}}) = count(isambiguous_or, promote(seqa, seqb)...) -Base.count(::typeof(isambiguous), seqa::SeqOrView{<:NucleicAcidAlphabet{2}}, seqb::SeqOrView{<:NucleicAcidAlphabet{4}}) = count(isambiguous_or, promote(seqa, seqb)...) - -# Counting certain sites -let - counter = :(count += certain_bitcount(x, y, A())) - - compile_2seq_bitpar( - :count_certain_bitpar, - arguments = (:(seqa::SeqOrView{A}), :(seqb::SeqOrView{A})), - parameters = (:(A<:NucleicAcidAlphabet),), - init_code = :(count = 0), - head_code = counter, - body_code = counter, - tail_code = counter, - return_code = :(return count % Int) - ) |> eval -end -Base.count(::typeof(iscertain), seqa::SeqOrView{A}, seqb::SeqOrView{A}) where {A<:NucleicAcidAlphabet{4}} = count_certain_bitpar(seqa, seqb) -Base.count(::typeof(iscertain), seqa::SeqOrView{<:NucleicAcidAlphabet{4}}, seqb::SeqOrView{<:NucleicAcidAlphabet{2}}) = count(iscertain_and, promote(seqa, seqb)...) -Base.count(::typeof(iscertain), seqa::SeqOrView{<:NucleicAcidAlphabet{2}}, seqb::SeqOrView{<:NucleicAcidAlphabet{4}}) = count(iscertain_and, promote(seqa, seqb)...) - -# Counting gap sites -let - count_empty = quote - Alph = Alphabet(seq) - count += gap_bitcount(chunk, Alph) - count -= div(n_bits_masked, bits_per_symbol(Alph)) - end - counter = :(count += gap_bitcount(chunk, Alphabet(seq))) - - compile_bitpar( - :count_gap_bitpar, - arguments = (:(seq::SeqOrView{<:NucleicAcidAlphabet{4}}),), - init_code = :(count = 0), - head_code = count_empty, - body_code = counter, - tail_code = count_empty, - return_code = :(return count % Int) - ) |> eval - - count_empty = quote - Alph = Alphabet(seqa) - count += gap_bitcount(x, y, Alph) - nempty = div(64, bits_per_symbol(Alph)) - div(offs, bits_per_symbol(Alph)) - count -= nempty - end - counter = :(count += gap_bitcount(x, y, A())) - - compile_2seq_bitpar( - :count_gap_bitpar, - arguments = (:(seqa::SeqOrView{A}), :(seqb::SeqOrView{A})), - parameters = (:(A<:NucleicAcidAlphabet),), - init_code = :(count = 0), - head_code = count_empty, - body_code = counter, - tail_code = count_empty, - return_code = :(return count % Int) - ) |> eval -end -Base.count(::typeof(isgap), seqa::SeqOrView{A}, seqb::SeqOrView{A}) where {A<:NucleicAcidAlphabet{4}} = count_gap_bitpar(seqa, seqb) -Base.count(::typeof(isgap), seqa::SeqOrView{A}) where {A<:NucleicAcidAlphabet{4}} = count_gap_bitpar(seqa) -Base.count(::typeof(isgap), seqa::SeqOrView{<:NucleicAcidAlphabet{4}}, seqb::SeqOrView{<:NucleicAcidAlphabet{2}}) = count(isgap_or, promote(seqa, seqb)...) -Base.count(::typeof(isgap), seqa::SeqOrView{<:NucleicAcidAlphabet{2}}, seqb::SeqOrView{<:NucleicAcidAlphabet{4}}) = count(isgap_or, promote(seqa, seqb)...) diff --git a/src/longsequences/indexing.jl b/src/longsequences/indexing.jl index e4c59a2b..183c559c 100644 --- a/src/longsequences/indexing.jl +++ b/src/longsequences/indexing.jl @@ -10,8 +10,8 @@ bitindex(N, encoded_data_eltype(typeof(x)), i) end -firstbitindex(s::SeqOrView) = bitindex(s, firstindex(s)) -lastbitindex(s::SeqOrView) = bitindex(s, lastindex(s)) +firstbitindex(s::SeqOrView) = bitindex(s, firstindex(s) % UInt) +lastbitindex(s::SeqOrView) = bitindex(s, lastindex(s) % UInt) @inline function extract_encoded_element(x::SeqOrView, i::Integer) bi = bitindex(x, i % UInt) diff --git a/src/longsequences/longsequence.jl b/src/longsequences/longsequence.jl index e18f997b..5b0d9fcc 100644 --- a/src/longsequences/longsequence.jl +++ b/src/longsequences/longsequence.jl @@ -113,6 +113,7 @@ Base.copy(x::LongSequence) = typeof(x)(copy(x.data), x.len) symbols_per_data_element(x::LongSequence) = div(64, bits_per_symbol(Alphabet(x))) include("seqview.jl") +include("chunk_iterator.jl") include("indexing.jl") include("constructors.jl") include("conversion.jl") @@ -120,4 +121,3 @@ include("copying.jl") include("stringliterals.jl") include("transformations.jl") include("operators.jl") -include("counting.jl") diff --git a/src/longsequences/operators.jl b/src/longsequences/operators.jl index e8516b47..f1b5d05e 100644 --- a/src/longsequences/operators.jl +++ b/src/longsequences/operators.jl @@ -198,7 +198,7 @@ end ### Comparisons function Base.:(==)(seq1::SeqOrView{A}, seq2::SeqOrView{A}) where {A <: Alphabet} - length(seq1) == length(seq2) || false + length(seq1) == length(seq2) || return false # If they share the same data if seq1.data === seq2.data && firstbitindex(seq1) == firstbitindex(seq2) diff --git a/test/biosequences/misc.jl b/test/biosequences/misc.jl index 78225e6a..38ba5684 100644 --- a/test/biosequences/misc.jl +++ b/test/biosequences/misc.jl @@ -17,9 +17,9 @@ end @test count(x -> x in (RNA_A, RNA_C), seq) == count(x -> x in "AC", str) @test isapprox(gc_content(seq), count(x -> x in "GC", str) / length(seq)) - @test n_ambiguous(seq) == 0 - @test n_certain(seq) == length(seq) - @test n_gaps(seq) == 0 + @test count(isambiguous, seq) == 0 + @test count(iscertain, seq) == length(seq) + @test count(isgap, seq) == 0 end end diff --git a/test/counting.jl b/test/counting.jl index 9d61cebd..d7dd26f8 100644 --- a/test/counting.jl +++ b/test/counting.jl @@ -1,143 +1,110 @@ -@testset "Counting" begin +# Collection of BioSequence, LongSequence w. differing alphabets, and SubSequence +bio_seqs = BioSequence[ + SimpleSeq("UGAUCUGUAGU"), + SimpleSeq("") +] - @testset "GC content" begin - @test gc_content(dna"") === 0.0 - @test gc_content(dna"AATA") === 0.0 - @test gc_content(dna"ACGT") === 0.5 - @test gc_content(dna"CGGC") === 1.0 - @test gc_content(dna"ACATTGTGTATAACAAAAGG") === 6 / 20 - @test gc_content(dna"GAGGCGTTTATCATC"[2:end]) === 6 / 14 - - @test gc_content(rna"") === 0.0 - @test gc_content(rna"AAUA") === 0.0 - @test gc_content(rna"ACGU") === 0.5 - @test gc_content(rna"CGGC") === 1.0 - @test gc_content(rna"ACAUUGUGUAUAACAAAAGG") === 6 / 20 - @test gc_content(rna"GAGGCGUUUAUCAUC"[2:end]) === 6 / 14 - - @test_throws Exception gc_content(aa"ARN") - - Random.seed!(1234) - for _ in 1:200 - s = randdnaseq(rand(1:100)) - @test gc_content(s) === count(isGC, s) / length(s) - @test gc_content(LongSequence{DNAAlphabet{2}}(s)) === count(isGC, s) / length(s) - i = rand(1:lastindex(s)) - j = rand(i-1:lastindex(s)) - @test gc_content(s[i:j]) === (j < i ? 0.0 : count(isGC, s[i:j]) / (j - i + 1)) - end +long_nucs = [ + dna"", + dna"TAGAGGAC", + dna"WSCGCTYCG", + dna"ATGC-CAACC", + dna"TAGCGA--TCCGAA-TCGAAGC", + dna"TCG-GTWYCYGYA-KKKAKCK--KNAWSSSTAGTCYKMNNACYWS", + rna"", + rna"UAGUGCUGAG", + rna"WSUACGYKMNAC", + rna"UGAUCGUAGUAGUCGUCGUGUC", + rna"UBBSSDMHGVBRHA-YNKYDKAWSDYWC-VDCDUKCY" +] + +strip_type(::Type{<:DNAAlphabet}) = DNAAlphabet +strip_type(::Type{<:RNAAlphabet}) = RNAAlphabet + +for i in 1:lastindex(long_nucs) + sq = long_nucs[i] + if all(iscertain, sq) + push!(long_nucs, LongSequence{strip_type(typeof(Alphabet(sq))){2}}(sq)) end - - function testcounter( - pred::Function, - alias::Function, - seqa::BioSequence, - seqb::BioSequence, - singlearg::Bool, - multi_alias::Function - ) - # Test that order does not matter. - @test count(pred, seqa, seqb) == count(pred, seqb, seqa) - @test BioSequences.count_naive(multi_alias, seqa, seqb) == BioSequences.count_naive(multi_alias, seqb, seqa) - @test alias(seqa, seqb) == alias(seqb, seqa) - # Test that result is the same as counting naively. - @test count(pred, seqa, seqb) == BioSequences.count_naive(multi_alias, seqa, seqb) - @test count(pred, seqb, seqa) == BioSequences.count_naive(multi_alias, seqb, seqa) - # Test that the alias function works. - @test count(pred, seqa, seqb) == alias(seqa, seqb) - @test count(pred, seqb, seqa) == alias(seqb, seqa) - if singlearg - @test count(pred, seqa) == count(pred, (i for i in seqa)) - @test BioSequences.count_naive(pred, seqa) == BioSequences.count(pred, seqa) - end +end + +long_aa = [ + aa"", + aa"ATYCISL", + aa"-PYCOSL", + aa"LDJFDODIVLJK", + aa"OIDHEOU-JNFDEDWQ", + aa"OIDFENU-JDFDWDWQA", + aa"WQKPC--LELIFEJ--", +] + +sub_nucs = [ + view(dna"", 1:0), + view(rna"UAGUC", 1:0), + view(LongDNA{2}("TAGTCGTAGGCTGCGTGATGATGATAGTCGTGTCGTAGTA"), 2:38), + view(dna"TAGTGCTATK-GAMTTWGTWSGTA-GCCCTAS-GAGTCTTA-W", 19:42), + view(dna"TCG-GTWYCYGYA-KKKAKCK--KNAWSSSTAGTCYKMNNACYWS", 4:45) +] + +for seq in long_nucs + for span in [3:19, 7:31, 1:length(seq)] + last(span) > length(seq) && continue + push!(sub_nucs, view(seq, span)) end - - function counter_random_tests( - pred::Function, - alias::Function, - alphx::Type{<:Alphabet}, - alphy::Type{<:Alphabet}, - subset::Bool, - singlearg::Bool, - multi_alias::Function - ) - for _ in 1:10 - seqA = random_seq(alphx, rand(10:100)) - seqB = random_seq(alphy, rand(10:100)) - sa = seqA - sb = seqB - if subset - intA = random_interval(1, length(seqA)) - intB = random_interval(1, length(seqB)) - subA = view(seqA, intA) - subB = view(seqB, intB) - sa = subA - sb = subB - end - testcounter(pred, alias, sa, sb, singlearg, multi_alias) - end +end + +all_nucs = append!(copy(bio_seqs), long_nucs, sub_nucs) +all_seqs = append!(BioSequence[], all_nucs, long_aa) + +@testset failfast=true "Counting" begin + # GC content + for i in all_nucs + @test gc_content(i) === sum(isGC, i; init=0) / length(i) end - - @testset "Mismatches" begin - for a in (DNAAlphabet, RNAAlphabet) - # Can't promote views - for sub in (true, false) - for n in (4, 2) - counter_random_tests(!=, mismatches, a{n}, a{n}, sub, false, !=) - end - end - counter_random_tests(!=, mismatches, a{4}, a{2}, false, false, !=) - counter_random_tests(!=, mismatches, a{2}, a{4}, false, false, !=) + + # Matches / mismatches + for i in 1:length(all_seqs)-1, j in i+1:length(all_seqs) + a = all_seqs[i] + b = all_seqs[j] + + # We do this gymnastics because we want to throw depwarns when seeing + # sequences of differing lengths, and we can't throw depwarns during test. + (a, b) = length(a) < length(b) ? (a, b) : (b, a) + b = if length(b) == length(a) + b + elseif b isa SimpleSeq + # Can't take view of a SimpleSeq + continue + else + view(b, 1:length(a)) end + @test mismatches(a, b) == sum(splat(!=), zip(a, b); init=0) + @test matches(a, b) == sum(splat(==), zip(a, b); init=0) end - - @testset "Matches" begin - for a in (DNAAlphabet, RNAAlphabet) - for sub in (true, false) - for n in (4, 2) - counter_random_tests(==, matches, a{n}, a{n}, sub, false, ==) - end - end - counter_random_tests(==, matches, a{4}, a{2}, false, false, ==) - counter_random_tests(==, matches, a{2}, a{4}, false, false, ==) - end + + # Gaps + for i in all_seqs + @test count(isgap, i) == sum(isgap, i; init=0) end - - @testset "Ambiguous" begin - for a in (DNAAlphabet, RNAAlphabet) - # Can't promote views - for n in (4, 2) - for sub in (true, false) - counter_random_tests(isambiguous, n_ambiguous, a{n}, a{n}, sub, false, BioSequences.isambiguous_or) - end - end - counter_random_tests(isambiguous, n_ambiguous, a{4}, a{2}, false, true, BioSequences.isambiguous_or) - counter_random_tests(isambiguous, n_ambiguous, a{2}, a{4}, false, true, BioSequences.isambiguous_or) - end + + # Ambiguous + for i in all_seqs + @test count(isambiguous, i) == sum(isambiguous, i; init=0) end - - @testset "Certain" begin - for a in (DNAAlphabet, RNAAlphabet) - for n in (4, 2) - for sub in (true, false) - counter_random_tests(iscertain, n_certain, a{n}, a{n}, sub, true, BioSequences.iscertain_and) - end - end - counter_random_tests(iscertain, n_certain, a{4}, a{2}, false, true, BioSequences.iscertain_and) - counter_random_tests(iscertain, n_certain, a{2}, a{4}, false, true, BioSequences.iscertain_and) - end + + # Certain + for i in all_seqs + @test count(iscertain, i) == sum(iscertain, i; init=0) end - - @testset "Gap" begin - for a in (DNAAlphabet, RNAAlphabet) - for n in (4, 2) - for sub in (true, false) - counter_random_tests(isgap, n_gaps, a{n}, a{n}, sub, true, BioSequences.isgap_or) - end - end - counter_random_tests(isgap, n_gaps, a{4}, a{2}, false, true, BioSequences.isgap_or) - counter_random_tests(isgap, n_gaps, a{2}, a{4}, false, true, BioSequences.isgap_or) + + # Count symbol + for seq in all_seqs + for symbol in Alphabet(seq) + @test count(==(symbol), seq) == sum(i == symbol for i in seq; init=0) end end - + @test_throws EncodeError count(==(DNA_A), rna"UAG") + @test_throws EncodeError count(==(DNA_M), LongDNA{2}(dna"TAG")) + @test_throws EncodeError count(==(AA_C), dna"TAG") + @test_throws EncodeError count(==(RNA_U), dna"TAG") end