Document bitpar counting (#313)

Also add some inbounds annotations. These were exported, but had no explicit docstrings. Future work could include cleaning up the bitpar compiler to emit more efficient code, and removing the weird methods.
BioJulia · Oct 18, 2024 · c8e348e · c8e348e · jakobnissen · Oct 18, 2024
1 parent 3fbd383
commit c8e348e
Show file tree

Hide file tree

Showing 4 changed files with 119 additions and 18 deletions.
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "BioSequences"
 uuid = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"
 authors = ["Sabrina Jaye Ward <[email protected]>", "Jakob Nissen <[email protected]>"]
-version = "3.1.6"
+version = "3.2.0"
 
 [deps]
 BioSymbols = "3c28c6f8-a34d-59c4-9654-267d177fcfa9"

diff --git a/docs/src/counting.md b/docs/src/counting.md
@@ -59,4 +59,13 @@ For such sequence and function combinations, `Base.count(f, seq)` is overloaded
 to call an internal `BioSequences.count_*_bitpar` function, which is passed the
 sequence(s). If you want to force BioSequences to use naive counting for the
 purposes of testing or debugging for example, then you can call
-`BioSequences.count_naive` directly.
+`BioSequences.count_naive` directly.
+
+```@docs
+gc_content
+n_gaps
+n_certain
+n_ambiguous
+matches
+mismatches
+```
diff --git a/src/biosequence/counting.jl b/src/biosequence/counting.jl
@@ -53,20 +53,112 @@ Base.count(::typeof(iscertain), seqa::S, seqb::S) where {S<:BioSequence{<:Nuclei
 ###
 
 """
-    gc_content(seq::BioSequence)
+    gc_content(seq::BioSequence) -> Float64
 
-Calculate GC content of `seq`.
+Calculate GC content of `seq`, i.e. the number of symbols that is `DNA_C`, `DNA_G`,
+`DNA_C` or `DNA_G` divided by the length of the sequence.
+
+# Examples
+```jldoctest
+julia> gc_content(dna"AGCTA")
+0.4
+
+julia> gc_content(rna"UAGCGA")
+0.5
+```
 """
 gc_content(seq::NucleotideSeq) = isempty(seq) ? 0.0 : count(isGC, seq) / length(seq)
 
-n_ambiguous(seq) = count(isambiguous, seq)
+"""
+    n_ambiguous(a::BioSequence, [b::BioSequence]) -> Int
+
+Count the number of positions where `a` (or `b`, if present) have ambigious symbols.
+If `b` is given, and the length of `a` and `b` differ, look only at the indices
+of the shorter sequence.
+
+# Examples
+```jldoctest
+julia> n_ambiguous(dna"--TAC-WN-ACY")
+3
+
+julia> n_ambiguous(rna"UAYWW", rna"UAW")
+1
+```
+"""
+n_ambiguous(seq::BioSequence) = count(isambiguous, seq)
 n_ambiguous(seqa::BioSequence, seqb::BioSequence) = count(isambiguous_or, seqa, seqb)
 
-n_certain(seq) = count(iscertain, seq)
+"""
+    n_certain(a::BioSequence, [b::BioSequence]) -> Int
+
+Count the number of positions where `a` (and `b`, if present) have certain (i.e. non-ambigous
+and non-gap) symbols.
+If `b` is given, and the length of `a` and `b` differ, look only at the indices
+of the shorter sequence.
+
+# Examples
+```jldoctest
+julia> n_certain(dna"--TAC-WN-ACY")
+5
+
+julia> n_certain(rna"UAYWW", rna"UAW")
+2
+```
+"""
+n_certain(seq::BioSequence) = count(iscertain, seq)
 n_certain(seqa::BioSequence, seqb::BioSequence) = count(iscertain_and, seqa, seqb)
 
+"""
+    n_gaps(a::BioSequence, [b::BioSequence]) -> Int
+
+Count the number of positions where `a` (or `b`, if present) have gaps.
+If `b` is given, and the length of `a` and `b` differ, look only at the indices
+of the shorter sequence.
+
+# Examples
+```jldoctest
+julia> n_gaps(dna"--TAC-WN-ACY")
+4
+
+julia> n_gaps(dna"TC-AC-", dna"-CACG")
+2
+```
+"""
 n_gaps(seq::BioSequence) = count(isgap, seq)
 n_gaps(seqa::BioSequence, seqb::BioSequence) = count(isgap_or, seqa, seqb)
 
+"""
+    mismatches(a::BioSequence, b::BioSequences) -> Int
+
+Count the number of positions in where `a` and `b` differ.
+If `b` is given, and the length of `a` and `b` differ, look only at the indices
+of the shorter sequence.
+
+# Examples
+```jldoctest
+julia> mismatches(dna"TAGCTA", dna"TACCTA")
+1
+
+julia> mismatches(dna"AACA", dna"AAG")
+1
+```
+"""
 mismatches(seqa::BioSequence, seqb::BioSequence) = count(!=, seqa, seqb)
+
+"""
+    mismatches(a::BioSequence, b::BioSequences) -> Int
+
+Count the number of positions in where `a` and `b` are equal.
+If `b` is given, and the length of `a` and `b` differ, look only at the indices
+of the shorter sequence.
+
+# Examples
+```jldoctest
+julia> matches(dna"TAGCTA", dna"TACCTA")
+5
+
+julia> matches(dna"AACA", dna"AAG")
+2
+```
+"""
 matches(seqa::BioSequence, seqb::BioSequence) = count(==, seqa, seqb)
diff --git a/src/bit-manipulation/bitpar-compiler.jl b/src/bit-manipulation/bitpar-compiler.jl
@@ -131,9 +131,9 @@ function compile_2seq_bitpar(funcname::Symbol;
         @assert length(seqa) ≤ length(seqb)
 
         nexta = bitindex(seqa, 1)
-        stopa = bitindex(seqa, lastindex(seqa) + 1)
+        stopa = bitindex(seqa, (lastindex(seqa) + 1) % UInt)
         nextb = bitindex(seqb, 1)
-        stopb = bitindex(seqb, lastindex(seqb) + 1)
+        stopb = bitindex(seqb, (lastindex(seqb) + 1) % UInt)
         adata = seqa.data
         bdata = seqb.data
 
@@ -148,8 +148,8 @@ function compile_2seq_bitpar(funcname::Symbol;
         if nexta < stopa && offset(nexta) != 0
             # Here we shift the first data chunks to the right so as the first
             # nucleotide of the seq/subseq is the first nibble / pair of bits.
-            x = adata[index(nexta)] >> offset(nexta)
-            y = bdata[index(nextb)] >> offset(nextb)
+            x = @inbounds adata[index(nexta)] >> offset(nexta)
+            y = @inbounds bdata[index(nextb)] >> offset(nextb)
             # Here it was assumed that there is something to go and get from
             # the next chunk of `b`, yet that may not be true.
             # We know that if this is not true of `b`, then it is certainly not
@@ -160,7 +160,7 @@ function compile_2seq_bitpar(funcname::Symbol;
             # This edge case was found and accounted for by Ben J. Ward @BenJWard.
             # Ask this maintainer for more information.
             if offset(nextb) > offset(nexta) && 64 - offset(nextb) < stopb - nextb
-                y |= bdata[index(nextb) + 1] << (64 - offset(nextb))
+                y |= @inbounds bdata[index(nextb) + 1] << (64 - offset(nextb))
             end
             # Here we need to check something, we need to check if the
             # integer of `a` we are currently aligning contains the end of
@@ -192,8 +192,8 @@ function compile_2seq_bitpar(funcname::Symbol;
 
         if offset(nextb) == 0  # data are aligned with each other
             while stopa - nexta ≥ 64 # Iterate through body of data
-                x = adata[index(nexta)]
-                y = bdata[index(nextb)]
+                x = @inbounds adata[index(nexta)]
+                y = @inbounds bdata[index(nextb)]
 
                 $(body_code)
 
@@ -202,8 +202,8 @@ function compile_2seq_bitpar(funcname::Symbol;
             end
 
             if nexta < stopa
-                x = adata[index(nexta)]
-                y = bdata[index(nextb)]
+                x = @inbounds adata[index(nexta)]
+                y = @inbounds bdata[index(nextb)]
 
                 offs = stopa - nexta
                 m = bitmask(offs)
@@ -218,8 +218,8 @@ function compile_2seq_bitpar(funcname::Symbol;
             # Note that here, updating `nextb` by 64, increases the chunk index,
             # but the `offset(nextb)` will remain the same.
             while stopa - nexta ≥ 64 # processing body of data
-                x = adata[index(nexta)]
-                z = bdata[index(nextb)]
+                x = @inbounds adata[index(nexta)]
+                z = @inbounds bdata[index(nextb)]
                 y = y >> offset(nextb) | z << (64 - offset(nextb))
 
                 $(body_code)
@@ -230,7 +230,7 @@ function compile_2seq_bitpar(funcname::Symbol;
             end
 
             if nexta < stopa # processing tail of data
-                x = adata[index(nexta)]
+                x = @inbounds adata[index(nexta)]
                 y = y >> offset(nextb)
                 if 64 - offset(nextb) < stopa - nexta
                     y |= bdata[index(nextb)] << (64 - offset(nextb))