Breaking changes for v1 (#35)

In this mega-commit, Kmers.jl is thoroughly overhauled, with a new API, new docs, and new types.
BioJulia · Dec 23, 2024 · 8fd37eb · 8fd37eb · jakobnissen · Dec 23, 2024
1 parent 0bdf3a8
commit 8fd37eb
Show file tree

Hide file tree

Showing 64 changed files with 4,097 additions and 2,890 deletions.
diff --git a/.JuliaFormatter.toml b/.JuliaFormatter.toml
@@ -0,0 +1,8 @@
+always_for_in = true
+whitespace_typedefs = true
+whitespace_ops_in_indices = true
+remove_extra_newlines = true
+import_to_using = true
+normalize_line_endings = "unix"
+separate_kwargs_with_semicolon = true
+whitespace_in_kwargs = false
diff --git a/.github/workflows/Documentation.yml b/.github/workflows/Documentation.yml
@@ -10,17 +10,13 @@ on:
   pull_request:
 
 jobs:
-  build:
+  Documenter:
+    name: Documentation
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v2
-      - uses: julia-actions/setup-julia@v1
-        with:
-          version: '1'
-      - name: Install dependencies
-        run: julia --color=yes --project=docs/ -e 'using Pkg; Pkg.develop(PackageSpec(path=pwd())); Pkg.instantiate()'
-      - name: Build and deploy
+      - uses: julia-actions/julia-buildpkg@latest
+      - uses: julia-actions/julia-docdeploy@latest
         env:
-          # GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # For authentication with GitHub Actions token
-          DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }} # For authentication with SSH deploy key
-        run: julia --color=yes --project=docs/ docs/make.jl
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }}
diff --git a/.github/workflows/UnitTests.yml b/.github/workflows/UnitTests.yml
@@ -11,31 +11,30 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        julia-version:
-          - '1.6' # LTS
-          - '1'
-        julia-arch: [x86]
-        os: [ubuntu-latest, windows-latest, macOS-latest]
+        julia-version: ['1', '1.10']
+        os: [ubuntu-latest, macOS-latest, windows-latest]
         experimental: [false]
         include:
+          # Include nightly, but experimental, so it's allowed to fail without
+          # failing CI.
           - julia-version: nightly
-            julia-arch: x86
             os: ubuntu-latest
             experimental: true
+            fail_ci_if_error: false
 
     steps:
       - name: Checkout Repository
-        uses: actions/checkout@v2
+        uses: actions/checkout@v3
       - name: Setup Julia
-        uses: julia-actions/setup-julia@v1
+        uses: julia-actions/setup-julia@latest
         with:
           version: ${{ matrix.julia-version }}
       - name: Run Tests
         uses: julia-actions/julia-runtest@latest
       - name: Create CodeCov
-        uses: julia-actions/julia-processcoverage@v1
+        uses: julia-actions/julia-processcoverage@latest
       - name: Upload CodeCov
-        uses: codecov/codecov-action@v1
+        uses: codecov/codecov-action@v4
         with:
           file: ./lcov.info
           flags: unittests

diff --git a/.gitignore b/.gitignore
@@ -2,4 +2,6 @@
 *.jl.*.cov
 *.jl.mem
 .DS_Store
-Manifest.toml
+Manifest.toml
+TODO.md
+docs/build
diff --git a/Project.toml b/Project.toml
@@ -1,17 +1,33 @@
 name = "Kmers"
 uuid = "445028e4-d31f-4f27-89ad-17affd83fc22"
-authors = ["Sabrina Jaye Ward <[email protected]>"]
-version = "0.1.0"
+authors = [
+    "Jakob Nybo Nissen <[email protected]>",
+    "Sabrina Jaye Ward <[email protected]>"
+]
+version = "1.0.0"
+
+[weakdeps]
+StringViews = "354b36f9-a18e-4713-926e-db85100087ba"
 
 [deps]
 BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"
+BioSymbols = "3c28c6f8-a34d-59c4-9654-267d177fcfa9"
+
+[extensions]
+StringViewsExt = "StringViews"
 
+# Note: We intentionally have strict compat on BioSequences because Kmers
+# reaches into the internals of BioSequences.
 [compat]
-BioSequences = "3.1.3"
-julia = "1.5"
+BioSequences = "~3.4.1"
+Random = "1.10"
+julia = "1.10"
+StringViews = "1"
 
 [extras]
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+StringViews = "354b36f9-a18e-4713-926e-db85100087ba"
 
 [targets]
-test = ["Test"]
+test = ["Test", "Random", "StringViews"]
diff --git a/README.md b/README.md
@@ -3,57 +3,57 @@
 [![Latest Release](https://img.shields.io/github/release/BioJulia/Kmers.jl.svg)](https://github.com/BioJulia/Kmers.jl/releases/latest)
 [![MIT license](https://img.shields.io/badge/license-MIT-green.svg)](https://github.com/BioJulia/Kmers.jl/blob/master/LICENSE)
 [![Documentation](https://img.shields.io/badge/docs-stable-blue.svg)](https://biojulia.github.io/Kmers.jl/stable)
-[![Pkg Status](http://www.repostatus.org/badges/latest/active.svg)](http://www.repostatus.org/#active)
-
 
 ## Description
+Kmers.jl provides the `Kmer <: BioSequence` type, which implements the concept of a
+[k-mer](https://en.wikipedia.org/wiki/K-mer), a biological sequence of exactly length `k`.
 
-Kmers provides a specialised concrete `BioSequence` subtype, optimised for
-representing short immutable sequences called kmers: contiguous sub-strings of k
-nucleotides of some reference sequence.
 
-They are used extensively in bioinformatic analyses as an informational unit.
-This concept was popularised by short read assemblers. 
-Analyses within the kmer space benefit from a simple formulation of the sampling
-problem and direct in-hash comparisons.
+K-mers are used frequently in bioinformatics because, when k is small and known at
+compile time, these sequences can be efficiently represented as integers and stored
+directly in CPU registers, allowing for much more efficient computation than arbitrary-length sequences.
 
-Kmers provides the type representing kmers as well as the implementations of
-the APIs specified by the
-[`BioSequences.jl`](https://github.com/BioJulia/BioSequences.jl) package.
+In Kmers.jl, the `Kmer` type is parameterized by its length, and its data is stored in an `NTuple`. This makes `Kmer`s bitstypes and highly efficient.
 
-## Installation
+Conceptually, one may use the following analogy:
+* `BioSequence` is like `AbstractVector`
+* `LongSequence` is like `Vector`
+* `Kmer` is like [`SVector`](https://github.com/JuliaArrays/StaticArrays.jl) from `StaticArrays`
+
+Kmers.jl is tightly coupled to the
+[`BioSequences.jl`](https://github.com/BioJulia/BioSequences.jl) package,
+and relies on its internals.
+Hence, you should expect strict compat bounds on BioSequences.jl.
+
+## Usage
+### ⚠️ WARNING ⚠️
+`Kmer`s are parameterized by their length. That means any operation on `Kmer`s that changes their length, such as `push`, `pop`, slicing, or masking (logical indexing) will be **type unstable** and hence slow and memory inefficient, unless you write your code in such a way that the compiler can use constant folding.
 
+Further, as `Kmer`s are immutable and their operations are aggressively inlined and unrolled,
+they become inefficient as they get longer.
+For example, reverse-complementing a 32-mer takes 26 ns, compared to 102 ns for the equivalent `LongSequence`. However, for 512-mers, the `LongSequence` takes 126 ns, and the `Kmer` 16 μs!
+
+Kmers.jl is intended for high-performance computing. If you do not need the extra performance that register-stored sequences provide, you might consider using `LongSequence` from BioSequences.jl instead.
+
+## Installation
 You can install BioSequences from the julia
 REPL. Press `]` to enter pkg mode, and enter the following:
 
 ```julia
-add Kmers
+pkg> add Kmers
 ```
 
-If you are interested in the cutting edge of the development, please check out
+If you are interested in the cutting edge of development, please check out
 the master branch to try new features before release.
 
-
-## Testing
-
-Kmers is tested against Julia `1.X` on Linux, OS X, and Windows.
-
-[![Unit tests](https://github.com/BioJulia/Kmers.jl/workflows/Unit%20tests/badge.svg?branch=master)](https://github.com/BioJulia/Kmers.jl/actions?query=workflow%3A%22Unit+tests%22+branch%3Amaster)
-[![Documentation](https://github.com/BioJulia/Kmers.jl/workflows/Documentation/badge.svg?branch=master)](https://github.com/BioJulia/BioKmers.jl/actions?query=workflow%3ADocumentation+branch%3Amaster)
-[![](https://codecov.io/gh/BioJulia/Kmers.jl/branch/master/graph/badge.svg)](https://codecov.io/gh/BioJulia/Kmers.jl)
-
-
 ## Contributing
-
 We appreciate contributions from users including reporting bugs, fixing
 issues, improving performance and adding new features.
 
 Take a look at the [contributing files](https://github.com/BioJulia/Contributing)
 detailed contributor and maintainer guidelines, and code of conduct.
 
-
 ## Questions?
-
 If you have a question about contributing or using BioJulia software, come
-on over and chat to us on [Gitter](https://gitter.im/BioJulia/General), or you can try the
+on over and chat to us on [the Julia Slack workspace](https://julialang.org/slack/), or you can try the
 [Bio category of the Julia discourse site](https://discourse.julialang.org/c/domain/bio).
diff --git a/docs/Project.toml b/docs/Project.toml
@@ -1,5 +1,9 @@
 [deps]
+BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
+FASTX = "c2308a5c-f048-11e8-3e8a-31650f418d12"
+Kmers = "445028e4-d31f-4f27-89ad-17affd83fc22"
+MinHash = "4b3c9753-2685-44e9-8a29-365b96c023ed"
 
 [compat]
-Documenter = "0.24"
+Documenter = "1"
diff --git a/docs/make.jl b/docs/make.jl
@@ -1,29 +1,33 @@
 using Documenter, Kmers
 
-makedocs(
-    format = Documenter.HTML(),
-    sitename = "Kmers.jl",
-    pages = [
-        "Home"                           => "index.md",
-        "Kmer types"                     => "kmer_types.md",
-        "Constructing kmers"             => "construction.md",
-        "Indexing & modifying kmers"     => "transforms.md",
-        "Predicates"                     => "predicates.md",
-        "Random kmers"                   => "random.md",
-        "Iterating over Kmers"           => "iteration.md",
-        "Translation"                    => "translate.md",
-        #"Pattern matching and searching" => "sequence_search.md",
-        #"Iteration"                      => "iteration.md",
-        #"Counting"                       => "counting.md",
-        #"I/O"                            => "io.md",
-        #"Interfaces"                     => "interfaces.md"
+DocMeta.setdocmeta!(
+    Kmers,
+    :DocTestSetup,
+    :(using BioSequences, Kmers, Test);
+    recursive=true,
+)
+
+makedocs(;
+    modules=[Kmers],
+    format=Documenter.HTML(; prettyurls=get(ENV, "CI", nothing) == "true"),
+    sitename="Kmers.jl",
+    pages=[
+        "Home" => "index.md",
+        "The Kmer type" => "kmers.md",
+        "Iteration" => "iteration.md",
+        "Translation" => "translation.md",
+        "Hashing" => "hashing.md",
+        "K-mer replacements" => "replacements.md",
+        "FAQ" => "faq.md",
+        "Cookbook" => ["MinHash" => "minhash.md", "Kmer composition" => "composition.md"],
     ],
-    authors = "Ben J. Ward, The BioJulia Organisation and other contributors."
+    authors="Jakob Nybo Nissen, Sabrina J. Ward, The BioJulia Organisation and other contributors.",
+    checkdocs=:exports,
 )
 
-deploydocs(
-    repo = "github.com/BioJulia/Kmers.jl.git",
-    push_preview = true,
-    deps = nothing,
-    make = nothing
+deploydocs(;
+    repo="github.com/BioJulia/Kmers.jl.git",
+    push_preview=true,
+    deps=nothing,
+    make=nothing,
 )
diff --git a/docs/src/composition.md b/docs/src/composition.md
@@ -0,0 +1,52 @@
+```@meta
+CurrentModule = Kmers
+DocTestSetup = quote
+    using BioSequences
+    using Test
+    using Kmers
+end
+```
+## Kmer composition
+In metagenomics, sequences are often summarized by counting the occurrence of
+all k-mers of a given length in a sequence.
+For example, for K=4, there are 4^4 = 256 possible DNA 4-mers.
+If these counts are ordered, the composition can be represented by a length 256
+vector.
+
+Vector similarity operations (e.g. cosine distance) can then be used as an
+approximate proxy for phylogenetic distance.
+
+In the example below, we exploit that:
+* A `DNAKmer{4}`'s data is a single-element tuple, which
+  stores the sequence in the 8 lower bits.
+* The `encoded_data` function will return this tuple.
+
+```jldoctest; output=false
+using BioSequences, FASTX, Kmers
+using BioSequences: encoded_data
+
+function composition(record::FASTARecord)
+    counts = zeros(UInt32, 256)
+    frequencies = zeros(Float32, 256)
+    for kmer in FwDNAMers{4}(sequence(record))
+        @inbounds counts[only(encoded_data(kmer)) + 1] += 1
+    end
+    factor = 1 / sum(counts; init=zero(eltype(counts)))
+    for i in eachindex(counts, frequencies)
+        frequencies[i] = counts[i] * factor
+    end
+    frequencies
+end
+
+# Make two FASTA records - could be from an assembly
+recs = [FASTARecord(string(i), randdnaseq(10000)) for i in "AB"]
+
+# Compute the squared Euclidean (2-norm) distance and verify it's in [0, 2].
+(comp_a, comp_b) = map(composition, recs)
+comp_distance = sum((comp_a .- comp_b).^2)
+println(0.0 ≤ comp_distance ≤ 2.0)
+
+# output
+true
+
+```
diff --git a/docs/src/construction.md b/docs/src/construction.md
diff --git a/docs/src/faq.md b/docs/src/faq.md
@@ -0,0 +1,40 @@
+```@meta
+CurrentModule = Kmers
+DocTestSetup = quote
+    using BioSequences
+    using Test
+    using Kmers
+end
+```
+## FAQ
+### Why can kmers not be compared to biosequences?
+It may be surprising that kmers cannot be compared to other biosequences:
+
+```jldoctest
+julia> dna"TAG" == mer"TAG"d
+ERROR: MethodError
+[...]
+```
+
+In fact, this is implemented by a manually thrown `MethodError`; the generic case `Base.:==(::BioSequence, ::BioSequence)` is defined.
+
+This is a consequence of the following constraints:
+* `isequal(x, y)` implies `hash(x) == hash(y)`
+* `isequal(x, y)` and `x == y` ought to be identical for well-defined elements (i.e. in the absence of `missing`s and `NaN`s etc.)
+* `hash(::Kmer)` must be absolutely maximally efficient
+
+If kmers were to be comparable to `BioSequence`, then the hashing of `BioSequence` should follow `Kmer`, which practically speaking would mean that all biosequences would need to be recoded to `Kmer`s before hashing.
+
+### Why isn't there an iterator of unambiguous, canonical kmers or spaced, canonical kmers?
+Any iterator of nucleotide kmers can be made into a canonical kmer iterator by simply calling `canonical` on its output kmers.
+
+The `CanonicalKmers` iterator is special cased, because with a step size of 1, it is generally faster to build the next kmer by storing both the reverse and forward kmer, then creating the next kmer by prepending/appending the next symbol.
+
+However, with a larger step size, it becomes more efficient to build the forward kmer, then reverse-complement the whole kmer.
+
+### Why isn't there an iterator of skipmers/minimizers/k-min-mers, etc?
+The concept of kmers has turned out to be remarkably flexible and useful in bioinformatics, and has spawned a neverending stream of variations.
+We simply can't implement them all.
+
+However, see the section [Building kmer replacements](@ref replacements) on how to implement them
+as a user of Kmers.jl yourself.