From a3bb32a97c23e44174e7e39070d94cc0707a8e91 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Thu, 31 Oct 2024 20:30:36 +0200 Subject: [PATCH 01/44] Rename sablast to kbo. --- .github/deploy/build_aarch64-apple-darwin.sh | 6 ++-- .github/deploy/build_x86_64-apple-darwin.sh | 6 ++-- .github/workflows/build_artifacts.yml | 20 ++++++------ .github/workflows/make_release.yml | 26 +++++++-------- Cargo.toml | 8 ++--- README.md | 4 +-- src/cli/cli.rs | 2 +- src/cli/main.rs | 32 +++++++++--------- src/derandomize.rs | 20 ++++++------ src/format.rs | 4 +-- src/index.rs | 14 ++++---- src/lib.rs | 34 ++++++++++---------- src/translate.rs | 32 +++++++++--------- tests/map_clbs.rs | 6 ++-- 14 files changed, 107 insertions(+), 107 deletions(-) diff --git a/.github/deploy/build_aarch64-apple-darwin.sh b/.github/deploy/build_aarch64-apple-darwin.sh index a8c99c5..671b016 100644 --- a/.github/deploy/build_aarch64-apple-darwin.sh +++ b/.github/deploy/build_aarch64-apple-darwin.sh @@ -1,5 +1,5 @@ #!/bin/bash -## Build script for cross-compiling sablast for aarch64-apple-darwin. +## Build script for cross-compiling kbo for aarch64-apple-darwin. ## Run this inside https://github.com/shepherdjerred/macos-cross-compiler set -exo pipefail @@ -35,10 +35,10 @@ export CXX="aarch64-apple-darwin22-g++" RUSTFLAGS='-L /osxcross/SDK/MacOSX13.0.sdk/usr/lib' cargo build --all-features --release --target aarch64-apple-darwin ## gather the stuff to distribute -target=sablast-candidate-aarch64-apple-darwin +target=kbo-candidate-aarch64-apple-darwin path=/io/tmp/$target mkdir $path -cp target/aarch64-apple-darwin/release/sablast $path/ +cp target/aarch64-apple-darwin/release/kbo $path/ cp README.md $path/ cp COPYRIGHT $path/ cp LICENSE-APACHE $path/ diff --git a/.github/deploy/build_x86_64-apple-darwin.sh b/.github/deploy/build_x86_64-apple-darwin.sh index d90e662..f4056e4 100644 --- a/.github/deploy/build_x86_64-apple-darwin.sh +++ b/.github/deploy/build_x86_64-apple-darwin.sh @@ -1,5 +1,5 @@ #!/bin/bash -## Build script for cross-compiling sablast for aarch64-apple-darwin. +## Build script for cross-compiling kbo for aarch64-apple-darwin. ## Run this inside https://github.com/shepherdjerred/macos-cross-compiler set -exo pipefail @@ -35,10 +35,10 @@ export CXX="x86_64-apple-darwin22-g++" RUSTFLAGS='-L /osxcross/SDK/MacOSX13.0.sdk/usr/lib' cargo build --all-features --release --target x86_64-apple-darwin ## gather the stuff to distribute -target=sablast-candidate-x86_64-apple-darwin +target=kbo-candidate-x86_64-apple-darwin path=/io/tmp/$target mkdir $path -cp target/x86_64-apple-darwin/release/sablast $path/ +cp target/x86_64-apple-darwin/release/kbo $path/ cp README.md $path/ cp COPYRIGHT $path/ cp LICENSE-APACHE $path/ diff --git a/.github/workflows/build_artifacts.yml b/.github/workflows/build_artifacts.yml index 9bdf339..39c1a88 100644 --- a/.github/workflows/build_artifacts.yml +++ b/.github/workflows/build_artifacts.yml @@ -15,7 +15,7 @@ jobs: uses: rust-build/rust-build.action@v1.4.5 with: ARCHIVE_TYPES: tar.gz - ARCHIVE_NAME: sablast-candidate-x86_64-unknown-linux-musl + ARCHIVE_NAME: kbo-candidate-x86_64-unknown-linux-musl RUSTTARGET: x86_64-unknown-linux-musl EXTRA_FEATURES: "cli" EXTRA_FILES: "COPYRIGHT LICENSE-APACHE LICENSE-MIT README.md" @@ -25,7 +25,7 @@ jobs: - name: Upload x86_64-unknown-linux-musl uses: actions/upload-artifact@v4 with: - name: sablast-candidate-x86_64-unknown-linux-musl + name: kbo-candidate-x86_64-unknown-linux-musl path: | ${{ steps.compile.outputs.BUILT_ARCHIVE }} ${{ steps.compile.outputs.BUILT_CHECKSUM }} @@ -54,15 +54,15 @@ jobs: if: success() uses: actions/upload-artifact@v4 with: - name: sablast-candidate-x86_64-apple-darwin - path: /io/sablast-candidate-x86_64-apple-darwin.tar.gz + name: kbo-candidate-x86_64-apple-darwin + path: /io/kbo-candidate-x86_64-apple-darwin.tar.gz - name: Upload x86_64-apple-darwin sha256sum if: success() uses: actions/upload-artifact@v4 with: - name: sablast-candidate-x86_64-apple-darwin-sha256sum - path: /io/sablast-candidate-x86_64-apple-darwin.tar.gz.sha256sum + name: kbo-candidate-x86_64-apple-darwin-sha256sum + path: /io/kbo-candidate-x86_64-apple-darwin.tar.gz.sha256sum build_macOS-aarch64: runs-on: ubuntu-latest @@ -88,12 +88,12 @@ jobs: if: success() uses: actions/upload-artifact@v4 with: - name: sablast-candidate-aarch64-apple-darwin - path: /io/sablast-candidate-aarch64-apple-darwin.tar.gz + name: kbo-candidate-aarch64-apple-darwin + path: /io/kbo-candidate-aarch64-apple-darwin.tar.gz - name: Upload aarch64-apple-darwin sha256sum if: success() uses: actions/upload-artifact@v4 with: - name: sablast-candidate-aarch64-apple-darwin-sha256sum - path: /io/sablast-candidate-aarch64-apple-darwin.tar.gz.sha256sum + name: kbo-candidate-aarch64-apple-darwin-sha256sum + path: /io/kbo-candidate-aarch64-apple-darwin.tar.gz.sha256sum diff --git a/.github/workflows/make_release.yml b/.github/workflows/make_release.yml index 49ceb58..c27efb1 100644 --- a/.github/workflows/make_release.yml +++ b/.github/workflows/make_release.yml @@ -12,7 +12,7 @@ jobs: create-release: name: Create and publish release runs-on: ubuntu-latest - environment: Make sablast release + environment: Make kbo release steps: - name: Download artifacts @@ -24,22 +24,22 @@ jobs: - name: Organise files shell: bash run: | - mv sablast-candidate-x86_64-unknown-linux-musl/sablast-candidate-x86_64-unknown-linux-musl.tar.gz ./ - mv sablast-candidate-x86_64-apple-darwin/sablast-candidate-x86_64-apple-darwin.tar.gz ./ - mv sablast-candidate-aarch64-apple-darwin/sablast-candidate-aarch64-apple-darwin.tar.gz ./ + mv kbo-candidate-x86_64-unknown-linux-musl/kbo-candidate-x86_64-unknown-linux-musl.tar.gz ./ + mv kbo-candidate-x86_64-apple-darwin/kbo-candidate-x86_64-apple-darwin.tar.gz ./ + mv kbo-candidate-aarch64-apple-darwin/kbo-candidate-aarch64-apple-darwin.tar.gz ./ - name: Rename mac artifacts shell: bash run: | - tar -zxvf sablast-candidate-x86_64-apple-darwin.tar.gz && mv sablast-candidate-x86_64-apple-darwin sablast-${{ github.ref_name }}-x86_64-apple-darwin && tar -zcvf sablast-candidate-x86_64-apple-darwin.tar.gz sablast-${{ github.ref_name }}-x86_64-apple-darwin - tar -zxvf sablast-candidate-aarch64-apple-darwin.tar.gz && mv sablast-candidate-aarch64-apple-darwin sablast-${{ github.ref_name }}-aarch64-apple-darwin && tar -zcvf sablast-candidate-aarch64-apple-darwin.tar.gz sablast-${{ github.ref_name }}-aarch64-apple-darwin + tar -zxvf kbo-candidate-x86_64-apple-darwin.tar.gz && mv kbo-candidate-x86_64-apple-darwin kbo-${{ github.ref_name }}-x86_64-apple-darwin && tar -zcvf kbo-candidate-x86_64-apple-darwin.tar.gz kbo-${{ github.ref_name }}-x86_64-apple-darwin + tar -zxvf kbo-candidate-aarch64-apple-darwin.tar.gz && mv kbo-candidate-aarch64-apple-darwin kbo-${{ github.ref_name }}-aarch64-apple-darwin && tar -zcvf kbo-candidate-aarch64-apple-darwin.tar.gz kbo-${{ github.ref_name }}-aarch64-apple-darwin - name: Designate version shell: bash run: | - mv sablast-candidate-x86_64-unknown-linux-musl.tar.gz sablast-${{ github.ref_name }}-x86_64-linux-musl.tar.gz - mv sablast-candidate-x86_64-apple-darwin.tar.gz sablast-${{ github.ref_name }}-x86_64-apple-darwin.tar.gz - mv sablast-candidate-aarch64-apple-darwin.tar.gz sablast-${{ github.ref_name }}-aarch64-apple-darwin.tar.gz + mv kbo-candidate-x86_64-unknown-linux-musl.tar.gz kbo-${{ github.ref_name }}-x86_64-linux-musl.tar.gz + mv kbo-candidate-x86_64-apple-darwin.tar.gz kbo-${{ github.ref_name }}-x86_64-apple-darwin.tar.gz + mv kbo-candidate-aarch64-apple-darwin.tar.gz kbo-${{ github.ref_name }}-aarch64-apple-darwin.tar.gz - name: Get current date id: date @@ -49,12 +49,12 @@ jobs: uses: softprops/action-gh-release@v2 with: token: ${{ secrets.MAKE_RELEASE_TOKEN }} - name: sablast-${{ github.ref_name }} (${{ steps.date.outputs.date }}) + name: kbo-${{ github.ref_name }} (${{ steps.date.outputs.date }}) draft: false prerelease: false fail_on_unmatched_files: true generate_release_notes: true files: | - sablast-${{ github.ref_name }}-x86_64-linux-musl.tar.gz - sablast-${{ github.ref_name }}-x86_64-apple-darwin.tar.gz - sablast-${{ github.ref_name }}-aarch64-apple-darwin.tar.gz \ No newline at end of file + kbo-${{ github.ref_name }}-x86_64-linux-musl.tar.gz + kbo-${{ github.ref_name }}-x86_64-apple-darwin.tar.gz + kbo-${{ github.ref_name }}-aarch64-apple-darwin.tar.gz \ No newline at end of file diff --git a/Cargo.toml b/Cargo.toml index d856535..d9c30a7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,13 +1,13 @@ [package] -name = "sablast" +name = "kbo" version = "0.3.0" edition = "2021" rust-version = "1.77.0" authors = ["Tommi Mäklin "] description = "Spectral Burrows-Wheeler transform accelerated local alignment search" readme = "README.md" -homepage = "https://github.com/tmaklin/sablast" -repository = "https://github.com/tmaklin/sablast" +homepage = "https://github.com/tmaklin/kbo" +repository = "https://github.com/tmaklin/kbo" license = "MIT OR Apache-2.0" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html @@ -16,7 +16,7 @@ license = "MIT OR Apache-2.0" cli = ["dep:needletail", "dep:clap", "dep:rayon", "dep:log", "dep:stderrlog"] [[bin]] -name = "sablast" +name = "kbo" path = "src/cli/main.rs" required-features = ["cli"] diff --git a/README.md b/README.md index bf31a98..b48b344 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ -# sablast +# kbo Spectral Burrows-Wheeler transform accelerated local alignment search. ## License -sablast is dual-licensed under the [MIT](LICENSE-MIT) and [Apache 2.0](LICENSE-APACHE) licenses. +kbo is dual-licensed under the [MIT](LICENSE-MIT) and [Apache 2.0](LICENSE-APACHE) licenses. diff --git a/src/cli/cli.rs b/src/cli/cli.rs index 7a7d5be..ea01cee 100644 --- a/src/cli/cli.rs +++ b/src/cli/cli.rs @@ -1,4 +1,4 @@ -// sablast: Spectral Burrows-Wheeler transform accelerated local alignment search +// kbo: Spectral Burrows-Wheeler transform accelerated local alignment search // // Copyright 2024 Tommi Mäklin [tommi@maklin.fi]. diff --git a/src/cli/main.rs b/src/cli/main.rs index fdf4b3d..63de39b 100644 --- a/src/cli/main.rs +++ b/src/cli/main.rs @@ -1,4 +1,4 @@ -// sablast: Spectral Burrows-Wheeler transform accelerated local alignment search +// kbo: Spectral Burrows-Wheeler transform accelerated local alignment search // // Copyright 2024 Tommi Mäklin [tommi@maklin.fi]. @@ -51,7 +51,7 @@ fn init_log(log_max_level: usize) { .unwrap(); } -/// Use `sablast` to list the available commands or `sablast ` to run. +/// Use `kbo` to list the available commands or `kbo ` to run. /// /// # Input format detection /// The sequence data is read using @@ -65,7 +65,7 @@ fn init_log(log_max_level: usize) { /// [Bzip2](https://sourceware.org/bzip2/) and /// [liblzma](https://tukaani.org/xz/) compression (.bz2 and .xz /// files) can be enabled using the needletail features field in -/// sablast Cargo.toml if compiling from source. +/// kbo Cargo.toml if compiling from source. /// #[allow(clippy::needless_update)] fn main() { @@ -86,7 +86,7 @@ fn main() { }) => { init_log(if *verbose { 2 } else { 1 }); - let mut sbwt_build_options = sablast::index::BuildOpts::default(); + let mut sbwt_build_options = kbo::index::BuildOpts::default(); sbwt_build_options.k = *kmer_size; sbwt_build_options.num_threads = *num_threads; sbwt_build_options.prefix_precalc = *prefix_precalc; @@ -100,11 +100,11 @@ fn main() { seq_data.append(&mut read_fastx_file(file)); }); - let (sbwt, lcs) = sablast::build(&seq_data, sbwt_build_options); + let (sbwt, lcs) = kbo::build(&seq_data, sbwt_build_options); info!("Serializing SBWT index to {}.sbwt ...", output_prefix.as_ref().unwrap()); info!("Serializing LCS array to {}.lcs ...", output_prefix.as_ref().unwrap()); - sablast::index::serialize_sbwt(output_prefix.as_ref().unwrap(), &sbwt, &lcs); + kbo::index::serialize_sbwt(output_prefix.as_ref().unwrap(), &sbwt, &lcs); }, Some(cli::Commands::Find { @@ -121,7 +121,7 @@ fn main() { verbose, }) => { init_log(if *verbose { 2 } else { 1 }); - let mut sbwt_build_options = sablast::index::BuildOpts::default(); + let mut sbwt_build_options = kbo::index::BuildOpts::default(); sbwt_build_options.k = *kmer_size; sbwt_build_options.num_threads = *num_threads; sbwt_build_options.prefix_precalc = *prefix_precalc; @@ -129,16 +129,16 @@ fn main() { sbwt_build_options.mem_gb = *mem_gb; sbwt_build_options.temp_dir = temp_dir.clone(); - let mut derand_opts = sablast::derandomize::DerandomizeOpts::default(); + let mut derand_opts = kbo::derandomize::DerandomizeOpts::default(); derand_opts.max_error_prob = *max_error_prob; let ((sbwt, lcs), ref_name) = if index_prefix.is_some() && !ref_file.is_some() { info!("Loading SBWT index..."); - (sablast::index::load_sbwt(index_prefix.as_ref().unwrap()), index_prefix.as_ref().unwrap()) + (kbo::index::load_sbwt(index_prefix.as_ref().unwrap()), index_prefix.as_ref().unwrap()) } else if !index_prefix.is_some() && ref_file.is_some() { info!("Building SBWT from file {}...", ref_file.as_ref().unwrap()); let ref_data = read_fastx_file(ref_file.as_ref() .unwrap()); - (sablast::index::build_sbwt_from_vecs(&ref_data, &Some(sbwt_build_options)), ref_file.as_ref().unwrap()) + (kbo::index::build_sbwt_from_vecs(&ref_data, &Some(sbwt_build_options)), ref_file.as_ref().unwrap()) } else { panic!("Ambiguous reference, supply only one of `-r/--reference` and `-i/--index`"); }; @@ -161,10 +161,10 @@ fn main() { let seq = seqrec.normalize(true); // Get local alignments for forward strand - let mut run_lengths: Vec<(usize, usize, char, usize, usize)> = sablast::find(&seq, &sbwt, &lcs, derand_opts.clone()).iter().map(|x| (x.0, x.1, '+', x.2 + x.3, x.3)).collect(); + let mut run_lengths: Vec<(usize, usize, char, usize, usize)> = kbo::find(&seq, &sbwt, &lcs, derand_opts.clone()).iter().map(|x| (x.0, x.1, '+', x.2 + x.3, x.3)).collect(); // Add local alignments for reverse _complement - run_lengths.append(&mut sablast::find(&seq.reverse_complement(), &sbwt, &lcs, derand_opts.clone()).iter().map(|x| (x.0, x.1, '-', x.2 + x.3, x.3)).collect()); + run_lengths.append(&mut kbo::find(&seq.reverse_complement(), &sbwt, &lcs, derand_opts.clone()).iter().map(|x| (x.0, x.1, '-', x.2 + x.3, x.3)).collect()); // Sort by q.start run_lengths.sort_by_key(|x| x.0); @@ -191,7 +191,7 @@ fn main() { verbose, }) => { init_log(if *verbose { 2 } else { 1 }); - let mut sbwt_build_options = sablast::index::BuildOpts::default(); + let mut sbwt_build_options = kbo::index::BuildOpts::default(); // These are required for the subcommand to work correctly sbwt_build_options.add_revcomp = true; sbwt_build_options.build_select = true; @@ -203,7 +203,7 @@ fn main() { sbwt_build_options.mem_gb = *mem_gb; sbwt_build_options.temp_dir = temp_dir.clone(); - let mut derand_opts = sablast::derandomize::DerandomizeOpts::default(); + let mut derand_opts = kbo::derandomize::DerandomizeOpts::default(); derand_opts.max_error_prob = *max_error_prob; rayon::ThreadPoolBuilder::new() @@ -217,11 +217,11 @@ fn main() { let stdout = std::io::stdout(); query_files.par_iter().for_each(|query_file| { let query_data = read_fastx_file(query_file); - let (sbwt, lcs) = sablast::index::build_sbwt_from_vecs(&query_data, &Some(sbwt_build_options.clone())); + let (sbwt, lcs) = kbo::index::build_sbwt_from_vecs(&query_data, &Some(sbwt_build_options.clone())); let mut res: Vec = Vec::new(); ref_data.iter().for_each(|ref_contig| { - res.append(&mut sablast::map(ref_contig, &sbwt, &lcs, derand_opts.clone())); + res.append(&mut kbo::map(ref_contig, &sbwt, &lcs, derand_opts.clone())); }); let _ = writeln!(&mut stdout.lock(), diff --git a/src/derandomize.rs b/src/derandomize.rs index af68f6c..fd0040d 100644 --- a/src/derandomize.rs +++ b/src/derandomize.rs @@ -1,4 +1,4 @@ -// sablast: Spectral Burrows-Wheeler transform accelerated local alignment search +// kbo: Spectral Burrows-Wheeler transform accelerated local alignment search // // Copyright 2024 Tommi Mäklin [tommi@maklin.fi]. @@ -28,9 +28,9 @@ pub struct DerandomizeOpts { impl Default for DerandomizeOpts { /// Default to these values: /// ```rust - /// let mut opts = sablast::derandomize::DerandomizeOpts::default(); + /// let mut opts = kbo::derandomize::DerandomizeOpts::default(); /// opts.max_error_prob = 0.0000001; - /// # let expected = sablast::derandomize::DerandomizeOpts::default(); + /// # let expected = kbo::derandomize::DerandomizeOpts::default(); /// # assert_eq!(opts.max_error_prob, expected.max_error_prob); /// ``` /// @@ -95,7 +95,7 @@ impl Default for DerandomizeOpts { /// # Examples /// ```rust /// # use assert_approx_eq::assert_approx_eq; -/// use sablast::derandomize::log_rm_max_cdf; +/// use kbo::derandomize::log_rm_max_cdf; /// /// let alphabet_size = 4; /// let n_kmers = 20240921; @@ -139,7 +139,7 @@ pub fn log_rm_max_cdf( /// /// # Examples /// ```rust -/// use sablast::derandomize::random_match_threshold; +/// use kbo::derandomize::random_match_threshold; /// /// let k = 31; /// let n_kmers = 20240921; @@ -186,7 +186,7 @@ pub fn random_match_threshold( /// # Examples /// ## Noisy MS has only matches /// ```rust -/// use sablast::derandomize::derandomize_ms_val; +/// use kbo::derandomize::derandomize_ms_val; /// /// // Parameters : k = 3, threshold = 2 /// // @@ -201,7 +201,7 @@ pub fn random_match_threshold( /// /// ## Noisy MS has only noise /// ```rust -/// use sablast::derandomize::derandomize_ms_val; +/// use kbo::derandomize::derandomize_ms_val; /// /// // Parameters : k = 3, threshold = 2 /// // @@ -216,7 +216,7 @@ pub fn random_match_threshold( /// /// ## Noisy MS is at beginning of a full _k_-mer match /// ```rust -/// use sablast::derandomize::derandomize_ms_val; +/// use kbo::derandomize::derandomize_ms_val; /// /// // Parameters : k = 3, threshold = 2 /// // @@ -231,7 +231,7 @@ pub fn random_match_threshold( /// /// ## Noisy MS is at beginning of a partial _k_-mer match /// ```rust -/// use sablast::derandomize::derandomize_ms_val; +/// use kbo::derandomize::derandomize_ms_val; /// /// // Parameters : k = 4, threshold = 2 /// // @@ -281,7 +281,7 @@ pub fn derandomize_ms_val( /// /// # Examples /// ```rust -/// use sablast::derandomize::derandomize_ms_vec; +/// use kbo::derandomize::derandomize_ms_vec; /// /// let k = 3; /// let threshold = 2; diff --git a/src/format.rs b/src/format.rs index e22bd2c..ae1d73c 100644 --- a/src/format.rs +++ b/src/format.rs @@ -1,4 +1,4 @@ -// sablast: Spectral Burrows-Wheeler transform accelerated local alignment search +// kbo: Spectral Burrows-Wheeler transform accelerated local alignment search // // Copyright 2024 Tommi Mäklin [tommi@maklin.fi]. @@ -32,7 +32,7 @@ /// # Examples /// ## Extract run lengths from a character representation /// ```rust -/// use sablast::format::run_lengths; +/// use kbo::format::run_lengths; /// /// // Parameters : k = 3, threshold = 2 /// // diff --git a/src/index.rs b/src/index.rs index 02f919b..96decd5 100644 --- a/src/index.rs +++ b/src/index.rs @@ -1,4 +1,4 @@ -// sablast: Spectral Burrows-Wheeler transform accelerated local alignment search +// kbo: Spectral Burrows-Wheeler transform accelerated local alignment search // // Copyright 2024 Tommi Mäklin [tommi@maklin.fi]. @@ -47,7 +47,7 @@ pub struct BuildOpts { impl Default for BuildOpts { /// Default to these values: /// ```rust - /// let mut opts = sablast::index::BuildOpts::default(); + /// let mut opts = kbo::index::BuildOpts::default(); /// opts.k = 31; /// opts.add_revcomp = false; /// opts.num_threads = 1; @@ -56,7 +56,7 @@ impl Default for BuildOpts { /// opts.mem_gb = 4; /// opts.dedup_batches = false; /// opts.temp_dir = None; - /// # let expected = sablast::index::BuildOpts::default(); + /// # let expected = kbo::index::BuildOpts::default(); /// # assert_eq!(opts.k, expected.k); /// # assert_eq!(opts.add_revcomp, expected.add_revcomp); /// # assert_eq!(opts.num_threads, expected.num_threads); @@ -98,7 +98,7 @@ impl Default for BuildOpts { /// /// # Examples /// ```rust -/// use sablast::index::*; +/// use kbo::index::*; /// /// // Inputs /// let reference: Vec> = vec![vec![b'A',b'A',b'A',b'G',b'A',b'A',b'C',b'C',b'A',b'-',b'T',b'C',b'A',b'G',b'G',b'G',b'C',b'G']]; @@ -147,7 +147,7 @@ pub fn build_sbwt_from_vecs( /// /// # Examples /// ```rust -/// use sablast::index::*; +/// use kbo::index::*; /// /// // Inputs /// let reference: Vec> = vec![vec![b'A',b'A',b'A',b'G',b'A',b'A',b'C',b'C',b'A',b'-',b'T',b'C',b'A',b'G',b'G',b'G',b'C',b'G']]; @@ -200,7 +200,7 @@ pub fn serialize_sbwt( /// /// # Examples /// ```rust -/// use sablast::index::*; +/// use kbo::index::*; /// /// // Inputs /// let reference: Vec> = vec![vec![b'A',b'A',b'A',b'G',b'A',b'A',b'C',b'C',b'A',b'-',b'T',b'C',b'A',b'G',b'G',b'G',b'C',b'G']]; @@ -258,7 +258,7 @@ pub fn load_sbwt( /// /// # Examples /// ```rust -/// use sablast::index::*; +/// use kbo::index::*; /// /// // Inputs /// let reference: Vec> = vec![vec![b'A',b'A',b'A',b'G',b'A',b'A',b'C',b'C',b'A',b'-',b'T',b'C',b'A',b'G',b'G',b'G',b'C',b'G']]; diff --git a/src/lib.rs b/src/lib.rs index 798cd94..0fa9b75 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,4 +1,4 @@ -// sablast: Spectral Burrows-Wheeler transform accelerated local alignment search +// kbo: Spectral Burrows-Wheeler transform accelerated local alignment search // // Copyright 2024 Tommi Mäklin [tommi@maklin.fi]. @@ -33,11 +33,11 @@ pub mod translate; /// with the parameters and resources specified in `build_opts` (see /// [index::BuildOpts] for details). /// -/// Prebuilt indexes can currently only be used with sablast find. +/// Prebuilt indexes can currently only be used with kbo find. /// /// All files and sequence data in `seq_files` are merged into the /// same index. It is not possible extract the individual sequences -/// from the index after it has been built; run `sablast map -r +/// from the index after it has been built; run `kbo map -r /// ` if you need to know which reference /// sequences the alignments are for. /// @@ -51,8 +51,8 @@ pub mod translate; /// /// # Examples /// ```rust -/// use sablast::build; -/// use sablast::index::BuildOpts; +/// use kbo::build; +/// use kbo::index::BuildOpts; /// /// let inputs: Vec> = vec![vec![b'A',b'A',b'A',b'G',b'A',b'A',b'C',b'C',b'A',b'-',b'T',b'C',b'A',b'G',b'G',b'G',b'C',b'G']]; /// @@ -87,10 +87,10 @@ pub fn build( /// /// # Example /// ```rust -/// use sablast::build; -/// use sablast::matches; -/// use sablast::index::BuildOpts; -/// use sablast::derandomize::DerandomizeOpts; +/// use kbo::build; +/// use kbo::matches; +/// use kbo::index::BuildOpts; +/// use kbo::derandomize::DerandomizeOpts; /// /// let reference: Vec> = vec![vec![b'A',b'A',b'A',b'G',b'A',b'A',b'C',b'C',b'A',b'-',b'T',b'C',b'A',b'G',b'G',b'G',b'C',b'G']]; /// let mut opts = BuildOpts::default(); @@ -133,10 +133,10 @@ pub fn matches( /// /// # Examples /// ```rust -/// use sablast::build; -/// use sablast::map; -/// use sablast::index::BuildOpts; -/// use sablast::derandomize::DerandomizeOpts; +/// use kbo::build; +/// use kbo::map; +/// use kbo::index::BuildOpts; +/// use kbo::derandomize::DerandomizeOpts; /// /// let query: Vec> = vec![vec![b'A',b'A',b'A',b'G',b'A',b'A',b'C',b'C',b'A',b'-',b'T',b'C',b'A',b'G',b'G',b'G',b'C',b'G']]; /// let mut opts = BuildOpts::default(); @@ -188,10 +188,10 @@ pub fn map( /// /// # Examples /// ```rust -/// use sablast::build; -/// use sablast::find; -/// use sablast::index::BuildOpts; -/// use sablast::derandomize::DerandomizeOpts; +/// use kbo::build; +/// use kbo::find; +/// use kbo::index::BuildOpts; +/// use kbo::derandomize::DerandomizeOpts; /// /// let reference: Vec> = vec![vec![b'A',b'A',b'A',b'G',b'A',b'A',b'C',b'C',b'A',b'-',b'T',b'C',b'A',b'G',b'G',b'G',b'C',b'G']]; /// let mut opts = BuildOpts::default(); diff --git a/src/translate.rs b/src/translate.rs index e216ced..09c27e0 100644 --- a/src/translate.rs +++ b/src/translate.rs @@ -1,4 +1,4 @@ -// sablast: Spectral Burrows-Wheeler transform accelerated local alignment search +// kbo: Spectral Burrows-Wheeler transform accelerated local alignment search // // Copyright 2024 Tommi Mäklin [tommi@maklin.fi]. @@ -43,7 +43,7 @@ use sbwt::SubsetSeq; /// # Examples /// ## Query with only matches /// ```rust -/// use sablast::translate::translate_ms_val; +/// use kbo::translate::translate_ms_val; /// /// // Parameters : k = 3, threshold = 2 /// // @@ -62,7 +62,7 @@ use sbwt::SubsetSeq; /// /// ## Query with a single mismatch /// ```rust -/// use sablast::translate::translate_ms_val; +/// use kbo::translate::translate_ms_val; /// /// // Parameters : k = 3, threshold = 2 /// // @@ -81,7 +81,7 @@ use sbwt::SubsetSeq; /// /// ## Query with a single insertion: /// ```rust -/// use sablast::translate::translate_ms_val; +/// use kbo::translate::translate_ms_val; /// /// // Ref sequence : A,C,G,-,C,A,G /// // Query sequence : A,C,G,C,C,A,G @@ -103,7 +103,7 @@ use sbwt::SubsetSeq; /// /// ## Query with multiple insertions: /// ```rust -/// use sablast::translate::translate_ms_val; +/// use kbo::translate::translate_ms_val; /// /// // Ref sequence : A,C,G, -,-,C,A,G /// // Query sequence : A,C,G, T,T,C,C,A,G @@ -120,7 +120,7 @@ use sbwt::SubsetSeq; /// ``` /// ## Query with a deletion /// ```rust -/// use sablast::translate::translate_ms_val; +/// use kbo::translate::translate_ms_val; /// /// /// // Parameters : k = 3, threshold = 2 @@ -146,7 +146,7 @@ use sbwt::SubsetSeq; /// /// ## Query with recombination /// ```rust -/// use sablast::translate::translate_ms_val; +/// use kbo::translate::translate_ms_val; /// /// // Parameters : k = 3, threshold = 2 /// // @@ -226,7 +226,7 @@ pub fn translate_ms_val( /// # Examples /// ## Translate a generic MS vector /// ```rust -/// use sablast::translate::translate_ms_vec; +/// use kbo::translate::translate_ms_vec; /// /// // Parameters : k = 3, threshold = 2 /// // @@ -243,7 +243,7 @@ pub fn translate_ms_val( /// /// ## Translate a MS vector with recombination /// ```rust -/// use sablast::translate::translate_ms_vec; +/// use kbo::translate::translate_ms_vec; /// /// // Parameters : k = 3, threshold = 2 /// // @@ -315,13 +315,13 @@ pub fn translate_ms_vec( /// /// # Examples /// ```rust -/// use sablast::build; -/// use sablast::index::BuildOpts; -/// use sablast::index::query_sbwt; -/// use sablast::derandomize::derandomize_ms_vec; -/// use sablast::derandomize::random_match_threshold; -/// use sablast::translate::translate_ms_vec; -/// use sablast::translate::refine_translation; +/// use kbo::build; +/// use kbo::index::BuildOpts; +/// use kbo::index::query_sbwt; +/// use kbo::derandomize::derandomize_ms_vec; +/// use kbo::derandomize::random_match_threshold; +/// use kbo::translate::translate_ms_vec; +/// use kbo::translate::refine_translation; /// use sbwt::SbwtIndexVariant; /// /// // Parameters : k = 4, threshold = 3 diff --git a/tests/map_clbs.rs b/tests/map_clbs.rs index 7402580..0e702d2 100644 --- a/tests/map_clbs.rs +++ b/tests/map_clbs.rs @@ -1,4 +1,4 @@ -// sablast: Spectral Burrows-Wheeler transform accelerated local alignment search +// kbo: Spectral Burrows-Wheeler transform accelerated local alignment search // // Copyright 2024 Tommi Mäklin [tommi@maklin.fi]. @@ -27,7 +27,7 @@ // } // } -// let (sbwt, lcs) = sablast::index::build_sbwt_from_vecs(&seq_data, &None); +// let (sbwt, lcs) = kbo::index::build_sbwt_from_vecs(&seq_data, &None); // let expected = vec![(455, 967, 512, 1)]; @@ -36,7 +36,7 @@ // let seqrec = rec.expect("Valid fastX record"); // let seq = seqrec.normalize(true); -// let got = sablast::find(&seq, &sbwt, &lcs); +// let got = kbo::find(&seq, &sbwt, &lcs); // assert_eq!(got, expected); // } From 150ff6831513c59c4a360ec5313ad32dff2a62f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Fri, 1 Nov 2024 12:42:56 +0200 Subject: [PATCH 02/44] Tabs -> spaces in refine_translation. --- src/translate.rs | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/translate.rs b/src/translate.rs index 09c27e0..cd1cbea 100644 --- a/src/translate.rs +++ b/src/translate.rs @@ -370,11 +370,11 @@ pub fn refine_translation( assert!(!translation.is_empty()); assert!(translation.len() == noisy_ms.len()); let k = match query_sbwt { - SbwtIndexVariant::SubsetMatrix(ref sbwt) => { - assert!(sbwt.sbwt().has_select_support()); - assert!(sbwt.k() > 0); - sbwt.k() - }, + SbwtIndexVariant::SubsetMatrix(ref sbwt) => { + assert!(sbwt.sbwt().has_select_support()); + assert!(sbwt.k() > 0); + sbwt.k() + }, }; // Could prove midpoint is the best guess using conditional probabilities? @@ -382,14 +382,14 @@ pub fn refine_translation( let mut refined = translation.to_vec().clone(); match query_sbwt { - SbwtIndexVariant::SubsetMatrix(ref sbwt) => { - for i in 1..(refined.len() - threshold) { - if refined[i - 1] == 'X' { - let midpoint = if i + k - 2 < n_elements && noisy_ms[i + k - 2].0 == k - 1 { k/2 } else { (threshold + 1)/2 }; - refined[i - 1] = sbwt.access_kmer(noisy_ms[i + k - 2 - midpoint].1.start)[midpoint] as char; - } - } - }, + SbwtIndexVariant::SubsetMatrix(ref sbwt) => { + for i in 1..(refined.len() - threshold) { + if refined[i - 1] == 'X' { + let midpoint = if i + k - 2 < n_elements && noisy_ms[i + k - 2].0 == k - 1 { k/2 } else { (threshold + 1)/2 }; + refined[i - 1] = sbwt.access_kmer(noisy_ms[i + k - 2 - midpoint].1.start)[midpoint] as char; + } + } + }, }; refined } From 95eba779968c27ffb645ae5999040c217bb76a31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Fri, 1 Nov 2024 12:53:51 +0200 Subject: [PATCH 03/44] Remove unused import. --- src/translate.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/translate.rs b/src/translate.rs index cd1cbea..453a4a8 100644 --- a/src/translate.rs +++ b/src/translate.rs @@ -546,7 +546,6 @@ mod tests { use crate::index::BuildOpts; use crate::index::query_sbwt; use crate::derandomize::derandomize_ms_vec; - use crate::derandomize::random_match_threshold; use super::translate_ms_vec; use super::refine_translation; use sbwt::SbwtIndexVariant; From 8e89ea03ca77b608fd82703530a07694011c45d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Fri, 1 Nov 2024 12:53:56 +0200 Subject: [PATCH 04/44] Add doc tests for relative_to_ref (1/2 pass). --- src/format.rs | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/src/format.rs b/src/format.rs index ae1d73c..951387b 100644 --- a/src/format.rs +++ b/src/format.rs @@ -100,10 +100,43 @@ pub fn run_lengths( /// unresolved SNPs, insertions, or absent sections. /// /// # Examples -/// ## Format a refined translation +/// ## Format an unrefined translation +/// Unrefined translations do not resolve SNPs or insertions in query, instead +/// replacing them with gaps '-'. +/// ```rust +/// use kbo::translate::translate_ms_vec; +/// use kbo::format::relative_to_ref; /// -/// TODO Add test to relative_to_ref documentation. +/// // Parameters : k = 4, threshold = 3 +/// // +/// // Ref sequence : T,T,G,A, T,T,G,G,C,T,G,G,G,C,A,G,A,G,C,T,G +/// // Query sequence : T,T,G,A, G,G,C,T,G,G,G,G,A,G,A,G,C,T,G +/// // +/// // Result MS vector : 1,2,3,4, 1,2,3,3,3,4,4,4,4,3,1,2,3,4,4,4,4 +/// // Derandomized MS : 1,2,3,4,-1,0,1,2,3,4,4,4,4,0,1,2,3,4,4,4,4 +/// // Translation : M,M,M,M, -,-,M,M,M,M,M,M,M,X,M,M,M,M,M,M,M +/// // Formatted : T,T,G,A, -,-,G,G,C,T,G,G,G,-,A,G,A,G,C,T,G +/// +/// let reference: Vec = vec![b'T',b'T',b'G',b'A',b'T',b'T',b'G',b'G',b'C',b'T',b'G',b'G',b'G',b'C',b'A',b'G',b'A',b'G',b'C',b'T',b'G']; +/// let derand_ms: Vec = vec![1,2,3,4,-1,0,1,2,3,4,4,4,4,0,1,2,3,4,4,4,4]; +/// let translated = translate_ms_vec(&derand_ms, 4, 3); +/// +/// let relative = relative_to_ref(&reference, &translated); +/// // `relative` has [T,T,G,A,-,-,G,G,C,T,G,G,G,-,A,G,A,G,C,T,G] +/// # let expected = vec![b'T',b'T',b'G',b'A',b'-',b'-',b'G',b'G',b'C',b'T',b'G',b'G',b'G',b'-',b'A',b'G',b'A',b'G',b'C',b'T',b'G']; +/// # assert_eq!(relative, expected); +/// ``` /// +/// ## Format a refined translation +/// ```rust +/// use kbo::format::relative_to_ref; +/// let reference: Vec = vec![b'A',b'A',b'A',b'G',b'A',b'A',b'C',b'C',b'A',b'T',b'C',b'A',b'G',b'G',b'G',b'C',b'G']; +/// let refined: Vec = vec!['C','M','M','R','-','-','R','M','M','C','M','M','M','M','M','M','-','-']; +/// +/// let relative = relative_to_ref(&reference, &refined); +/// # let expected = vec![b'C',b'A',b'A',b'G',b'-',b'-',b'C',b'C',b'A',b'T',b'C',b'A',b'G',b'G',b'G',b'-',b'-']; +/// // relative.iter().zip(reference.iter()).zip(expected.iter()).for_each(|((x1,x2),x3)| eprintln!("{}\t{}\t{}", *x1 as char, *x2 as char, *x3 as char)); +/// # assert_eq!(relative, expected); /// ``` /// pub fn relative_to_ref( From c3166fc033b067007ace36308d9624a19d453ce6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Fri, 1 Nov 2024 13:28:50 +0200 Subject: [PATCH 05/44] Resolve TODO question (A: works with any). --- src/format.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/format.rs b/src/format.rs index 951387b..cac9984 100644 --- a/src/format.rs +++ b/src/format.rs @@ -82,8 +82,6 @@ pub fn run_lengths( /// character representation of an alignment `alignment` to determine the /// nucleotide sequence of the alignment relative to the reference. /// -/// Only works with refined translations as the input (TODO verify). -/// /// Returns a vector containing alignment of the query against `ref_seq` in a /// gapped alignment format. /// From 6bc701877f411b3e3fa7a476dd415a4165e4eb18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Fri, 1 Nov 2024 15:24:04 +0200 Subject: [PATCH 06/44] Add initial crate documentaiton. --- src/lib.rs | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 68 insertions(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index 0fa9b75..13b8ed1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -11,8 +11,75 @@ // the MIT license, or , // at your option. // -//! Crate documentation + +//! kbo is an approximate local aligner based on converting [_k_-bounded matching +//! statistics](https://www.biorxiv.org/content/10.1101/2024.02.19.580943v1) +//! into a character representation of the underlying alignment sequence. +//! +//! Currently, kbo supports two main operations: +//! +//! - `kbo find` [matches](matches()) the _k_-mers in a query sequence with the +//! reference and reports the local alignment segments found within the +//! reference. Find is useful for problems that can be solved with +//! [blast](https://blast.ncbi.nlm.nih.gov/Blast.cgi). +//! - `kbo map` [maps](map()) the query sequence against a reference +//! sequence, and reports the nucleotide sequence of the alignment relative to +//! the reference. Map solves the same problem as +//! [snippy](https://github.com/tseemann/snippy) and [ska +//! map](https://docs.rs/ska/latest/ska/#ska-map). +//! +//! kbo uses the [Spectral Burrows-Wheeler +//! Transform](https://docs.rs/sbwt/latest/sbwt/) data structure that allows +//! efficient _k_-mer matching between a target and a query sequence and +//! fast retrieval of the _k_-bounded matching statistic for each _k_-mer match. +//! +//! # Installing the kbo executable +//! Run `cargo build --features cli --release`. +//! +//! # Usage +//! +//! kbo can be run directly on fasta files without an initial indexing step. +//! Prebuilt indexes are supported via `kbo build` but are only +//! relevant in `kbo find` analyses where the reference _k_-mers can be +//! concatenated into a single contig. //! +//! ## kbo find +//! +//! To set up the example, download the fasta sequence of the [_Escherichia +//! coli_ Nissle +//! 1917](https://www.ncbi.nlm.nih.gov/datasets/genome/GCF_000714595.1/) genome +//! from the NCBI and the [pks +//! island](https://raw.githubusercontent.com/tmaklin/clbtype/refs/heads/main/db/db.fasta) +//! gene sequences from GitHub. Example output was generated with versions +//! ASM71459v1 and rev 43bbd36. +//! +//! ### Find gene sequence locations +//! In the directory containing the input files, run +//! ```text +//! kbo find --reference db.fasta GCF_000714595.1_ASM71459v1_genomic.fna +//! ``` +//! This will produce the output +//! +//! ```text +//! TODO add output +//! ``` +//! +//! ### Find presence/absence of gene sequences +//! Alternatively, if you are only interested in containment of the `db.fasta` genes in the assembly, run +//! ```text +//! kbo find --reference GCF_000714595.1_ASM71459v1_genomic.fna db.fasta +//! ``` +//! which will return +//! ```text +//! TODO add output +//! ```text +//! ## kbo map +//! TODO write +//! ``` +//! +//! ``` +//! + #![warn(missing_docs, missing_debug_implementations, missing_copy_implementations, trivial_casts, trivial_numeric_casts, From 2a9b1cd62bf766e5341ca80ab84218954c6425e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Fri, 1 Nov 2024 15:33:55 +0200 Subject: [PATCH 07/44] Restructure read_from_fastx_parser --- src/cli/main.rs | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/src/cli/main.rs b/src/cli/main.rs index 63de39b..011e57c 100644 --- a/src/cli/main.rs +++ b/src/cli/main.rs @@ -16,27 +16,37 @@ use std::io::Write; use clap::Parser; use log::info; use needletail::Sequence; +use needletail::parser::SequenceRecord; use rayon::iter::ParallelIterator; use rayon::iter::IntoParallelRefIterator; // Command-line interface mod cli; +// Given a needletail parser, reads the next contig sequence +fn read_from_fastx_parser( + reader: &mut dyn needletail::parser::FastxReader, +) -> Option { + let rec = reader.next(); + match rec { + Some(Ok(seqrec)) => { + Some(seqrec) + }, + None => None, + Some(Err(_)) => todo!(), + } +} + // Reads all sequence data from a fastX file fn read_fastx_file( file: &str, ) -> Vec> { let mut seq_data: Vec> = Vec::new(); let mut reader = needletail::parse_fastx_file(file).unwrap_or_else(|_| panic!("Expected valid fastX file at {}", file)); - loop { - let rec = reader.next(); - match rec { - Some(Ok(seqrec)) => { - seq_data.push(seqrec.normalize(true).as_ref().to_vec()); - }, - _ => break + while let Some(rec) = read_from_fastx_parser(&mut *reader) { + let seqrec = rec.normalize(true); + seq_data.push(seqrec.to_vec()); } - } seq_data } @@ -155,8 +165,7 @@ fn main() { query_files.par_iter().for_each(|file| { let mut reader = needletail::parse_fastx_file(file).expect("valid path/file"); - while let Some(rec) = reader.next() { - let seqrec = rec.expect("Valid fastX record"); + while let Some(seqrec) = read_from_fastx_parser(&mut *reader) { let contig = seqrec.id(); let seq = seqrec.normalize(true); From d35eebc3a39fd9dec67034520dd74e721dbc13e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Fri, 1 Nov 2024 15:41:15 +0200 Subject: [PATCH 08/44] Rewrite find (allow processign contigs separately) --- src/cli/main.rs | 56 +++++++++++++++++++++++++++---------------------- 1 file changed, 31 insertions(+), 25 deletions(-) diff --git a/src/cli/main.rs b/src/cli/main.rs index 011e57c..29546c4 100644 --- a/src/cli/main.rs +++ b/src/cli/main.rs @@ -142,13 +142,17 @@ fn main() { let mut derand_opts = kbo::derandomize::DerandomizeOpts::default(); derand_opts.max_error_prob = *max_error_prob; - let ((sbwt, lcs), ref_name) = if index_prefix.is_some() && !ref_file.is_some() { + let mut indexes: Vec<((sbwt::SbwtIndexVariant, sbwt::LcsArray), String)> = Vec::new(); + + if index_prefix.is_some() && !ref_file.is_some() { info!("Loading SBWT index..."); - (kbo::index::load_sbwt(index_prefix.as_ref().unwrap()), index_prefix.as_ref().unwrap()) + indexes.push((kbo::index::load_sbwt(index_prefix.as_ref().unwrap()), index_prefix.clone().unwrap())); } else if !index_prefix.is_some() && ref_file.is_some() { info!("Building SBWT from file {}...", ref_file.as_ref().unwrap()); + let ref_data = read_fastx_file(ref_file.as_ref() .unwrap()); - (kbo::index::build_sbwt_from_vecs(&ref_data, &Some(sbwt_build_options)), ref_file.as_ref().unwrap()) + indexes.push((kbo::index::build_sbwt_from_vecs(&ref_data, &Some(sbwt_build_options)), ref_file.clone().unwrap())); + } else { panic!("Ambiguous reference, supply only one of `-r/--reference` and `-i/--index`"); }; @@ -163,30 +167,32 @@ fn main() { println!("query\tref\tq.start\tq.end\tstrand\tlength\tmismatches\tin.contig"); let stdout = std::io::stdout(); query_files.par_iter().for_each(|file| { - - let mut reader = needletail::parse_fastx_file(file).expect("valid path/file"); - while let Some(seqrec) = read_from_fastx_parser(&mut *reader) { - let contig = seqrec.id(); - let seq = seqrec.normalize(true); - - // Get local alignments for forward strand - let mut run_lengths: Vec<(usize, usize, char, usize, usize)> = kbo::find(&seq, &sbwt, &lcs, derand_opts.clone()).iter().map(|x| (x.0, x.1, '+', x.2 + x.3, x.3)).collect(); - - // Add local alignments for reverse _complement - run_lengths.append(&mut kbo::find(&seq.reverse_complement(), &sbwt, &lcs, derand_opts.clone()).iter().map(|x| (x.0, x.1, '-', x.2 + x.3, x.3)).collect()); - - // Sort by q.start - run_lengths.sort_by_key(|x| x.0); - - // Print results with query and ref name added - run_lengths.iter().for_each(|x| { - let _ = writeln!(&mut stdout.lock(), - "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}", - file, ref_name, x.0, x.1, x.2, x.3, x.4, std::str::from_utf8(contig).expect("UTF-8")); - }); - } + indexes.iter().for_each(|((sbwt, lcs), ref_name)| { + let mut reader = needletail::parse_fastx_file(file).expect("valid path/file"); + while let Some(seqrec) = read_from_fastx_parser(&mut *reader) { + let contig = seqrec.id(); + let seq = seqrec.normalize(true); + + // Get local alignments for forward strand + let mut run_lengths: Vec<(usize, usize, char, usize, usize)> = kbo::find(&seq, &sbwt, &lcs, derand_opts.clone()).iter().map(|x| (x.0, x.1, '+', x.2 + x.3, x.3)).collect(); + + // Add local alignments for reverse _complement + run_lengths.append(&mut kbo::find(&seq.reverse_complement(), &sbwt, &lcs, derand_opts.clone()).iter().map(|x| (x.0, x.1, '-', x.2 + x.3, x.3)).collect()); + + // Sort by q.start + run_lengths.sort_by_key(|x| x.0); + + // Print results with query and ref name added + run_lengths.iter().for_each(|x| { + let _ = writeln!(&mut stdout.lock(), + "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}", + file, ref_name, x.0, x.1, x.2, x.3, x.4, std::str::from_utf8(contig).expect("UTF-8")); + }); + } + }); }); }, + Some(cli::Commands::Map { query_files, ref_file, From 61c8b2510a300c6f48c2dd35454a210383f4e079 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Fri, 1 Nov 2024 15:58:19 +0200 Subject: [PATCH 09/44] Add --detailed to process ref contigs separately. --- src/cli/cli.rs | 3 +++ src/cli/main.rs | 27 +++++++++++++++++++-------- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/src/cli/cli.rs b/src/cli/cli.rs index ea01cee..dfc826b 100644 --- a/src/cli/cli.rs +++ b/src/cli/cli.rs @@ -75,6 +75,9 @@ pub enum Commands { // // ... or a prebuilt index #[arg(short = 'i', long = "index", group = "reference", help_heading = "Input")] index_prefix: Option, + // // Concatenate contigs in reference + #[arg(long = "detailed", help_heading = "Input", default_value_t = false)] + detailed: bool, // Parameters // // Upper bound for random match probability diff --git a/src/cli/main.rs b/src/cli/main.rs index 29546c4..fd076df 100644 --- a/src/cli/main.rs +++ b/src/cli/main.rs @@ -121,6 +121,7 @@ fn main() { query_files, ref_file, index_prefix, + detailed, max_error_prob, num_threads, kmer_size, @@ -150,8 +151,18 @@ fn main() { } else if !index_prefix.is_some() && ref_file.is_some() { info!("Building SBWT from file {}...", ref_file.as_ref().unwrap()); - let ref_data = read_fastx_file(ref_file.as_ref() .unwrap()); - indexes.push((kbo::index::build_sbwt_from_vecs(&ref_data, &Some(sbwt_build_options)), ref_file.clone().unwrap())); + if !*detailed { + let ref_data = read_fastx_file(ref_file.as_ref().unwrap()); + indexes.push((kbo::index::build_sbwt_from_vecs(&ref_data, &Some(sbwt_build_options)), ref_file.clone().unwrap())); + } else { + let mut reader = needletail::parse_fastx_file(ref_file.as_ref().unwrap()).expect("valid path/file"); + while let Some(seqrec) = read_from_fastx_parser(&mut *reader) { + let contig = seqrec.id(); + let contig_name = std::str::from_utf8(contig).expect("UTF-8"); + let seq = seqrec.normalize(true); + indexes.push((kbo::index::build_sbwt_from_vecs(&[seq.to_vec()], &Some(sbwt_build_options.clone())), contig_name.to_string())); + } + } } else { panic!("Ambiguous reference, supply only one of `-r/--reference` and `-i/--index`"); @@ -164,13 +175,13 @@ fn main() { .unwrap(); info!("Querying SBWT index..."); - println!("query\tref\tq.start\tq.end\tstrand\tlength\tmismatches\tin.contig"); + println!("query\tref\tq.start\tq.end\tstrand\tlength\tmismatches\tquery.contig\tref.contig"); let stdout = std::io::stdout(); - query_files.par_iter().for_each(|file| { - indexes.iter().for_each(|((sbwt, lcs), ref_name)| { + query_files.iter().for_each(|file| { + indexes.par_iter().for_each(|((sbwt, lcs), ref_contig)| { let mut reader = needletail::parse_fastx_file(file).expect("valid path/file"); while let Some(seqrec) = read_from_fastx_parser(&mut *reader) { - let contig = seqrec.id(); + let query_contig = seqrec.id(); let seq = seqrec.normalize(true); // Get local alignments for forward strand @@ -185,8 +196,8 @@ fn main() { // Print results with query and ref name added run_lengths.iter().for_each(|x| { let _ = writeln!(&mut stdout.lock(), - "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}", - file, ref_name, x.0, x.1, x.2, x.3, x.4, std::str::from_utf8(contig).expect("UTF-8")); + "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}", + file, ref_file.clone().unwrap(), x.0, x.1, x.2, x.3, x.4, ref_contig, std::str::from_utf8(query_contig).expect("UTF-8")); }); } }); From def041682b629938498a0a2f783218791001f3a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Fri, 1 Nov 2024 16:21:41 +0200 Subject: [PATCH 10/44] Process everything before printing in Find. --- src/cli/main.rs | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/src/cli/main.rs b/src/cli/main.rs index fd076df..06d176e 100644 --- a/src/cli/main.rs +++ b/src/cli/main.rs @@ -178,28 +178,30 @@ fn main() { println!("query\tref\tq.start\tq.end\tstrand\tlength\tmismatches\tquery.contig\tref.contig"); let stdout = std::io::stdout(); query_files.iter().for_each(|file| { - indexes.par_iter().for_each(|((sbwt, lcs), ref_contig)| { + let mut run_lengths: Vec<(usize, usize, char, usize, usize, String, String)> = indexes.par_iter().map(|((sbwt, lcs), ref_contig)| { let mut reader = needletail::parse_fastx_file(file).expect("valid path/file"); + let mut res: Vec<(usize, usize, char, usize, usize, String, String)> = Vec::new(); while let Some(seqrec) = read_from_fastx_parser(&mut *reader) { - let query_contig = seqrec.id(); + let query_contig = std::str::from_utf8(seqrec.id()).expect("UTF-8"); let seq = seqrec.normalize(true); // Get local alignments for forward strand - let mut run_lengths: Vec<(usize, usize, char, usize, usize)> = kbo::find(&seq, &sbwt, &lcs, derand_opts.clone()).iter().map(|x| (x.0, x.1, '+', x.2 + x.3, x.3)).collect(); + res = kbo::find(&seq, &sbwt, &lcs, derand_opts.clone()).iter().map(|x| (x.0, x.1, '+', x.2 + x.3, x.3, ref_contig.clone(), query_contig.to_string().clone())).collect(); // Add local alignments for reverse _complement - run_lengths.append(&mut kbo::find(&seq.reverse_complement(), &sbwt, &lcs, derand_opts.clone()).iter().map(|x| (x.0, x.1, '-', x.2 + x.3, x.3)).collect()); + res.append(&mut kbo::find(&seq.reverse_complement(), &sbwt, &lcs, derand_opts.clone()).iter().map(|x| (x.0, x.1, '-', x.2 + x.3, x.3, ref_contig.clone(), query_contig.to_string().clone())).collect()); + } + res + }).flatten().collect(); - // Sort by q.start - run_lengths.sort_by_key(|x| x.0); + // Sort by q.start + run_lengths.sort_by_key(|x| x.0); - // Print results with query and ref name added - run_lengths.iter().for_each(|x| { - let _ = writeln!(&mut stdout.lock(), - "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}", - file, ref_file.clone().unwrap(), x.0, x.1, x.2, x.3, x.4, ref_contig, std::str::from_utf8(query_contig).expect("UTF-8")); - }); - } + // Print results with query and ref name added + run_lengths.iter().for_each(|x| { + let _ = writeln!(&mut stdout.lock(), + "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}", + file, ref_file.clone().unwrap(), x.0, x.1, x.2, x.3, x.4, x.5, x.6); }); }); }, From 55ecadbf062f1b440c608590aea5bac5b0498946 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Fri, 1 Nov 2024 16:26:48 +0200 Subject: [PATCH 11/44] Add --min-len to filter short alignmentes. --- src/cli/cli.rs | 4 ++++ src/cli/main.rs | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/cli/cli.rs b/src/cli/cli.rs index dfc826b..80a46fc 100644 --- a/src/cli/cli.rs +++ b/src/cli/cli.rs @@ -80,6 +80,10 @@ pub enum Commands { detailed: bool, // Parameters + // // Minimum length to report an alignment + #[arg(long = "min-len", default_value_t = 100, help_heading = "Algorithm")] + min_len: u64, + // // Upper bound for random match probability #[arg(long = "max-error-prob", default_value_t = 0.0000001, help_heading = "Algorithm")] max_error_prob: f64, diff --git a/src/cli/main.rs b/src/cli/main.rs index 06d176e..d9027aa 100644 --- a/src/cli/main.rs +++ b/src/cli/main.rs @@ -122,6 +122,7 @@ fn main() { ref_file, index_prefix, detailed, + min_len, max_error_prob, num_threads, kmer_size, @@ -198,7 +199,7 @@ fn main() { run_lengths.sort_by_key(|x| x.0); // Print results with query and ref name added - run_lengths.iter().for_each(|x| { + run_lengths.iter().filter(|x| x.1 - x.0 + 1 >= *min_len as usize).for_each(|x| { let _ = writeln!(&mut stdout.lock(), "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}", file, ref_file.clone().unwrap(), x.0, x.1, x.2, x.3, x.4, x.5, x.6); From 1dee6f0ab36db2f9b44278e5f4a5a3faea757b84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Sat, 2 Nov 2024 10:41:13 +0200 Subject: [PATCH 12/44] Update needletail to v0.6.0, enable gz compression --- Cargo.toml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index d9c30a7..43150bd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -28,9 +28,7 @@ sbwt = "0.3.1" clap = { version = "4", features = ["derive"], optional = true} ### io -#### TODO Re-enable reading compressed sequences in needletail -##### This requires resolving the libllzma linker issue in build_artifacts.yml -needletail = { version = "0.5.1", default-features = false, optional = true } +needletail = { version = "0.6.0", default-features = false, features = ["flate2"], optional = true } ### logging log = { version = "0.4.20", optional = true } From 2859d0abd8a19af668817b468455e2ac8c7ef7fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Mon, 4 Nov 2024 15:44:46 +0200 Subject: [PATCH 13/44] Fix test (see #31). --- src/format.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/format.rs b/src/format.rs index cac9984..2429664 100644 --- a/src/format.rs +++ b/src/format.rs @@ -128,8 +128,13 @@ pub fn run_lengths( /// ## Format a refined translation /// ```rust /// use kbo::format::relative_to_ref; +/// +/// // Ref sequence : A,A,A,G,A,A,C,C,A, T,C,A,G,G,G,C,G +/// // Query sequence : C,A,A,G,-,-,C,C,A,C,T,C,A,G,G,G,-,- +/// // Translation : C,M,M,R,-,-,R,M,M, ,M,M,M,M,M,M,-,- +/// /// let reference: Vec = vec![b'A',b'A',b'A',b'G',b'A',b'A',b'C',b'C',b'A',b'T',b'C',b'A',b'G',b'G',b'G',b'C',b'G']; -/// let refined: Vec = vec!['C','M','M','R','-','-','R','M','M','C','M','M','M','M','M','M','-','-']; +/// let refined: Vec = vec!['C','M','M','R','-','-','R','M','M','M','M','M','M','M','M','-','-']; /// /// let relative = relative_to_ref(&reference, &refined); /// # let expected = vec![b'C',b'A',b'A',b'G',b'-',b'-',b'C',b'C',b'A',b'T',b'C',b'A',b'G',b'G',b'G',b'-',b'-']; From 3ae23ed9e8056734ce097d24d5a9e09b13754c96 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Mon, 4 Nov 2024 16:04:40 +0200 Subject: [PATCH 14/44] Return run_lengths as new RLE struct. --- src/format.rs | 101 +++++++++++++++++++++++++++++++++++++++++++++++--- src/lib.rs | 5 ++- 2 files changed, 98 insertions(+), 8 deletions(-) diff --git a/src/format.rs b/src/format.rs index 2429664..b76da8e 100644 --- a/src/format.rs +++ b/src/format.rs @@ -13,6 +13,53 @@ // //! Converting alignment representations into various output formats. +/// Run length encoding for an alignment segment +#[derive(Clone, Copy, Debug, PartialEq)] +pub struct RLE { + /// Start position (1-based indexing) + pub start: usize, + /// End position (1-based indexing) + pub end: usize, + /// Number of matching bases ('M' or 'R') + pub matches: usize, + /// Number of mismatching bases (not 'M', 'R', or '-') + pub mismatches: usize, + /// Number of _k_-mer jumps (count of double 'R's) + pub jumps: usize, + /// Total number of missing bases ('-') + pub gap_bases: usize, + /// Number of consecutive '-' runs in segment regardless of length + pub gap_opens: usize, +} + +impl Default for RLE { + /// Default to these values: + /// ```rust + /// let mut opts = kbo::format::RLE::default(); + /// opts.start = 0; + /// opts.end = 0; + /// opts.matches = 0; + /// opts.mismatches = 0; + /// opts.jumps = 0; + /// opts.gap_bases = 0; + /// opts.gap_opens = 0; + /// # let expected = kbo::format::RLE::default(); + /// # assert_eq!(opts, expected); + /// ``` + /// + fn default() -> RLE { + RLE { + start: 0, + end: 0, + matches: 0, + mismatches: 0, + jumps: 0, + gap_bases: 0, + gap_opens: 0, + } + } +} + /// Extracts run length encodings from a translated alignment. /// /// Traverses the character representation of the alignment stored in `aln` and @@ -33,6 +80,7 @@ /// ## Extract run lengths from a character representation /// ```rust /// use kbo::format::run_lengths; +/// use kbo::format::RLE; /// /// // Parameters : k = 3, threshold = 2 /// // @@ -46,28 +94,39 @@ /// /// let input: Vec = vec!['X','M','M','R','R','M','M','X','M','M','M','-','-','M','M','M','-','-']; /// let run_lengths = run_lengths(&input); -/// # let expected = vec![(1,11,9,2),(14,16,3,0)]; +/// # let expected = vec![RLE{start: 1, end: 11, matches: 9, mismatches: 2, jumps: 1, gap_bases: 0, gap_opens: 0}, +/// # RLE{start: 14, end: 16, matches: 3, mismatches : 0, jumps : 0, gap_bases : 0, gap_opens : 0}]; /// # assert_eq!(run_lengths, expected); /// ``` /// pub fn run_lengths( aln: &[char], -) -> Vec<(usize, usize, usize, usize)> { - // Store run lengths as Vec<(start, end, matches, mismatches)> - let mut encodings: Vec<(usize, usize, usize, usize)> = Vec::new(); +) -> Vec { + let mut encodings: Vec = Vec::new(); let mut i = 0; let mut match_start: bool = false; while i < aln.len() { match_start = (aln[i] != '-' && aln[i] != ' ') && !match_start; + let mut jumps = 0; if match_start { let start = i; let mut matches: usize = 0; while i < aln.len() && (aln[i] != '-' && aln[i] != ' ') { matches += (aln[i] == 'M' || aln[i] == 'R') as usize; + jumps += (aln[i] == 'R') as usize; i += 1; } - encodings.push((start + 1, i, matches, i - start - matches)); + let rle: RLE = RLE{ + start: start + 1, + end: i, + matches, + mismatches: i - start - matches, + jumps: jumps / 2, + gap_bases: 0, + gap_opens: 0, + }; + encodings.push(rle); match_start = false; } else { i += 1; @@ -168,7 +227,37 @@ pub fn relative_to_ref( mod tests { #[test] fn run_lengths() { - let expected: Vec<(usize, usize, usize, usize)> = vec![(6,33,28,0),(82,207,126,0),(373,423,51,0),(488,512,25,0)]; + use crate::format::RLE; + + let expected: Vec = vec![ + RLE{start: 6, + end: 33, + matches: 28, + mismatches : 0, + jumps : 0, + gap_bases: 0, + gap_opens: 0}, + RLE{start: 82, + end: 207, + matches: 126, + mismatches: 0, + jumps: 0, + gap_bases: 0, + gap_opens: 0}, + RLE{start: 373, + end: 423, + matches: 51, + mismatches: 0, + jumps: 0, + gap_bases: 0, + gap_opens: 0}, + RLE{start: 488, + end: 512, + matches: 25, + mismatches: 0, + jumps: 0, + gap_bases: 0, + gap_opens: 0}]; let input = vec!['-','-','-','-','-','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M']; let got = super::run_lengths(&input); assert_eq!(got, expected); diff --git a/src/lib.rs b/src/lib.rs index 13b8ed1..a0c1219 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -259,6 +259,7 @@ pub fn map( /// use kbo::find; /// use kbo::index::BuildOpts; /// use kbo::derandomize::DerandomizeOpts; +/// use kbo::format::RLE; /// /// let reference: Vec> = vec![vec![b'A',b'A',b'A',b'G',b'A',b'A',b'C',b'C',b'A',b'-',b'T',b'C',b'A',b'G',b'G',b'G',b'C',b'G']]; /// let mut opts = BuildOpts::default(); @@ -269,7 +270,7 @@ pub fn map( /// /// let local_alignments = find(&query, &sbwt, &lcs, DerandomizeOpts::default()); /// // `local_alignments` has [(10, 12, 3, 0)] -/// # assert_eq!(local_alignments, vec![(10, 12, 3, 0)]); +/// # assert_eq!(local_alignments, vec![RLE{start: 10, end: 12, matches: 3, mismatches: 0, jumps: 0, gap_bases: 0, gap_opens: 0}]); /// ``` /// pub fn find( @@ -277,7 +278,7 @@ pub fn find( sbwt: &SbwtIndexVariant, lcs: &sbwt::LcsArray, derand_opts: derandomize::DerandomizeOpts, -) -> Vec<(usize, usize, usize, usize)> { +) -> Vec { let aln = matches(query_seq, sbwt, lcs, derand_opts); format::run_lengths(&aln) } From 3daea1bebafa3ac61657711cd57270fcbb4dd672 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Mon, 4 Nov 2024 16:09:08 +0200 Subject: [PATCH 15/44] Use the new RLE return type in find. --- src/cli/main.rs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/cli/main.rs b/src/cli/main.rs index d9027aa..2e5fb38 100644 --- a/src/cli/main.rs +++ b/src/cli/main.rs @@ -187,10 +187,16 @@ fn main() { let seq = seqrec.normalize(true); // Get local alignments for forward strand - res = kbo::find(&seq, &sbwt, &lcs, derand_opts.clone()).iter().map(|x| (x.0, x.1, '+', x.2 + x.3, x.3, ref_contig.clone(), query_contig.to_string().clone())).collect(); + res = kbo::find(&seq, &sbwt, &lcs, derand_opts.clone()) + .iter() + .map(|x| (x.start, x.end, '+', x.matches + x.mismatches + x.jumps + x.gap_bases, x.mismatches, + ref_contig.clone(), query_contig.to_string().clone())).collect(); // Add local alignments for reverse _complement - res.append(&mut kbo::find(&seq.reverse_complement(), &sbwt, &lcs, derand_opts.clone()).iter().map(|x| (x.0, x.1, '-', x.2 + x.3, x.3, ref_contig.clone(), query_contig.to_string().clone())).collect()); + res.append(&mut kbo::find(&seq.reverse_complement(), &sbwt, &lcs, derand_opts.clone()) + .iter() + .map(|x| (x.start, x.end, '+', x.matches + x.mismatches + x.jumps + x.gap_bases, x.mismatches, + ref_contig.clone(), query_contig.to_string().clone())).collect()); } res }).flatten().collect(); From 0e360017bc8185386fed88e9548d1ab3a1c61467 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Mon, 4 Nov 2024 21:22:45 +0200 Subject: [PATCH 16/44] Update run_lengths return type documentation. --- src/format.rs | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/format.rs b/src/format.rs index b76da8e..43b92df 100644 --- a/src/format.rs +++ b/src/format.rs @@ -68,11 +68,7 @@ impl Default for RLE { /// /// This function can be used for both plain and refined translations. /// -/// Returns a vector where each element contains 4 elements with the contents: -/// - Start position of a run of consecutive characters (indexing starts from 1). -/// - End position of a run of consecutive characters (indexing starts from 1). -/// - Number matching bases in the run (character representation is 'M' or 'R'). -/// - Number mismatching bases in the run (character representation is not 'M', '-', or ' '). +/// Returns a vector of [Run Length Encodings (RLE)](RLE) structs. /// /// The run length is the sum of the matching and mismatching bases. /// From e702fccc575aaa1690649c38356e355054df6a8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Mon, 4 Nov 2024 21:22:57 +0200 Subject: [PATCH 17/44] Add function to extract gapped run length encoding --- src/format.rs | 96 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) diff --git a/src/format.rs b/src/format.rs index 43b92df..996f4cd 100644 --- a/src/format.rs +++ b/src/format.rs @@ -131,6 +131,102 @@ pub fn run_lengths( encodings } +/// Extracts run length encodings while allowing some gaps. +/// +/// Traverses the character representation of the alignment stored in `aln` and +/// counts the consecutive run lengths of sections that align to the reference. +/// A base is counted as aligned if its character representation is not '-' or ' '. +/// +/// Compared to [run_lengths], gapped_run_lengths allows `max_gap_opens` gapped +/// segments (consecutive '-'s) within an alignment block. The gapped segments +/// can be at most `max_gap_len` bases long before the alignment is broken. +/// +/// This function can be used for both plain and refined translations. +/// +/// Returns a vector of [Run Length Encodings (RLE)](RLE) structs. +/// +/// The run length is the sum of the matching and mismatching bases. +/// +/// # Examples +/// ## Extract run lengths from a character representation +/// ```rust +/// use kbo::format::run_lengths_gapped; +/// use kbo::format::RLE; +/// +/// // Parameters : k = 3, threshold = 2 +/// // Parameters : max_gaps = 3, max_gap_len = 3 +/// // +/// // Ref sequence : A,A,A,G,A,A,C,C,A,-,T,C,A, -,-,G,G,G, C,G +/// // Query sequence : C,A,A,G,-,-,C,C,A,C,T,C,A, T,T,G,G,G, T,C +/// // Input MS : 0,1,2,3, 1,2,3,0,1,2,3,-1,0,1,2,3,-1,0 +/// // Translation : X,M,M,R, , ,R,M,M,X,M,M,M, , ,M,M,M, -,- +/// /// +/// // Expected output: [(1, 11, 9, 2), +/// // (14, 16, 3, 0)] +/// +/// let input: Vec = vec!['X','M','M','R','R','M','M','X','M','M','M','-','-','M','M','M','-','-']; +/// let run_lengths = run_lengths_gapped(&input, 3, 3); +/// # let expected = vec![RLE{start: 1, end: 18, matches: 12, mismatches: 2, jumps: 1, gap_bases: 4, gap_opens: 2}]; +/// # assert_eq!(run_lengths, expected); +/// ``` +/// +pub fn run_lengths_gapped( + aln: &[char], + max_gaps: usize, + max_gap_len: usize, +) -> Vec { + let mut encodings: Vec = Vec::new(); + + let mut i = 0; + let mut match_start: bool = false; + while i < aln.len() { + match_start = (aln[i] != '-' && aln[i] != ' ') && !match_start; + let mut jumps = 0; + if match_start { + let mut current_gap_bases = 0; + let mut total_gap_bases = 0; + let mut gap_opens = 0; + let mut gap_start = false; + let start = i; + let mut matches: usize = 0; + while i < aln.len() && (gap_opens <= max_gaps && aln[i] != ' ') { + if aln[i] == '-' && !gap_start { + gap_start = true; + gap_opens += 1; + current_gap_bases = 0; + } + if aln[i] != '-' && gap_start { + gap_start = false; + } + total_gap_bases += (aln[i] == '-') as usize; + current_gap_bases += (aln[i] == '-') as usize; + if current_gap_bases > max_gap_len { + break + } + matches += (aln[i] == 'M' || aln[i] == 'R') as usize; + jumps += (aln[i] == 'R') as usize; + i += 1; + } + if current_gap_bases <= max_gap_len { + let rle: RLE = RLE{ + start: start + 1, + end: i, + matches, + mismatches: i - start - matches - total_gap_bases, + jumps: jumps / 2, + gap_bases: total_gap_bases, + gap_opens, + }; + encodings.push(rle); + } + match_start = false; + } else { + i += 1; + } + } + encodings +} + /// Format a refined translation relative to the reference sequence. /// /// Jointly reads nucleotides from the reference sequence `ref_seq` and the From 3d9a57cd865ccf924703ffe107d0463e38a7e22d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Mon, 4 Nov 2024 22:09:56 +0200 Subject: [PATCH 18/44] Add gapped run length encodings to find(). --- src/cli/main.rs | 8 ++++---- src/lib.rs | 51 +++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 51 insertions(+), 8 deletions(-) diff --git a/src/cli/main.rs b/src/cli/main.rs index 2e5fb38..0f197c4 100644 --- a/src/cli/main.rs +++ b/src/cli/main.rs @@ -141,8 +141,8 @@ fn main() { sbwt_build_options.mem_gb = *mem_gb; sbwt_build_options.temp_dir = temp_dir.clone(); - let mut derand_opts = kbo::derandomize::DerandomizeOpts::default(); - derand_opts.max_error_prob = *max_error_prob; + let mut find_opts = kbo::FindOpts::default(); + find_opts.max_error_prob = *max_error_prob; let mut indexes: Vec<((sbwt::SbwtIndexVariant, sbwt::LcsArray), String)> = Vec::new(); @@ -187,13 +187,13 @@ fn main() { let seq = seqrec.normalize(true); // Get local alignments for forward strand - res = kbo::find(&seq, &sbwt, &lcs, derand_opts.clone()) + res = kbo::find(&seq, &sbwt, &lcs, find_opts.clone()) .iter() .map(|x| (x.start, x.end, '+', x.matches + x.mismatches + x.jumps + x.gap_bases, x.mismatches, ref_contig.clone(), query_contig.to_string().clone())).collect(); // Add local alignments for reverse _complement - res.append(&mut kbo::find(&seq.reverse_complement(), &sbwt, &lcs, derand_opts.clone()) + res.append(&mut kbo::find(&seq.reverse_complement(), &sbwt, &lcs, find_opts.clone()) .iter() .map(|x| (x.start, x.end, '+', x.matches + x.mismatches + x.jumps + x.gap_bases, x.mismatches, ref_contig.clone(), query_contig.to_string().clone())).collect()); diff --git a/src/lib.rs b/src/lib.rs index a0c1219..44b605e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -94,6 +94,40 @@ pub mod format; pub mod index; pub mod translate; +/// Options and parameters for [Find](find) +#[non_exhaustive] +#[derive(Copy, Clone, Debug, PartialEq)] +pub struct FindOpts { + /// Prefix match lengths with probability higher than `max_error_prob` to + /// happen at random are considered noise. + pub max_error_prob: f64, + /// Maximum number of gap segments (gap opens) allowed before splitting an + /// alignment. + pub max_gaps: usize, + /// Maximum length of a single gap segment before splitting an alignment. + pub max_gap_len: usize, +} + +impl Default for FindOpts { + /// Default to these values: + /// ```rust + /// let mut opts = kbo::FindOpts::default(); + /// opts.max_error_prob = 0.0000001; + /// opts.max_gaps = 0; + /// opts.max_gap_len = 0; + /// # let expected = kbo::FindOpts::default(); + /// # assert_eq!(opts, expected); + /// ``` + /// + fn default() -> FindOpts { + FindOpts { + max_error_prob: 0.0000001, + max_gaps: 0, + max_gap_len: 0, + } + } +} + /// Builds an SBWT index from some fasta or fastq files. /// /// Reads all sequence data in `seq_files` and builds an SBWT index @@ -254,11 +288,14 @@ pub fn map( /// 4. Number of mismatches and 1-character insertions in the block. /// /// # Examples +/// +/// TODO Add better examples to find() +/// /// ```rust /// use kbo::build; /// use kbo::find; +/// use kbo::FindOpts; /// use kbo::index::BuildOpts; -/// use kbo::derandomize::DerandomizeOpts; /// use kbo::format::RLE; /// /// let reference: Vec> = vec![vec![b'A',b'A',b'A',b'G',b'A',b'A',b'C',b'C',b'A',b'-',b'T',b'C',b'A',b'G',b'G',b'G',b'C',b'G']]; @@ -268,7 +305,7 @@ pub fn map( /// /// let query = vec![b'G',b'T',b'G',b'A',b'C',b'T',b'A',b'T',b'G',b'A',b'G',b'G',b'A',b'T']; /// -/// let local_alignments = find(&query, &sbwt, &lcs, DerandomizeOpts::default()); +/// let local_alignments = find(&query, &sbwt, &lcs, FindOpts::default()); /// // `local_alignments` has [(10, 12, 3, 0)] /// # assert_eq!(local_alignments, vec![RLE{start: 10, end: 12, matches: 3, mismatches: 0, jumps: 0, gap_bases: 0, gap_opens: 0}]); /// ``` @@ -277,8 +314,14 @@ pub fn find( query_seq: &[u8], sbwt: &SbwtIndexVariant, lcs: &sbwt::LcsArray, - derand_opts: derandomize::DerandomizeOpts, + find_opts: FindOpts, ) -> Vec { + let mut derand_opts = derandomize::DerandomizeOpts::default(); + derand_opts.max_error_prob = find_opts.max_error_prob.clone(); let aln = matches(query_seq, sbwt, lcs, derand_opts); - format::run_lengths(&aln) + if find_opts.max_gaps > 0 || find_opts.max_gap_len > 0 { + format::run_lengths_gapped(&aln, find_opts.max_gaps, find_opts.max_gap_len) + } else { + format::run_lengths(&aln) + } } From d064a8eb5a34763435b4217621dbe987b4c8d147 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Mon, 4 Nov 2024 22:24:11 +0200 Subject: [PATCH 19/44] Move the derandomize opts to lib.rs, differentiate --- src/cli/main.rs | 6 ++-- src/derandomize.rs | 26 ---------------- src/lib.rs | 74 ++++++++++++++++++++++++++++++++++++++-------- 3 files changed, 65 insertions(+), 41 deletions(-) diff --git a/src/cli/main.rs b/src/cli/main.rs index 0f197c4..ce12140 100644 --- a/src/cli/main.rs +++ b/src/cli/main.rs @@ -238,8 +238,8 @@ fn main() { sbwt_build_options.mem_gb = *mem_gb; sbwt_build_options.temp_dir = temp_dir.clone(); - let mut derand_opts = kbo::derandomize::DerandomizeOpts::default(); - derand_opts.max_error_prob = *max_error_prob; + let mut map_opts = kbo::MapOpts::default(); + map_opts.max_error_prob = *max_error_prob; rayon::ThreadPoolBuilder::new() .num_threads(*num_threads) @@ -256,7 +256,7 @@ fn main() { let mut res: Vec = Vec::new(); ref_data.iter().for_each(|ref_contig| { - res.append(&mut kbo::map(ref_contig, &sbwt, &lcs, derand_opts.clone())); + res.append(&mut kbo::map(ref_contig, &sbwt, &lcs, map_opts.clone())); }); let _ = writeln!(&mut stdout.lock(), diff --git a/src/derandomize.rs b/src/derandomize.rs index fd0040d..ef17d4e 100644 --- a/src/derandomize.rs +++ b/src/derandomize.rs @@ -15,32 +15,6 @@ use embed_doc_image::embed_doc_image; -/// Controls parameters of the derandomization algorithm. -/// -#[derive(Clone, Debug)] -#[non_exhaustive] -pub struct DerandomizeOpts { - /// Prefix match lengths with probability higher than `max_error_prob` to - /// happen at random are considered noise. - pub max_error_prob: f64, -} - -impl Default for DerandomizeOpts { - /// Default to these values: - /// ```rust - /// let mut opts = kbo::derandomize::DerandomizeOpts::default(); - /// opts.max_error_prob = 0.0000001; - /// # let expected = kbo::derandomize::DerandomizeOpts::default(); - /// # assert_eq!(opts.max_error_prob, expected.max_error_prob); - /// ``` - /// - fn default() -> DerandomizeOpts { - DerandomizeOpts { - max_error_prob: 0.0000001, - } - } -} - /// Evaluates the CDF of _k_-bounded matching statistics random match distribution. /// /// Computes the log-probability that a matching statistic with value diff --git a/src/lib.rs b/src/lib.rs index 44b605e..6e37138 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -94,7 +94,7 @@ pub mod format; pub mod index; pub mod translate; -/// Options and parameters for [Find](find) +/// Options and parameters for [find] #[non_exhaustive] #[derive(Copy, Clone, Debug, PartialEq)] pub struct FindOpts { @@ -128,6 +128,56 @@ impl Default for FindOpts { } } +/// Options and parameters for [matches](matches()) +#[non_exhaustive] +#[derive(Copy, Clone, Debug, PartialEq)] +pub struct MatchOpts { + /// Prefix match lengths with probability higher than `max_error_prob` to + /// happen at random are considered noise. + pub max_error_prob: f64, +} + +impl Default for MatchOpts { + /// Default to these values: + /// ```rust + /// let mut opts = kbo::MatchOpts::default(); + /// opts.max_error_prob = 0.0000001; + /// # let expected = kbo::MatchOpts::default(); + /// # assert_eq!(opts, expected); + /// ``` + /// + fn default() -> MatchOpts { + MatchOpts { + max_error_prob: 0.0000001, + } + } +} + +/// Options and parameters for [map] +#[non_exhaustive] +#[derive(Copy, Clone, Debug, PartialEq)] +pub struct MapOpts { + /// Prefix match lengths with probability higher than `max_error_prob` to + /// happen at random are considered noise. + pub max_error_prob: f64, +} + +impl Default for MapOpts { + /// Default to these values: + /// ```rust + /// let mut opts = kbo::MapOpts::default(); + /// opts.max_error_prob = 0.0000001; + /// # let expected = kbo::MapOpts::default(); + /// # assert_eq!(opts, expected); + /// ``` + /// + fn default() -> MapOpts { + MapOpts { + max_error_prob: 0.0000001, + } + } +} + /// Builds an SBWT index from some fasta or fastq files. /// /// Reads all sequence data in `seq_files` and builds an SBWT index @@ -191,7 +241,7 @@ pub fn build( /// use kbo::build; /// use kbo::matches; /// use kbo::index::BuildOpts; -/// use kbo::derandomize::DerandomizeOpts; +/// use kbo::MatchOpts; /// /// let reference: Vec> = vec![vec![b'A',b'A',b'A',b'G',b'A',b'A',b'C',b'C',b'A',b'-',b'T',b'C',b'A',b'G',b'G',b'G',b'C',b'G']]; /// let mut opts = BuildOpts::default(); @@ -200,7 +250,7 @@ pub fn build( /// /// let query = vec![b'G',b'T',b'G',b'A',b'C',b'T',b'A',b'T',b'G',b'A',b'G',b'G',b'A',b'T']; /// -/// let ms_vectors = matches(&query, &sbwt, &lcs, DerandomizeOpts::default()); +/// let ms_vectors = matches(&query, &sbwt, &lcs, MatchOpts::default()); /// // `ms_vectors` has ['-','-','-','-','-','-','-','-','-','M','M','M','-','-'] /// # assert_eq!(ms_vectors, vec!['-','-','-','-','-','-','-','-','-','M','M','M','-','-']); /// ``` @@ -209,11 +259,11 @@ pub fn matches( query_seq: &[u8], sbwt: &SbwtIndexVariant, lcs: &sbwt::LcsArray, - derand_opts: derandomize::DerandomizeOpts, + match_opts: MatchOpts, ) -> Vec { let (k, threshold) = match sbwt { SbwtIndexVariant::SubsetMatrix(ref sbwt) => { - (sbwt.k(), derandomize::random_match_threshold(sbwt.k(), sbwt.n_kmers(), 4_usize, derand_opts.max_error_prob)) + (sbwt.k(), derandomize::random_match_threshold(sbwt.k(), sbwt.n_kmers(), 4_usize, match_opts.max_error_prob)) }, }; @@ -237,7 +287,7 @@ pub fn matches( /// use kbo::build; /// use kbo::map; /// use kbo::index::BuildOpts; -/// use kbo::derandomize::DerandomizeOpts; +/// use kbo::MapOpts; /// /// let query: Vec> = vec![vec![b'A',b'A',b'A',b'G',b'A',b'A',b'C',b'C',b'A',b'-',b'T',b'C',b'A',b'G',b'G',b'G',b'C',b'G']]; /// let mut opts = BuildOpts::default(); @@ -247,7 +297,7 @@ pub fn matches( /// /// let reference = vec![b'G',b'T',b'G',b'A',b'C',b'T',b'A',b'T',b'G',b'A',b'G',b'G',b'A',b'T']; /// -/// let alignment = map(&reference, &sbwt_query, &lcs_query, DerandomizeOpts::default()); +/// let alignment = map(&reference, &sbwt_query, &lcs_query, MapOpts::default()); /// // `ms_vectors` has [45,45,45,45,45,45,45,45,45,65,71,71,45,45] /// # assert_eq!(alignment, vec![45,45,45,45,45,45,45,45,45,65,71,71,45,45]); /// ``` @@ -256,11 +306,11 @@ pub fn map( ref_seq: &[u8], query_sbwt: &SbwtIndexVariant, query_lcs: &sbwt::LcsArray, - derand_opts: derandomize::DerandomizeOpts, + map_opts: MapOpts, ) -> Vec { let (k, threshold) = match query_sbwt { SbwtIndexVariant::SubsetMatrix(ref sbwt) => { - (sbwt.k(), derandomize::random_match_threshold(sbwt.k(), sbwt.n_kmers(), 4_usize, derand_opts.max_error_prob)) + (sbwt.k(), derandomize::random_match_threshold(sbwt.k(), sbwt.n_kmers(), 4_usize, map_opts.max_error_prob)) }, }; @@ -316,9 +366,9 @@ pub fn find( lcs: &sbwt::LcsArray, find_opts: FindOpts, ) -> Vec { - let mut derand_opts = derandomize::DerandomizeOpts::default(); - derand_opts.max_error_prob = find_opts.max_error_prob.clone(); - let aln = matches(query_seq, sbwt, lcs, derand_opts); + let mut match_opts = MatchOpts::default(); + match_opts.max_error_prob = find_opts.max_error_prob.clone(); + let aln = matches(query_seq, sbwt, lcs, match_opts); if find_opts.max_gaps > 0 || find_opts.max_gap_len > 0 { format::run_lengths_gapped(&aln, find_opts.max_gaps, find_opts.max_gap_len) } else { From 984e140190dcf923fd6e679243bbdf8a3ad10466 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Mon, 4 Nov 2024 22:47:37 +0200 Subject: [PATCH 20/44] Add gapped alignment to CLI. --- src/cli/cli.rs | 4 ++++ src/cli/main.rs | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/src/cli/cli.rs b/src/cli/cli.rs index 80a46fc..957edf3 100644 --- a/src/cli/cli.rs +++ b/src/cli/cli.rs @@ -83,6 +83,10 @@ pub enum Commands { // // Minimum length to report an alignment #[arg(long = "min-len", default_value_t = 100, help_heading = "Algorithm")] min_len: u64, + #[arg(long = "max-gaps", default_value_t = 0, help_heading = "Algorithm")] + max_gaps: u64, + #[arg(long = "max-gap-len", default_value_t = 0, help_heading = "Algorithm")] + max_gap_len: u64, // // Upper bound for random match probability #[arg(long = "max-error-prob", default_value_t = 0.0000001, help_heading = "Algorithm")] diff --git a/src/cli/main.rs b/src/cli/main.rs index ce12140..26cbf7f 100644 --- a/src/cli/main.rs +++ b/src/cli/main.rs @@ -123,6 +123,8 @@ fn main() { index_prefix, detailed, min_len, + max_gap_len, + max_gaps, max_error_prob, num_threads, kmer_size, @@ -143,6 +145,8 @@ fn main() { let mut find_opts = kbo::FindOpts::default(); find_opts.max_error_prob = *max_error_prob; + find_opts.max_gap_len = *max_gap_len as usize; + find_opts.max_gaps = *max_gaps as usize; let mut indexes: Vec<((sbwt::SbwtIndexVariant, sbwt::LcsArray), String)> = Vec::new(); From d839eee0e5b8a325b44b9213c6c5ff4629be8069 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Tue, 5 Nov 2024 11:05:31 +0200 Subject: [PATCH 21/44] Tabs -> spaces. --- src/lib.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 6e37138..b9beafd 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -262,9 +262,9 @@ pub fn matches( match_opts: MatchOpts, ) -> Vec { let (k, threshold) = match sbwt { - SbwtIndexVariant::SubsetMatrix(ref sbwt) => { - (sbwt.k(), derandomize::random_match_threshold(sbwt.k(), sbwt.n_kmers(), 4_usize, match_opts.max_error_prob)) - }, + SbwtIndexVariant::SubsetMatrix(ref sbwt) => { + (sbwt.k(), derandomize::random_match_threshold(sbwt.k(), sbwt.n_kmers(), 4_usize, match_opts.max_error_prob)) + }, }; let noisy_ms: Vec = index::query_sbwt(query_seq, sbwt, lcs).iter().map(|x| x.0).collect(); @@ -309,9 +309,9 @@ pub fn map( map_opts: MapOpts, ) -> Vec { let (k, threshold) = match query_sbwt { - SbwtIndexVariant::SubsetMatrix(ref sbwt) => { - (sbwt.k(), derandomize::random_match_threshold(sbwt.k(), sbwt.n_kmers(), 4_usize, map_opts.max_error_prob)) - }, + SbwtIndexVariant::SubsetMatrix(ref sbwt) => { + (sbwt.k(), derandomize::random_match_threshold(sbwt.k(), sbwt.n_kmers(), 4_usize, map_opts.max_error_prob)) + }, }; let noisy_ms = index::query_sbwt(ref_seq, query_sbwt, query_lcs); From cfe0ea964ca862f0ef1468ed5e2e47ff33b9bca1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Tue, 5 Nov 2024 11:38:15 +0200 Subject: [PATCH 22/44] Document 2/3 use cases for find. --- src/lib.rs | 105 +++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 93 insertions(+), 12 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index b9beafd..2fd61fc 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -19,14 +19,14 @@ //! Currently, kbo supports two main operations: //! //! - `kbo find` [matches](matches()) the _k_-mers in a query sequence with the -//! reference and reports the local alignment segments found within the -//! reference. Find is useful for problems that can be solved with -//! [blast](https://blast.ncbi.nlm.nih.gov/Blast.cgi). +//! reference and reports the local alignment segments found within the +//! reference. Find is useful for problems that can be solved with +//! [blast](https://blast.ncbi.nlm.nih.gov/Blast.cgi). //! - `kbo map` [maps](map()) the query sequence against a reference -//! sequence, and reports the nucleotide sequence of the alignment relative to -//! the reference. Map solves the same problem as -//! [snippy](https://github.com/tseemann/snippy) and [ska -//! map](https://docs.rs/ska/latest/ska/#ska-map). +//! sequence, and reports the nucleotide sequence of the alignment relative to +//! the reference. Map solves the same problem as +//! [snippy](https://github.com/tseemann/snippy) and [ska +//! map](https://docs.rs/ska/latest/ska/#ska-map). //! //! kbo uses the [Spectral Burrows-Wheeler //! Transform](https://docs.rs/sbwt/latest/sbwt/) data structure that allows @@ -43,6 +43,10 @@ //! relevant in `kbo find` analyses where the reference _k_-mers can be //! concatenated into a single contig. //! +//! kbo can read inputs compressed in the DEFLATE format (gzip, zlib, etc.). +//! bzip2 and xz support can be enabled by adding the "bzip2" and "xz" feature +//! flags to [needletail](https://docs.rs/needletail) in the kbo Cargo.toml. +//! //! ## kbo find //! //! To set up the example, download the fasta sequence of the [_Escherichia @@ -58,14 +62,91 @@ //! ```text //! kbo find --reference db.fasta GCF_000714595.1_ASM71459v1_genomic.fna //! ``` -//! This will produce the output -//! +//!
+//! +//! This will produce the output (click to expand) +//! +//! +//! |query|ref|q.start|q.end|strand|length|mismatches|query.contig|ref.contig| +//! |-----|---|-------|-----|------|------|----------|------------|----------| +//! |GCF_000714595.1_ASM71459v1_genomic.fna|db.fasta|226708|227226|+|519|0|db.fasta|NZ_CP007799.1 Escherichia coli Nissle 1917 chromosome, complete genome| +//! |GCF_000714595.1_ASM71459v1_genomic.fna|db.fasta|2289596|2290543|+|949|0|db.fasta|NZ_CP007799.1 Escherichia coli Nissle 1917 chromosome, complete genome| +//! |GCF_000714595.1_ASM71459v1_genomic.fna|db.fasta|3152039|3161660|+|9623|1|db.fasta|NZ_CP007799.1 Escherichia coli Nissle 1917 chromosome, complete genome| +//! |GCF_000714595.1_ASM71459v1_genomic.fna|db.fasta|3161701|3164301|+|2601|0|db.fasta|NZ_CP007799.1 Escherichia coli Nissle 1917 chromosome, complete genome| +//! |GCF_000714595.1_ASM71459v1_genomic.fna|db.fasta|3164311|3165180|+|870|0|db.fasta|NZ_CP007799.1 Escherichia coli Nissle 1917 chromosome, complete genome| +//! |GCF_000714595.1_ASM71459v1_genomic.fna|db.fasta|3165210|3165458|+|249|0|db.fasta|NZ_CP007799.1 Escherichia coli Nissle 1917 chromosome, complete genome| +//! |GCF_000714595.1_ASM71459v1_genomic.fna|db.fasta|3165462|3167857|+|2397|1|db.fasta|NZ_CP007799.1 Escherichia coli Nissle 1917 chromosome, complete genome| +//! |GCF_000714595.1_ASM71459v1_genomic.fna|db.fasta|3167905|3172701|+|4797|2|db.fasta|NZ_CP007799.1 Escherichia coli Nissle 1917 chromosome, complete genome| +//! |GCF_000714595.1_ASM71459v1_genomic.fna|db.fasta|3172751|3175783|+|3033|0|db.fasta|NZ_CP007799.1 Escherichia coli Nissle 1917 chromosome, complete genome| +//! |GCF_000714595.1_ASM71459v1_genomic.fna|db.fasta|3175827|3182327|+|6501|0|db.fasta|NZ_CP007799.1 Escherichia coli Nissle 1917 chromosome, complete genome| +//! |GCF_000714595.1_ASM71459v1_genomic.fna|db.fasta|3182338|3190258|+|7922|1|db.fasta|NZ_CP007799.1 Escherichia coli Nissle 1917 chromosome, complete genome| +//! |GCF_000714595.1_ASM71459v1_genomic.fna|db.fasta|3190320|3196124|+|5807|1|db.fasta|NZ_CP007799.1 Escherichia coli Nissle 1917 chromosome, complete genome| +//! |GCF_000714595.1_ASM71459v1_genomic.fna|db.fasta|3196155|3198614|+|2460|0|db.fasta|NZ_CP007799.1 Escherichia coli Nissle 1917 chromosome, complete genome| +//! |GCF_000714595.1_ASM71459v1_genomic.fna|db.fasta|3198627|3200856|+|2231|0|db.fasta|NZ_CP007799.1 Escherichia coli Nissle 1917 chromosome, complete genome| +//! |GCF_000714595.1_ASM71459v1_genomic.fna|db.fasta|3200891|3201403|+|513|1|db.fasta|NZ_CP007799.1 Escherichia coli Nissle 1917 chromosome, complete genome| +//! |GCF_000714595.1_ASM71459v1_genomic.fna|db.fasta|4502887|4503405|+|519|0|db.fasta|NZ_CP007799.1 Escherichia coli Nissle 1917 chromosome, complete genome| +//! |GCF_000714595.1_ASM71459v1_genomic.fna|db.fasta|5145962|5147210|+|1249|0|db.fasta|NZ_CP007799.1 Escherichia coli Nissle 1917 chromosome, complete genome| +//! |GCF_000714595.1_ASM71459v1_genomic.fna|db.fasta|5147272|5149449|+|2179|0|db.fasta|NZ_CP007799.1 Escherichia coli Nissle 1917 chromosome, complete genome| +//! |GCF_000714595.1_ASM71459v1_genomic.fna|db.fasta|5351015|5351533|+|519|0|db.fasta|NZ_CP007799.1 Escherichia coli Nissle 1917 chromosome, complete genome| +//! |GCF_000714595.1_ASM71459v1_genomic.fna|db.fasta|5352280|5352503|+|224|0|db.fasta|NZ_CP007799.1 Escherichia coli Nissle 1917 chromosome, complete genome| +//! |GCF_000714595.1_ASM71459v1_genomic.fna|db.fasta|5354674|5356713|+|2040|1|db.fasta|NZ_CP007799.1 Escherichia coli Nissle 1917 chromosome, complete genome| +//! |GCF_000714595.1_ASM71459v1_genomic.fna|db.fasta|5381795|5381945|+|151|0|db.fasta|NZ_CP007799.1 Escherichia coli Nissle 1917 chromosome, complete genome| +//! +//! +//!
+//! +//! ### Find gene sequence locations with names +//! If you need to know which gene in db.fasta the matches are for, add the `--detailed` toggle: //! ```text -//! TODO add output +//! kbo find --detailed --reference db.fasta GCF_000714595.1_ASM71459v1_genomic.fna //! ``` //! -//! ### Find presence/absence of gene sequences -//! Alternatively, if you are only interested in containment of the `db.fasta` genes in the assembly, run +//!
+//! +//! This replaces the query.contig column with the name of the contig (click to expand) +//! +//! +//! |query|ref|q.start|q.end|strand|length|mismatches|query.contig|ref.contig| +//! |-----|---|-------|-----|------|------|----------|------------|----------| +//! |GCF_000714595.1_ASM71459v1_genomic.fna|db.fasta|226708|227226|+|519|0|clbS-like_4ce09a|NZ_CP007799.1 Escherichia coli Nissle 1917 chromosome, complete genome| +//! |GCF_000714595.1_ASM71459v1_genomic.fna|db.fasta|2289596|2289808|+|213|0|clbR locus_tag=ECOK1_RS11410 product="colibactin biosynthesis LuxR family transcriptional regulator ClbR" protein_id=WP_000357141.1|NZ_CP007799.1 Escherichia coli Nissle 1917 chromosome, complete genome| +//! |GCF_000714595.1_ASM71459v1_genomic.fna|db.fasta|2289809|2290543|+|735|0|clbA locus_tag=ECOK1_RS11415 product="colibactin biosynthesis phosphopantetheinyl transferase ClbA" protein_id=WP_001217110.1|NZ_CP007799.1 Escherichia coli Nissle 1917 chromosome, complete genome| +//! |GCF_000714595.1_ASM71459v1_genomic.fna|db.fasta|3152039|3161660|+|9623|1|clbB locus_tag=ECOK1_RS11405 product="colibactin hybrid non-ribosomal peptide synthetase/type I polyketide synthase ClbB" protein_id=WP_001518711.1|NZ_CP007799.1 Escherichia coli Nissle 1917 chromosome, complete genome| +//! |GCF_000714595.1_ASM71459v1_genomic.fna|db.fasta|3161701|3164301|+|2601|0|clbC locus_tag=ECOK1_RS11400 product="colibactin polyketide synthase ClbC" protein_id=WP_001297908.1|NZ_CP007799.1 Escherichia coli Nissle 1917 chromosome, complete genome| +//! |GCF_000714595.1_ASM71459v1_genomic.fna|db.fasta|3164311|3165180|+|870|0|clbD locus_tag=ECOK1_RS11395 product="colibactin biosynthesis dehydrogenase ClbD" protein_id=WP_000982270.1|NZ_CP007799.1 Escherichia coli Nissle 1917 chromosome, complete genome| +//! |GCF_000714595.1_ASM71459v1_genomic.fna|db.fasta|3165210|3165458|+|249|0|clbE locus_tag=ECOK1_RS11390 product="colibactin biosynthesis aminomalonyl-acyl carrier protein ClbE" protein_id=WP_001297917.1|NZ_CP007799.1 Escherichia coli Nissle 1917 chromosome, complete genome| +//! |GCF_000714595.1_ASM71459v1_genomic.fna|db.fasta|3165462|3166592|+|1131|0|clbF locus_tag=ECOK1_RS11385 product="colibactin biosynthesis dehydrogenase ClbF" protein_id=WP_000337350.1|NZ_CP007799.1 Escherichia coli Nissle 1917 chromosome, complete genome| +//! |GCF_000714595.1_ASM71459v1_genomic.fna|db.fasta|3166589|3167857|+|1269|1|clbG locus_tag=ECOK1_RS11380 product="colibactin biosynthesis acyltransferase ClbG" protein_id=WP_000159201.1|NZ_CP007799.1 Escherichia coli Nissle 1917 chromosome, complete genome| +//! |GCF_000714595.1_ASM71459v1_genomic.fna|db.fasta|3167905|3172701|+|4797|2|clbH locus_tag=ECOK1_RS11375 product="colibactin non-ribosomal peptide synthetase ClbH" protein_id=WP_001304254.1|NZ_CP007799.1 Escherichia coli Nissle 1917 chromosome, complete genome| +//! |GCF_000714595.1_ASM71459v1_genomic.fna|db.fasta|3172751|3175783|+|3033|0|clbI locus_tag=ECOK1_RS11370 product="colibactin polyketide synthase ClbI" protein_id=WP_000829570.1|NZ_CP007799.1 Escherichia coli Nissle 1917 chromosome, complete genome| +//! |GCF_000714595.1_ASM71459v1_genomic.fna|db.fasta|3175827|3182327|+|6501|0|clbJ locus_tag=ECOK1_RS11365 product="colibactin non-ribosomal peptide synthetase ClbJ" protein_id=WP_001468003.1|NZ_CP007799.1 Escherichia coli Nissle 1917 chromosome, complete genome| +//! |GCF_000714595.1_ASM71459v1_genomic.fna|db.fasta|3180417|3181703|+|1287|2|clbK locus_tag=ECOK1_RS11360 product="colibactin hybrid non-ribosomal peptide synthetase/type I polyketide synthase ClbK" protein_id=WP_000222467.1|NZ_CP007799.1 Escherichia coli Nissle 1917 chromosome, complete genome| +//! |GCF_000714595.1_ASM71459v1_genomic.fna|db.fasta|3182338|3188802|+|6465|2|clbK locus_tag=ECOK1_RS11360 product="colibactin hybrid non-ribosomal peptide synthetase/type I polyketide synthase ClbK" protein_id=WP_000222467.1|NZ_CP007799.1 Escherichia coli Nissle 1917 chromosome, complete genome| +//! |GCF_000714595.1_ASM71459v1_genomic.fna|db.fasta|3186070|3187356|+|1287|1|clbJ locus_tag=ECOK1_RS11365 product="colibactin non-ribosomal peptide synthetase ClbJ" protein_id=WP_001468003.1|NZ_CP007799.1 Escherichia coli Nissle 1917 chromosome, complete genome| +//! |GCF_000714595.1_ASM71459v1_genomic.fna|db.fasta|3188795|3190258|+|1464|0|clbL locus_tag=ECOK1_RS11355 product="colibactin biosynthesis amidase ClbL" protein_id=WP_001297937.1|NZ_CP007799.1 Escherichia coli Nissle 1917 chromosome, complete genome| +//! |GCF_000714595.1_ASM71459v1_genomic.fna|db.fasta|3190320|3191759|+|1440|0|clbM locus_tag=ECOK1_RS11350 product="precolibactin export MATE transporter ClbM" protein_id=WP_000217768.1|NZ_CP007799.1 Escherichia coli Nissle 1917 chromosome, complete genome| +//! |GCF_000714595.1_ASM71459v1_genomic.fna|db.fasta|3191756|3196124|+|4370|1|clbN locus_tag=ECOK1_RS11345 product="colibactin non-ribosomal peptide synthetase ClbN" protein_id=WP_001327259.1|NZ_CP007799.1 Escherichia coli Nissle 1917 chromosome, complete genome| +//! |GCF_000714595.1_ASM71459v1_genomic.fna|db.fasta|3196155|3198614|+|2460|0|clbO locus_tag=ECOK1_RS11340 product="colibactin polyketide synthase ClbO" protein_id=WP_001029878.1|NZ_CP007799.1 Escherichia coli Nissle 1917 chromosome, complete genome| +//! |GCF_000714595.1_ASM71459v1_genomic.fna|db.fasta|3198627|3200141|+|1515|0|clbP locus_tag=ECOK1_RS11335 product="precolibactin peptidase ClbP" protein_id=WP_002430641.1|NZ_CP007799.1 Escherichia coli Nissle 1917 chromosome, complete genome| +//! |GCF_000714595.1_ASM71459v1_genomic.fna|db.fasta|3200134|3200856|+|723|0|clbQ locus_tag=ECOK1_RS11330 product="colibactin biosynthesis thioesterase ClbQ" protein_id=WP_000065646.1|NZ_CP007799.1 Escherichia coli Nissle 1917 chromosome, complete genome| +//! |GCF_000714595.1_ASM71459v1_genomic.fna|db.fasta|3200891|3201403|+|513|1|clbS locus_tag=ECOK1_RS11325 product="colibactin self-protection protein ClbS" protein_id=WP_000290498.1|NZ_CP007799.1 Escherichia coli Nissle 1917 chromosome, complete genome| +//! |GCF_000714595.1_ASM71459v1_genomic.fna|db.fasta|4502887|4503405|+|519|0|clbS-like_4ce09a|NZ_CP007799.1 Escherichia coli Nissle 1917 chromosome, complete genome| +//! |GCF_000714595.1_ASM71459v1_genomic.fna|db.fasta|5145962|5147210|+|1249|0|clbL locus_tag=ECOK1_RS11355 product="colibactin biosynthesis amidase ClbL" protein_id=WP_001297937.1|NZ_CP007799.1 Escherichia coli Nissle 1917 chromosome, complete genome| +//! |GCF_000714595.1_ASM71459v1_genomic.fna|db.fasta|5147272|5148479|+|1208|0|clbM locus_tag=ECOK1_RS11350 product="precolibactin export MATE transporter ClbM" protein_id=WP_000217768.1|NZ_CP007799.1 Escherichia coli Nissle 1917 chromosome, complete genome| +//! |GCF_000714595.1_ASM71459v1_genomic.fna|db.fasta|5148478|5149449|+|972|0|clbN locus_tag=ECOK1_RS11345 product="colibactin non-ribosomal peptide synthetase ClbN" protein_id=WP_001327259.1|NZ_CP007799.1 Escherichia coli Nissle 1917 chromosome, complete genome| +//! |GCF_000714595.1_ASM71459v1_genomic.fna|db.fasta|5351015|5351533|+|519|0|clbS-like_4ce09a|NZ_CP007799.1 Escherichia coli Nissle 1917 chromosome, complete genome| +//! |GCF_000714595.1_ASM71459v1_genomic.fna|db.fasta|5352280|5352503|+|224|0|clbS-like_4ce09a|NZ_CP007799.1 Escherichia coli Nissle 1917 chromosome, complete genome| +//! |GCF_000714595.1_ASM71459v1_genomic.fna|db.fasta|5354674|5356713|+|2040|1|clbN locus_tag=ECOK1_RS11345 product="colibactin non-ribosomal peptide synthetase ClbN" protein_id=WP_001327259.1|NZ_CP007799.1 Escherichia coli Nissle 1917 chromosome, complete genome| +//! |GCF_000714595.1_ASM71459v1_genomic.fna|db.fasta|5381795|5381945|+|151|0|clbS-like_4ce09a|NZ_CP007799.1 Escherichia coli Nissle 1917 chromosome, complete genome| +//! +//!
+//! +//! Note that the current implementation `--detailed` significantly slows down +//! the algorithm. Future versions of kbo may address this by incorporating +//! colors in the index structure. +//! +//! ### Find containment of gene sequences in assembly +//! Alternatively, if you are only interested in whether the contigs in `db.fasta` are present in the assembly, run //! ```text //! kbo find --reference GCF_000714595.1_ASM71459v1_genomic.fna db.fasta //! ``` From 90a501686d4b700ee69dcec74e598c97aaf43e3f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Tue, 5 Nov 2024 13:00:27 +0200 Subject: [PATCH 23/44] Fix processing queries with multiple contigs. --- src/cli/main.rs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/cli/main.rs b/src/cli/main.rs index 26cbf7f..130bafb 100644 --- a/src/cli/main.rs +++ b/src/cli/main.rs @@ -189,12 +189,11 @@ fn main() { while let Some(seqrec) = read_from_fastx_parser(&mut *reader) { let query_contig = std::str::from_utf8(seqrec.id()).expect("UTF-8"); let seq = seqrec.normalize(true); - // Get local alignments for forward strand - res = kbo::find(&seq, &sbwt, &lcs, find_opts.clone()) - .iter() - .map(|x| (x.start, x.end, '+', x.matches + x.mismatches + x.jumps + x.gap_bases, x.mismatches, - ref_contig.clone(), query_contig.to_string().clone())).collect(); + res.append(&mut kbo::find(&seq, &sbwt, &lcs, find_opts.clone()) + .iter() + .map(|x| (x.start, x.end, '+', x.matches + x.mismatches + x.jumps + x.gap_bases, x.mismatches, + ref_contig.clone(), query_contig.to_string().clone())).collect()); // Add local alignments for reverse _complement res.append(&mut kbo::find(&seq.reverse_complement(), &sbwt, &lcs, find_opts.clone()) From 18e7a056cad6e79ecad2964f39847f5a89531305 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Tue, 5 Nov 2024 13:05:26 +0200 Subject: [PATCH 24/44] Add the last example to find. --- src/lib.rs | 42 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 36 insertions(+), 6 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 2fd61fc..b9f6c06 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -150,15 +150,45 @@ //! ```text //! kbo find --reference GCF_000714595.1_ASM71459v1_genomic.fna db.fasta //! ``` -//! which will return -//! ```text -//! TODO add output -//! ```text +//! +//!
+//! +//! which will return (click to expand) +//! +//! +//! |query|ref|q.start|q.end|strand|length|mismatches|query.contig|ref.contig| +//! |-----|---|-------|-----|------|------|----------|------------|----------| +//! |db.fasta|GCF_000714595.1_ASM71459v1_genomic.fna|1|513|+|513|1|GCF_000714595.1_ASM71459v1_genomic.fna|clbS\|locus_tag=ECOK1_RS11325\|product="colibactin self-protection protein ClbS"\|protein_id=WP_000290498.1| +//! |db.fasta|GCF_000714595.1_ASM71459v1_genomic.fna|1|723|+|723|0|GCF_000714595.1_ASM71459v1_genomic.fna|clbQ\|locus_tag=ECOK1_RS11330\|product="colibactin biosynthesis thioesterase ClbQ"\|protein_id=WP_000065646.1| +//! |db.fasta|GCF_000714595.1_ASM71459v1_genomic.fna|1|1515|+|1515|0|GCF_000714595.1_ASM71459v1_genomic.fna|clbP\|locus_tag=ECOK1_RS11335\|product="precolibactin peptidase ClbP"\|protein_id=WP_002430641.1| +//! |db.fasta|GCF_000714595.1_ASM71459v1_genomic.fna|1|2460|+|2460|0|GCF_000714595.1_ASM71459v1_genomic.fna|clbO\|locus_tag=ECOK1_RS11340\|product="colibactin polyketide synthase ClbO"\|protein_id=WP_001029878.1| +//! |db.fasta|GCF_000714595.1_ASM71459v1_genomic.fna|1|4368|+|4369|1|GCF_000714595.1_ASM71459v1_genomic.fna|clbN\|locus_tag=ECOK1_RS11345\|product="colibactin non-ribosomal peptide synthetase ClbN"\|protein_id=WP_001327259.1| +//! |db.fasta|GCF_000714595.1_ASM71459v1_genomic.fna|1|1208|+|1208|0|GCF_000714595.1_ASM71459v1_genomic.fna|clbM\|locus_tag=ECOK1_RS11350\|product="precolibactin export MATE transporter ClbM"\|protein_id=WP_000217768.1| +//! |db.fasta|GCF_000714595.1_ASM71459v1_genomic.fna|1|1440|+|1440|0|GCF_000714595.1_ASM71459v1_genomic.fna|clbM\|locus_tag=ECOK1_RS11350\|product="precolibactin export MATE transporter ClbM"\|protein_id=WP_000217768.1| +//! |db.fasta|GCF_000714595.1_ASM71459v1_genomic.fna|1|1464|+|1464|0|GCF_000714595.1_ASM71459v1_genomic.fna|clbL\|locus_tag=ECOK1_RS11355\|product="colibactin biosynthesis amidase ClbL"\|protein_id=WP_001297937.1| +//! |db.fasta|GCF_000714595.1_ASM71459v1_genomic.fna|1|6465|+|6465|2|GCF_000714595.1_ASM71459v1_genomic.fna|clbK\|locus_tag=ECOK1_RS11360\|product="colibactin hybrid non-ribosomal peptide synthetase/type I polyketide synthase ClbK"\|protein_id=WP_000222467.1| +//! |db.fasta|GCF_000714595.1_ASM71459v1_genomic.fna|1|6501|+|6501|0|GCF_000714595.1_ASM71459v1_genomic.fna|clbJ\|locus_tag=ECOK1_RS11365\|product="colibactin non-ribosomal peptide synthetase ClbJ"\|protein_id=WP_001468003.1| +//! |db.fasta|GCF_000714595.1_ASM71459v1_genomic.fna|1|3033|+|3033|0|GCF_000714595.1_ASM71459v1_genomic.fna|clbI\|locus_tag=ECOK1_RS11370\|product="colibactin polyketide synthase ClbI"\|protein_id=WP_000829570.1| +//! |db.fasta|GCF_000714595.1_ASM71459v1_genomic.fna|1|4797|+|4797|2|GCF_000714595.1_ASM71459v1_genomic.fna|clbH\|locus_tag=ECOK1_RS11375\|product="colibactin non-ribosomal peptide synthetase ClbH"\|protein_id=WP_001304254.1| +//! |db.fasta|GCF_000714595.1_ASM71459v1_genomic.fna|1|1269|+|1269|1|GCF_000714595.1_ASM71459v1_genomic.fna|clbG\|locus_tag=ECOK1_RS11380\|product="colibactin biosynthesis acyltransferase ClbG"\|protein_id=WP_000159201.1| +//! |db.fasta|GCF_000714595.1_ASM71459v1_genomic.fna|1|1131|+|1131|0|GCF_000714595.1_ASM71459v1_genomic.fna|clbF\|locus_tag=ECOK1_RS11385\|product="colibactin biosynthesis dehydrogenase ClbF"\|protein_id=WP_000337350.1| +//! |db.fasta|GCF_000714595.1_ASM71459v1_genomic.fna|1|249|+|249|0|GCF_000714595.1_ASM71459v1_genomic.fna|clbE\|locus_tag=ECOK1_RS11390\|product="colibactin biosynthesis aminomalonyl-acyl carrier protein ClbE"\|protein_id=WP_001297917.1| +//! |db.fasta|GCF_000714595.1_ASM71459v1_genomic.fna|1|870|+|870|0|GCF_000714595.1_ASM71459v1_genomic.fna|clbD\|locus_tag=ECOK1_RS11395\|product="colibactin biosynthesis dehydrogenase ClbD"\|protein_id=WP_000982270.1| +//! |db.fasta|GCF_000714595.1_ASM71459v1_genomic.fna|1|2601|+|2601|0|GCF_000714595.1_ASM71459v1_genomic.fna|clbC\|locus_tag=ECOK1_RS11400\|product="colibactin polyketide synthase ClbC"\|protein_id=WP_001297908.1| +//! |db.fasta|GCF_000714595.1_ASM71459v1_genomic.fna|1|9621|+|9622|1|GCF_000714595.1_ASM71459v1_genomic.fna|clbB\|locus_tag=ECOK1_RS11405\|product="colibactin hybrid non-ribosomal peptide synthetase/type I polyketide synthase ClbB"\|protein_id=WP_001518711.1| +//! |db.fasta|GCF_000714595.1_ASM71459v1_genomic.fna|1|213|+|213|0|GCF_000714595.1_ASM71459v1_genomic.fna|clbR\|locus_tag=ECOK1_RS11410\|product="colibactin biosynthesis LuxR family transcriptional regulator ClbR"\|protein_id=WP_000357141.1| +//! |db.fasta|GCF_000714595.1_ASM71459v1_genomic.fna|1|735|+|735|0|GCF_000714595.1_ASM71459v1_genomic.fna|clbA\|locus_tag=ECOK1_RS11415\|product="colibactin biosynthesis phosphopantetheinyl transferase ClbA"\|protein_id=WP_001217110.1| +//! |db.fasta|GCF_000714595.1_ASM71459v1_genomic.fna|1|519|+|519|0|GCF_000714595.1_ASM71459v1_genomic.fna|clbS-like_4ce09a| +//! |db.fasta|GCF_000714595.1_ASM71459v1_genomic.fna|1|519|+|519|0|GCF_000714595.1_ASM71459v1_genomic.fna|clbS-like_4ce09a| +//! |db.fasta|GCF_000714595.1_ASM71459v1_genomic.fna|216|1464|+|1249|0|GCF_000714595.1_ASM71459v1_genomic.fna|clbL\|locus_tag=ECOK1_RS11355\|product="colibactin biosynthesis amidase ClbL"\|protein_id=WP_001297937.1| +//! |db.fasta|GCF_000714595.1_ASM71459v1_genomic.fna|1156|4167|+|3013|1|GCF_000714595.1_ASM71459v1_genomic.fna|clbN\|locus_tag=ECOK1_RS11345\|product="colibactin non-ribosomal peptide synthetase ClbN"\|protein_id=WP_001327259.1| +//! +//!
+//! //! ## kbo map +//! //! TODO write -//! ``` //! -//! ``` //! #![warn(missing_docs, From a22c30fb67b17f3d21561a1cdcb265a61b77c942 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Tue, 5 Nov 2024 13:23:13 +0200 Subject: [PATCH 25/44] Add example to kbo map. --- src/lib.rs | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index b9f6c06..a4ba159 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -187,8 +187,33 @@ //! //! ## kbo map //! -//! TODO write +//! kbo map can be used to align a query sequence against a reference sequence. +//! This is useful in for example generating a reference-based alignment of +//! multiple related genomes against a good reference assembly. //! +//! To run this example, download the genome sequence of the [_E. coli_ +//! UTI89](https://www.ncbi.nlm.nih.gov/datasets/genome/GCF_000013265.1/) strain +//! from the NCBI (ASM1326v1). +//! +//! ### Reference-based alignment +//! Run +//! ```text +//! kbo map --reference GCF_000714595.1_ASM71459v1_genomic.fna GCF_000013265.1_ASM1326v1_genomic.fna > result.aln +//! ``` +//! +//! which will write the alignment sequence to `result.aln`. Note that kbo map +//! always writes to stdout. +//! +//! If you have multiple sequences you need to align, either supply them as +//! arguments to `kbo map` or process them using gnu parallel: +//! +//! ```text +//! parallel -j 'kbo map --reference GCF_000714595.1_ASM71459v1_genomic.fna {}' < query_paths.txt > result.aln +//! ``` +//! +//! kbo map also accepts the `--threads` argument to parallelise either the +//! index construction (in the case of a single query), or run in parallel over +//! the input files (multiple queries). //! #![warn(missing_docs, From 10bf5652ad225eec45f0994d0c207d967efe77f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Tue, 5 Nov 2024 15:28:06 +0200 Subject: [PATCH 26/44] Add identity and coverage computation to find. --- src/cli/main.rs | 54 +++++++++++++++++++++++++++++++++++-------------- 1 file changed, 39 insertions(+), 15 deletions(-) diff --git a/src/cli/main.rs b/src/cli/main.rs index 130bafb..16197cd 100644 --- a/src/cli/main.rs +++ b/src/cli/main.rs @@ -148,24 +148,31 @@ fn main() { find_opts.max_gap_len = *max_gap_len as usize; find_opts.max_gaps = *max_gaps as usize; - let mut indexes: Vec<((sbwt::SbwtIndexVariant, sbwt::LcsArray), String)> = Vec::new(); + let mut indexes: Vec<((sbwt::SbwtIndexVariant, sbwt::LcsArray), String, usize)> = Vec::new(); if index_prefix.is_some() && !ref_file.is_some() { info!("Loading SBWT index..."); - indexes.push((kbo::index::load_sbwt(index_prefix.as_ref().unwrap()), index_prefix.clone().unwrap())); + let (sbwt, lcs) = kbo::index::load_sbwt(index_prefix.as_ref().unwrap()); + let n_kmers = match sbwt { + sbwt::SbwtIndexVariant::SubsetMatrix(ref index) => { + index.n_kmers() + }, + }; + indexes.push(((sbwt, lcs), index_prefix.clone().unwrap(), n_kmers + *kmer_size - 1)); } else if !index_prefix.is_some() && ref_file.is_some() { info!("Building SBWT from file {}...", ref_file.as_ref().unwrap()); if !*detailed { let ref_data = read_fastx_file(ref_file.as_ref().unwrap()); - indexes.push((kbo::index::build_sbwt_from_vecs(&ref_data, &Some(sbwt_build_options)), ref_file.clone().unwrap())); + let n_bases = ref_data.iter().map(|x| x.len()).reduce(|a, b| a + b).unwrap(); + indexes.push((kbo::index::build_sbwt_from_vecs(&ref_data, &Some(sbwt_build_options)), ref_file.clone().unwrap(), n_bases)); } else { let mut reader = needletail::parse_fastx_file(ref_file.as_ref().unwrap()).expect("valid path/file"); while let Some(seqrec) = read_from_fastx_parser(&mut *reader) { let contig = seqrec.id(); let contig_name = std::str::from_utf8(contig).expect("UTF-8"); let seq = seqrec.normalize(true); - indexes.push((kbo::index::build_sbwt_from_vecs(&[seq.to_vec()], &Some(sbwt_build_options.clone())), contig_name.to_string())); + indexes.push((kbo::index::build_sbwt_from_vecs(&[seq.to_vec()], &Some(sbwt_build_options.clone())), contig_name.to_string(), seq.len())); } } @@ -180,38 +187,55 @@ fn main() { .unwrap(); info!("Querying SBWT index..."); - println!("query\tref\tq.start\tq.end\tstrand\tlength\tmismatches\tquery.contig\tref.contig"); + println!("query\tref\tq.start\tq.end\tstrand\tlength\tmismatches\tidentity\tcoverage\tquery.contig\tref.contig"); let stdout = std::io::stdout(); query_files.iter().for_each(|file| { - let mut run_lengths: Vec<(usize, usize, char, usize, usize, String, String)> = indexes.par_iter().map(|((sbwt, lcs), ref_contig)| { + let mut run_lengths: Vec<(kbo::format::RLE, char, String, String, usize)> = indexes.par_iter().map(|((sbwt, lcs), ref_contig, ref_bases)| { let mut reader = needletail::parse_fastx_file(file).expect("valid path/file"); - let mut res: Vec<(usize, usize, char, usize, usize, String, String)> = Vec::new(); + let mut res: Vec<(kbo::format::RLE, char, String, String, usize)> = Vec::new(); while let Some(seqrec) = read_from_fastx_parser(&mut *reader) { let query_contig = std::str::from_utf8(seqrec.id()).expect("UTF-8"); let seq = seqrec.normalize(true); // Get local alignments for forward strand res.append(&mut kbo::find(&seq, &sbwt, &lcs, find_opts.clone()) .iter() - .map(|x| (x.start, x.end, '+', x.matches + x.mismatches + x.jumps + x.gap_bases, x.mismatches, - ref_contig.clone(), query_contig.to_string().clone())).collect()); + .map(|x| (*x, '+', + ref_contig.clone(), query_contig.to_string().clone(), + ref_bases.clone() + )).collect()); // Add local alignments for reverse _complement res.append(&mut kbo::find(&seq.reverse_complement(), &sbwt, &lcs, find_opts.clone()) .iter() - .map(|x| (x.start, x.end, '+', x.matches + x.mismatches + x.jumps + x.gap_bases, x.mismatches, - ref_contig.clone(), query_contig.to_string().clone())).collect()); + .map(|x| (*x, '-', + ref_contig.clone(), query_contig.to_string().clone(), + ref_bases.clone() + )).collect()); } res }).flatten().collect(); // Sort by q.start - run_lengths.sort_by_key(|x| x.0); + run_lengths.sort_by_key(|(aln, _, _, _, _)| aln.start); // Print results with query and ref name added - run_lengths.iter().filter(|x| x.1 - x.0 + 1 >= *min_len as usize).for_each(|x| { + run_lengths.iter().filter(|x| x.0.end - x.0.start + 1 >= *min_len as usize) + .for_each(|(aln, strand, ref_contig, query_contig, ref_bases)| { + let aln_len = aln.end - aln.start + 1; + let coverage = (aln_len as f64)/(*ref_bases as f64) * 100_f64; + let identity = (aln.matches as f64)/(aln_len as f64) * 100_f64; let _ = writeln!(&mut stdout.lock(), - "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}", - file, ref_file.clone().unwrap(), x.0, x.1, x.2, x.3, x.4, x.5, x.6); + "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{:.2}\t{:.2}\t{}\t{}", + file, ref_file.clone().unwrap(), + aln.start, + aln.end, + strand, + aln.end - aln.start + 1, + aln.mismatches, + identity, + coverage, + query_contig, + ref_contig); }); }); }, From 67fc71b2a3187ff4db7efe0914abb7e4b9e0b2b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Fri, 15 Nov 2024 17:50:00 +0200 Subject: [PATCH 27/44] Update sbwt to 0.3.4. --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 43150bd..4f1ffaf 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,7 +22,7 @@ required-features = ["cli"] [dependencies] ## core -sbwt = "0.3.1" +sbwt = "0.3.4" ## cli clap = { version = "4", features = ["derive"], optional = true} From 0342b0a0788dd50862bfd86fe629d395fe887061 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Fri, 15 Nov 2024 17:51:17 +0200 Subject: [PATCH 28/44] Print number of gap opens. --- src/cli/main.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/cli/main.rs b/src/cli/main.rs index 16197cd..aaefb10 100644 --- a/src/cli/main.rs +++ b/src/cli/main.rs @@ -187,7 +187,7 @@ fn main() { .unwrap(); info!("Querying SBWT index..."); - println!("query\tref\tq.start\tq.end\tstrand\tlength\tmismatches\tidentity\tcoverage\tquery.contig\tref.contig"); + println!("query\tref\tq.start\tq.end\tstrand\tlength\tmismatches\tgap_opens\tidentity\tcoverage\tquery.contig\tref.contig"); let stdout = std::io::stdout(); query_files.iter().for_each(|file| { let mut run_lengths: Vec<(kbo::format::RLE, char, String, String, usize)> = indexes.par_iter().map(|((sbwt, lcs), ref_contig, ref_bases)| { @@ -225,13 +225,14 @@ fn main() { let coverage = (aln_len as f64)/(*ref_bases as f64) * 100_f64; let identity = (aln.matches as f64)/(aln_len as f64) * 100_f64; let _ = writeln!(&mut stdout.lock(), - "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{:.2}\t{:.2}\t{}\t{}", + "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{:.2}\t{:.2}\t{}\t{}", file, ref_file.clone().unwrap(), aln.start, aln.end, strand, aln.end - aln.start + 1, aln.mismatches, + aln.gap_opens, identity, coverage, query_contig, From 9760520d1338807b1358245313f8f9a58df2cce3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Fri, 15 Nov 2024 18:06:54 +0200 Subject: [PATCH 29/44] Use local thread pools for rayon. --- src/cli/main.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cli/main.rs b/src/cli/main.rs index aaefb10..e97fb69 100644 --- a/src/cli/main.rs +++ b/src/cli/main.rs @@ -183,7 +183,7 @@ fn main() { rayon::ThreadPoolBuilder::new() .num_threads(*num_threads) .thread_name(|i| format!("rayon-thread-{}", i)) - .build_global() + .build() .unwrap(); info!("Querying SBWT index..."); @@ -272,7 +272,7 @@ fn main() { rayon::ThreadPoolBuilder::new() .num_threads(*num_threads) .thread_name(|i| format!("rayon-thread-{}", i)) - .build_global() + .build() .unwrap(); let ref_data = read_fastx_file(ref_file); From f0dc445a0e7bb5fe07880661c7f9f653411c5eaf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Fri, 15 Nov 2024 18:07:11 +0200 Subject: [PATCH 30/44] Enable bitpacked k-mer sorting in memory. --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 4f1ffaf..55bbe55 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,7 +22,7 @@ required-features = ["cli"] [dependencies] ## core -sbwt = "0.3.4" +sbwt = { version = "0.3.4", features = ["bpks-mem"] } ## cli clap = { version = "4", features = ["derive"], optional = true} From 8b277d1e16743ad445529349d700ae8b915aa1c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Fri, 15 Nov 2024 18:07:28 +0200 Subject: [PATCH 31/44] Build in memory if temp_dir is not given. --- src/index.rs | 47 +++++++++++++++++++++++++++++++++-------------- 1 file changed, 33 insertions(+), 14 deletions(-) diff --git a/src/index.rs b/src/index.rs index 96decd5..b0dd6ff 100644 --- a/src/index.rs +++ b/src/index.rs @@ -18,6 +18,7 @@ use std::ops::Range; use std::path::PathBuf; use sbwt::BitPackedKmerSorting; +use sbwt::BitPackedKmerSortingMem; use sbwt::SbwtIndexBuilder; use sbwt::SbwtIndexVariant; @@ -116,22 +117,40 @@ pub fn build_sbwt_from_vecs( assert!(!slices.is_empty()); let build_opts = if build_options.is_some() { build_options.clone().unwrap() } else { BuildOpts::default() }; - let temp_dir = if build_opts.temp_dir.is_some() { build_opts.temp_dir.unwrap() } else { std::env::temp_dir().to_str().unwrap().to_string() }; - let algorithm = BitPackedKmerSorting::new() - .mem_gb(build_opts.mem_gb) - .dedup_batches(build_opts.dedup_batches) - .temp_dir(PathBuf::from(OsString::from(temp_dir)).as_path()); + // Use temporary disk space if temp_dir is given, + // otherwise build fully in memory. + let (sbwt, lcs) = if build_opts.temp_dir.is_some() { + let temp_dir = build_opts.temp_dir.unwrap(); + let algorithm = BitPackedKmerSorting::new() + .mem_gb(build_opts.mem_gb) + .dedup_batches(build_opts.dedup_batches) + .temp_dir(PathBuf::from(OsString::from(temp_dir)).as_path()); + + SbwtIndexBuilder::new() + .k(build_opts.k) + .n_threads(build_opts.num_threads) + .add_rev_comp(build_opts.add_revcomp) + .algorithm(algorithm) + .build_lcs(true) + .build_select_support(build_opts.build_select) + .precalc_length(build_opts.prefix_precalc) + .run_from_vecs(slices) + } else { + let algorithm = BitPackedKmerSortingMem::new() + .dedup_batches(build_opts.dedup_batches); + + SbwtIndexBuilder::new() + .k(build_opts.k) + .n_threads(build_opts.num_threads) + .add_rev_comp(build_opts.add_revcomp) + .algorithm(algorithm) + .build_lcs(true) + .build_select_support(build_opts.build_select) + .precalc_length(build_opts.prefix_precalc) + .run_from_vecs(slices) + }; - let (sbwt, lcs) = SbwtIndexBuilder::new() - .k(build_opts.k) - .n_threads(build_opts.num_threads) - .add_rev_comp(build_opts.add_revcomp) - .algorithm(algorithm) - .build_lcs(true) - .build_select_support(build_opts.build_select) - .precalc_length(build_opts.prefix_precalc) - .run_from_vecs(slices); (SbwtIndexVariant::SubsetMatrix(sbwt), lcs.unwrap()) } From a442643c409c7527dd7c6b4dcb863031b9c7904f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Mon, 18 Nov 2024 12:33:37 +0200 Subject: [PATCH 32/44] Remove --max-gaps option. --- src/cli/cli.rs | 2 -- src/cli/main.rs | 2 -- 2 files changed, 4 deletions(-) diff --git a/src/cli/cli.rs b/src/cli/cli.rs index 957edf3..718612d 100644 --- a/src/cli/cli.rs +++ b/src/cli/cli.rs @@ -83,8 +83,6 @@ pub enum Commands { // // Minimum length to report an alignment #[arg(long = "min-len", default_value_t = 100, help_heading = "Algorithm")] min_len: u64, - #[arg(long = "max-gaps", default_value_t = 0, help_heading = "Algorithm")] - max_gaps: u64, #[arg(long = "max-gap-len", default_value_t = 0, help_heading = "Algorithm")] max_gap_len: u64, diff --git a/src/cli/main.rs b/src/cli/main.rs index e97fb69..9f9c54c 100644 --- a/src/cli/main.rs +++ b/src/cli/main.rs @@ -124,7 +124,6 @@ fn main() { detailed, min_len, max_gap_len, - max_gaps, max_error_prob, num_threads, kmer_size, @@ -146,7 +145,6 @@ fn main() { let mut find_opts = kbo::FindOpts::default(); find_opts.max_error_prob = *max_error_prob; find_opts.max_gap_len = *max_gap_len as usize; - find_opts.max_gaps = *max_gaps as usize; let mut indexes: Vec<((sbwt::SbwtIndexVariant, sbwt::LcsArray), String, usize)> = Vec::new(); From 52faeab3add3fa178b3b02e4dff6837a76652b22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Mon, 18 Nov 2024 12:33:50 +0200 Subject: [PATCH 33/44] Control permitted gap opens via gap length only. --- src/format.rs | 28 ++++++++++++++-------------- src/lib.rs | 11 +++-------- 2 files changed, 17 insertions(+), 22 deletions(-) diff --git a/src/format.rs b/src/format.rs index 996f4cd..4c3dd37 100644 --- a/src/format.rs +++ b/src/format.rs @@ -172,7 +172,6 @@ pub fn run_lengths( /// pub fn run_lengths_gapped( aln: &[char], - max_gaps: usize, max_gap_len: usize, ) -> Vec { let mut encodings: Vec = Vec::new(); @@ -189,7 +188,7 @@ pub fn run_lengths_gapped( let mut gap_start = false; let start = i; let mut matches: usize = 0; - while i < aln.len() && (gap_opens <= max_gaps && aln[i] != ' ') { + while i < aln.len() && (aln[i] != ' ') { if aln[i] == '-' && !gap_start { gap_start = true; gap_opens += 1; @@ -207,18 +206,19 @@ pub fn run_lengths_gapped( jumps += (aln[i] == 'R') as usize; i += 1; } - if current_gap_bases <= max_gap_len { - let rle: RLE = RLE{ - start: start + 1, - end: i, - matches, - mismatches: i - start - matches - total_gap_bases, - jumps: jumps / 2, - gap_bases: total_gap_bases, - gap_opens, - }; - encodings.push(rle); - } + + // TODO If the match starts or ends with a gap, the gap should be removed. + + let rle: RLE = RLE{ + start: start + 1, + end: i, + matches, + mismatches: i - start - matches, + jumps: jumps / 2, + gap_bases: total_gap_bases, + gap_opens, + }; + encodings.push(rle); match_start = false; } else { i += 1; diff --git a/src/lib.rs b/src/lib.rs index a4ba159..5ececb5 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -237,9 +237,6 @@ pub struct FindOpts { /// Prefix match lengths with probability higher than `max_error_prob` to /// happen at random are considered noise. pub max_error_prob: f64, - /// Maximum number of gap segments (gap opens) allowed before splitting an - /// alignment. - pub max_gaps: usize, /// Maximum length of a single gap segment before splitting an alignment. pub max_gap_len: usize, } @@ -249,7 +246,6 @@ impl Default for FindOpts { /// ```rust /// let mut opts = kbo::FindOpts::default(); /// opts.max_error_prob = 0.0000001; - /// opts.max_gaps = 0; /// opts.max_gap_len = 0; /// # let expected = kbo::FindOpts::default(); /// # assert_eq!(opts, expected); @@ -258,7 +254,6 @@ impl Default for FindOpts { fn default() -> FindOpts { FindOpts { max_error_prob: 0.0000001, - max_gaps: 0, max_gap_len: 0, } } @@ -503,10 +498,10 @@ pub fn find( find_opts: FindOpts, ) -> Vec { let mut match_opts = MatchOpts::default(); - match_opts.max_error_prob = find_opts.max_error_prob.clone(); + match_opts.max_error_prob = find_opts.max_error_prob; let aln = matches(query_seq, sbwt, lcs, match_opts); - if find_opts.max_gaps > 0 || find_opts.max_gap_len > 0 { - format::run_lengths_gapped(&aln, find_opts.max_gaps, find_opts.max_gap_len) + if find_opts.max_gap_len > 0 { + format::run_lengths_gapped(&aln, find_opts.max_gap_len) } else { format::run_lengths(&aln) } From 2a4f6a18ead7a64185d9980dd78d65158f9d9fdc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Mon, 18 Nov 2024 13:52:11 +0200 Subject: [PATCH 34/44] Discard gaps at the end of a match. --- src/format.rs | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/src/format.rs b/src/format.rs index 4c3dd37..2919b1b 100644 --- a/src/format.rs +++ b/src/format.rs @@ -209,15 +209,29 @@ pub fn run_lengths_gapped( // TODO If the match starts or ends with a gap, the gap should be removed. - let rle: RLE = RLE{ - start: start + 1, - end: i, - matches, - mismatches: i - start - matches, - jumps: jumps / 2, - gap_bases: total_gap_bases, - gap_opens, - }; + let rle: RLE = + if aln[i] == '-' { + // Don't count gaps at the end of a a match + RLE{ + start: start + 1, + end: i - current_gap_bases, + matches, + mismatches: i - start - matches - current_gap_bases + 1, + jumps: jumps / 2, + gap_bases: total_gap_bases - current_gap_bases, + gap_opens: gap_opens - 1, + } + } else { + RLE{ + start: start + 1, + end: i, + matches, + mismatches: i - start - matches, + jumps: jumps / 2, + gap_bases: total_gap_bases, + gap_opens, + } + }; encodings.push(rle); match_start = false; } else { From 07f5dfc74bf3f1f731c0288ca0ef29e4283700a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Mon, 18 Nov 2024 13:57:37 +0200 Subject: [PATCH 35/44] Use bracket list initialisation. --- src/lib.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 5ececb5..f1b901c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -497,8 +497,7 @@ pub fn find( lcs: &sbwt::LcsArray, find_opts: FindOpts, ) -> Vec { - let mut match_opts = MatchOpts::default(); - match_opts.max_error_prob = find_opts.max_error_prob; + let match_opts = MatchOpts { max_error_prob: find_opts.max_error_prob }; let aln = matches(query_seq, sbwt, lcs, match_opts); if find_opts.max_gap_len > 0 { format::run_lengths_gapped(&aln, find_opts.max_gap_len) From b2ac4a3d7ef780b19b0d8ec51e7f8365e6791a02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Mon, 18 Nov 2024 13:58:35 +0200 Subject: [PATCH 36/44] Remove unnecessary references. --- src/cli/main.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cli/main.rs b/src/cli/main.rs index 9f9c54c..c1d9e95 100644 --- a/src/cli/main.rs +++ b/src/cli/main.rs @@ -195,7 +195,7 @@ fn main() { let query_contig = std::str::from_utf8(seqrec.id()).expect("UTF-8"); let seq = seqrec.normalize(true); // Get local alignments for forward strand - res.append(&mut kbo::find(&seq, &sbwt, &lcs, find_opts.clone()) + res.append(&mut kbo::find(&seq, sbwt, lcs, find_opts.clone()) .iter() .map(|x| (*x, '+', ref_contig.clone(), query_contig.to_string().clone(), @@ -203,7 +203,7 @@ fn main() { )).collect()); // Add local alignments for reverse _complement - res.append(&mut kbo::find(&seq.reverse_complement(), &sbwt, &lcs, find_opts.clone()) + res.append(&mut kbo::find(&seq.reverse_complement(), sbwt, lcs, find_opts.clone()) .iter() .map(|x| (*x, '-', ref_contig.clone(), query_contig.to_string().clone(), From 23809c920871d2ce17bd6de566c86052c75b23d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Mon, 18 Nov 2024 13:59:39 +0200 Subject: [PATCH 37/44] Remove unnecessary calls to clone(). --- src/cli/main.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/cli/main.rs b/src/cli/main.rs index c1d9e95..9c3c51d 100644 --- a/src/cli/main.rs +++ b/src/cli/main.rs @@ -195,19 +195,19 @@ fn main() { let query_contig = std::str::from_utf8(seqrec.id()).expect("UTF-8"); let seq = seqrec.normalize(true); // Get local alignments for forward strand - res.append(&mut kbo::find(&seq, sbwt, lcs, find_opts.clone()) + res.append(&mut kbo::find(&seq, sbwt, lcs, find_opts) .iter() .map(|x| (*x, '+', ref_contig.clone(), query_contig.to_string().clone(), - ref_bases.clone() + *ref_bases )).collect()); // Add local alignments for reverse _complement - res.append(&mut kbo::find(&seq.reverse_complement(), sbwt, lcs, find_opts.clone()) + res.append(&mut kbo::find(&seq.reverse_complement(), sbwt, lcs, find_opts) .iter() .map(|x| (*x, '-', ref_contig.clone(), query_contig.to_string().clone(), - ref_bases.clone() + *ref_bases )).collect()); } res @@ -282,7 +282,7 @@ fn main() { let mut res: Vec = Vec::new(); ref_data.iter().for_each(|ref_contig| { - res.append(&mut kbo::map(ref_contig, &sbwt, &lcs, map_opts.clone())); + res.append(&mut kbo::map(ref_contig, &sbwt, &lcs, map_opts)); }); let _ = writeln!(&mut stdout.lock(), From faaace48c118ef75f34dea3bc3c987caa3db94f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Mon, 18 Nov 2024 14:03:28 +0200 Subject: [PATCH 38/44] Update run_lengths_gapped test and fix indexing. --- src/format.rs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/format.rs b/src/format.rs index 2919b1b..b5f5627 100644 --- a/src/format.rs +++ b/src/format.rs @@ -141,6 +141,8 @@ pub fn run_lengths( /// segments (consecutive '-'s) within an alignment block. The gapped segments /// can be at most `max_gap_len` bases long before the alignment is broken. /// +/// Gaps '-' are counted as mismatches. +/// /// This function can be used for both plain and refined translations. /// /// Returns a vector of [Run Length Encodings (RLE)](RLE) structs. @@ -154,7 +156,7 @@ pub fn run_lengths( /// use kbo::format::RLE; /// /// // Parameters : k = 3, threshold = 2 -/// // Parameters : max_gaps = 3, max_gap_len = 3 +/// // Parameters : max_gap_len = 3 /// // /// // Ref sequence : A,A,A,G,A,A,C,C,A,-,T,C,A, -,-,G,G,G, C,G /// // Query sequence : C,A,A,G,-,-,C,C,A,C,T,C,A, T,T,G,G,G, T,C @@ -165,8 +167,8 @@ pub fn run_lengths( /// // (14, 16, 3, 0)] /// /// let input: Vec = vec!['X','M','M','R','R','M','M','X','M','M','M','-','-','M','M','M','-','-']; -/// let run_lengths = run_lengths_gapped(&input, 3, 3); -/// # let expected = vec![RLE{start: 1, end: 18, matches: 12, mismatches: 2, jumps: 1, gap_bases: 4, gap_opens: 2}]; +/// let run_lengths = run_lengths_gapped(&input, 3); +/// # let expected = vec![RLE{start: 1, end: 16, matches: 12, mismatches: 5, jumps: 1, gap_bases: 2, gap_opens: 1}]; /// # assert_eq!(run_lengths, expected); /// ``` /// @@ -210,7 +212,7 @@ pub fn run_lengths_gapped( // TODO If the match starts or ends with a gap, the gap should be removed. let rle: RLE = - if aln[i] == '-' { + if aln[std::cmp::min(i, aln.len() - 1)] == '-' { // Don't count gaps at the end of a a match RLE{ start: start + 1, From 90e1f9667f91f4c046c32d784bfb4363cca545c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Mon, 18 Nov 2024 14:09:30 +0200 Subject: [PATCH 39/44] Remove `tests/`. --- tests/data/NZ_CP058217.1_clbS.fna.gz | Bin 400 -> 0 bytes tests/data/clbS.fna.gz | Bin 283 -> 0 bytes tests/map_clbs.rs | 42 --------------------------- 3 files changed, 42 deletions(-) delete mode 100644 tests/data/NZ_CP058217.1_clbS.fna.gz delete mode 100644 tests/data/clbS.fna.gz delete mode 100644 tests/map_clbs.rs diff --git a/tests/data/NZ_CP058217.1_clbS.fna.gz b/tests/data/NZ_CP058217.1_clbS.fna.gz deleted file mode 100644 index 91d73020e7cba8765982d2000f8c1d4c9d07a9b7..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 400 zcmV;B0dM{viwFqC0`F!515R3BLr^d^I5IIeE-_zYY+_R`W^Q2svh$0IcMdQxwJ<~jK@KIMSUFO8YHW-N z$egn1BC0LlNu!@l0!NiFGz~2>VM8V*bYo2*3^fG8GJ{176bB)ZE^DI~K#;Sb02c*e z(6fpH0>OpQqL|wTYi^aWTZ>Xq$DE+bVp2H62`#O*Xi4c(MO>a)jHM&lKy(1gvnYDD zL_w*56cS23XN6-3)M7JPBuW%Rj0sjzI;I6EKpgI6&2*KNFsWfMNnFJ=U@MIvAZ44J uHY2G)1+lp`w<4ZgO%*L5U^xV`l{sAJBPvX~UkJJ8bVZPAj_rHY)&Vk{lW2BHH{02W16TjKLr z1*DKrsPEHOojR#w41OX}AM4OS+ hpn}-kx)TAAtEr-|fPm!?$W~_v-xn3`B8+kZ006^ecntsm diff --git a/tests/map_clbs.rs b/tests/map_clbs.rs deleted file mode 100644 index 0e702d2..0000000 --- a/tests/map_clbs.rs +++ /dev/null @@ -1,42 +0,0 @@ -// kbo: Spectral Burrows-Wheeler transform accelerated local alignment search -// -// Copyright 2024 Tommi Mäklin [tommi@maklin.fi]. - -// Copyrights in this project are retained by contributors. No copyright assignment -// is required to contribute to this project. - -// Except as otherwise noted (below and/or in individual files), this -// project is licensed under the Apache License, Version 2.0 -// or or -// the MIT license, or , -// at your option. -// -// #[test] -// fn map_nissle_against_clbs() { -// use needletail::Sequence; - -// let mut seq_data: Vec> = Vec::new(); -// let mut reader = needletail::parse_fastx_file("tests/data/clbS.fna.gz".to_string()).unwrap_or_else(|_| panic!("Expected valid fastX file at tests/data/clbS.fna.gz")); -// loop { -// let rec = reader.next(); -// match rec { -// Some(Ok(seqrec)) => { -// seq_data.push(seqrec.normalize(true).as_ref().to_vec()); -// }, -// _ => break -// } -// } - -// let (sbwt, lcs) = kbo::index::build_sbwt_from_vecs(&seq_data, &None); - -// let expected = vec![(455, 967, 512, 1)]; - -// let mut reader = needletail::parse_fastx_file("tests/data/NZ_CP058217.1_clbS.fna.gz".to_string()).expect("valid path/file"); -// let Some(rec) = reader.next() else { panic!("Couldn't read from tests/data/NZ_CP058217.1_clbS.fna.gz") }; -// let seqrec = rec.expect("Valid fastX record"); -// let seq = seqrec.normalize(true); - -// let got = kbo::find(&seq, &sbwt, &lcs); - -// assert_eq!(got, expected); -// } From e6b888f2831f1f7920e2ad499b3bdf9a32e78725 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Mon, 18 Nov 2024 14:09:49 +0200 Subject: [PATCH 40/44] Remove CLI from source (move to separate repo). --- Cargo.toml | 23 ---- src/cli/cli.rs | 162 -------------------------- src/cli/main.rs | 294 ------------------------------------------------ src/lib.rs | 2 +- 4 files changed, 1 insertion(+), 480 deletions(-) delete mode 100644 src/cli/cli.rs delete mode 100644 src/cli/main.rs diff --git a/Cargo.toml b/Cargo.toml index 55bbe55..dcd0c56 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,33 +10,10 @@ homepage = "https://github.com/tmaklin/kbo" repository = "https://github.com/tmaklin/kbo" license = "MIT OR Apache-2.0" -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - -[features] -cli = ["dep:needletail", "dep:clap", "dep:rayon", "dep:log", "dep:stderrlog"] - -[[bin]] -name = "kbo" -path = "src/cli/main.rs" -required-features = ["cli"] - [dependencies] ## core sbwt = { version = "0.3.4", features = ["bpks-mem"] } -## cli -clap = { version = "4", features = ["derive"], optional = true} - -### io -needletail = { version = "0.6.0", default-features = false, features = ["flate2"], optional = true } - -### logging -log = { version = "0.4.20", optional = true } -stderrlog = { version = "0.6.0", optional = true } - -### threading -rayon = { version = "1", optional = true } - ## docs embed-doc-image="0.1.4" diff --git a/src/cli/cli.rs b/src/cli/cli.rs deleted file mode 100644 index 718612d..0000000 --- a/src/cli/cli.rs +++ /dev/null @@ -1,162 +0,0 @@ -// kbo: Spectral Burrows-Wheeler transform accelerated local alignment search -// -// Copyright 2024 Tommi Mäklin [tommi@maklin.fi]. - -// Copyrights in this project are retained by contributors. No copyright assignment -// is required to contribute to this project. - -// Except as otherwise noted (below and/or in individual files), this -// project is licensed under the Apache License, Version 2.0 -// or or -// the MIT license, or , -// at your option. -// -use clap::{Parser, Subcommand}; - -#[derive(Parser)] -#[command(version)] -#[command(propagate_version = true)] -#[cfg(feature = "cli")] -pub struct Cli { - #[command(subcommand)] - pub command: Option, -} - -#[derive(Subcommand)] -#[cfg(feature = "cli")] -pub enum Commands { - // Build SBWT index - Build { - // Input fasta or fastq sequence file(s) - #[arg(group = "input", required = true)] - seq_files: Vec, - - // Outputs - #[arg(short = 'o', long = "output-prefix", required = false, help_heading = "Output")] - output_prefix: Option, - - // Build parameters - // // k-mer size - #[arg(short = 'k', default_value_t = 31, help_heading = "Build options")] - kmer_size: usize, - // // prefix precalc - #[arg(short = 'p', long = "prefix-precalc", default_value_t = 8, help_heading = "Build options")] - prefix_precalc: usize, - // // deduplicate k-mer batches - #[arg(short = 'd', long = "dedup-batches", default_value_t = false, help_heading = "Build options")] - dedup_batches: bool, - - // Resources - // // Threads - #[arg(short = 't', long = "threads", default_value_t = 1)] - num_threads: usize, - // // Memory in GB - #[arg(short = 'm', long = "mem-gb", default_value_t = 4, help_heading = "Build options")] - mem_gb: usize, - // // Temporary directory - #[arg(long = "temp-dir", required = false, help_heading = "Build options")] - temp_dir: Option, - - // Verbosity - #[arg(long = "verbose", default_value_t = false)] - verbose: bool, - }, - - // Find indexed k-mers in a query - Find { - // Input fasta or fastq query file(s) - #[arg(group = "input", required = true)] - query_files: Vec, - - // Reference - // // Sequence file - #[arg(short = 'r', long = "reference", group = "reference", help_heading = "Input")] - ref_file: Option, - // // ... or a prebuilt index - #[arg(short = 'i', long = "index", group = "reference", help_heading = "Input")] - index_prefix: Option, - // // Concatenate contigs in reference - #[arg(long = "detailed", help_heading = "Input", default_value_t = false)] - detailed: bool, - - // Parameters - // // Minimum length to report an alignment - #[arg(long = "min-len", default_value_t = 100, help_heading = "Algorithm")] - min_len: u64, - #[arg(long = "max-gap-len", default_value_t = 0, help_heading = "Algorithm")] - max_gap_len: u64, - - // // Upper bound for random match probability - #[arg(long = "max-error-prob", default_value_t = 0.0000001, help_heading = "Algorithm")] - max_error_prob: f64, - - // Resources - // // Threads - #[arg(short = 't', long = "threads", default_value_t = 1)] - num_threads: usize, - - // Build parameters - // // k-mer size - #[arg(short = 'k', default_value_t = 31, help_heading = "Build options")] - kmer_size: usize, - // // prefix precalc - #[arg(short = 'p', long = "prefix-precalc", default_value_t = 8, help_heading = "Build options")] - prefix_precalc: usize, - // // deduplicate k-mer batches - #[arg(short = 'd', long = "dedup-batches", default_value_t = false, help_heading = "Build options")] - dedup_batches: bool, - // // Memory in GB - #[arg(short = 'm', long = "mem-gb", default_value_t = 4, help_heading = "Build options")] - mem_gb: usize, - // // Temporary directory - #[arg(long = "temp-dir", required = false, help_heading = "Build options")] - temp_dir: Option, - - - // Verbosity - #[arg(long = "verbose", default_value_t = false)] - verbose: bool, - }, - - // Map a query or queries to a reference and return the alignment - Map { - // Input fasta or fastq query file(s) - #[arg(group = "input", required = true)] - query_files: Vec, - - // Reference fasta - #[arg(short = 'r', long = "reference", required = true, help_heading = "Input")] - ref_file: String, - - // Parameters - // // Upper bound for random match probability - #[arg(long = "max-error-prob", default_value_t = 0.0000001, help_heading = "Algorithm")] - max_error_prob: f64, - - // Resources - // // Threads - #[arg(short = 't', long = "threads", default_value_t = 1)] - num_threads: usize, - - // Build parameters - // // k-mer size - #[arg(short = 'k', default_value_t = 31, help_heading = "Build options")] - kmer_size: usize, - // // prefix precalc - #[arg(short = 'p', long = "prefix-precalc", default_value_t = 8, help_heading = "Build options")] - prefix_precalc: usize, - // // deduplicate k-mer batches - #[arg(short = 'd', long = "dedup-batches", default_value_t = false, help_heading = "Build options")] - dedup_batches: bool, - // // Memory in GB - #[arg(short = 'm', long = "mem-gb", default_value_t = 4, help_heading = "Build options")] - mem_gb: usize, - // // Temporary directory - #[arg(long = "temp-dir", required = false, help_heading = "Build options")] - temp_dir: Option, - - // Verbosity - #[arg(long = "verbose", default_value_t = false)] - verbose: bool, - }, -} diff --git a/src/cli/main.rs b/src/cli/main.rs deleted file mode 100644 index 9c3c51d..0000000 --- a/src/cli/main.rs +++ /dev/null @@ -1,294 +0,0 @@ -// kbo: Spectral Burrows-Wheeler transform accelerated local alignment search -// -// Copyright 2024 Tommi Mäklin [tommi@maklin.fi]. - -// Copyrights in this project are retained by contributors. No copyright assignment -// is required to contribute to this project. - -// Except as otherwise noted (below and/or in individual files), this -// project is licensed under the Apache License, Version 2.0 -// or or -// the MIT license, or , -// at your option. -// -use std::io::Write; - -use clap::Parser; -use log::info; -use needletail::Sequence; -use needletail::parser::SequenceRecord; -use rayon::iter::ParallelIterator; -use rayon::iter::IntoParallelRefIterator; - -// Command-line interface -mod cli; - -// Given a needletail parser, reads the next contig sequence -fn read_from_fastx_parser( - reader: &mut dyn needletail::parser::FastxReader, -) -> Option { - let rec = reader.next(); - match rec { - Some(Ok(seqrec)) => { - Some(seqrec) - }, - None => None, - Some(Err(_)) => todo!(), - } -} - -// Reads all sequence data from a fastX file -fn read_fastx_file( - file: &str, -) -> Vec> { - let mut seq_data: Vec> = Vec::new(); - let mut reader = needletail::parse_fastx_file(file).unwrap_or_else(|_| panic!("Expected valid fastX file at {}", file)); - while let Some(rec) = read_from_fastx_parser(&mut *reader) { - let seqrec = rec.normalize(true); - seq_data.push(seqrec.to_vec()); - } - seq_data -} - -/// Initializes the logger with verbosity given in `log_max_level`. -fn init_log(log_max_level: usize) { - stderrlog::new() - .module(module_path!()) - .quiet(false) - .verbosity(log_max_level) - .timestamp(stderrlog::Timestamp::Off) - .init() - .unwrap(); -} - -/// Use `kbo` to list the available commands or `kbo ` to run. -/// -/// # Input format detection -/// The sequence data is read using -/// [needletail::parser::parse_fastx_file](https://docs.rs/needletail/latest/needletail/parser/fn.parse_fastx_file.html). -/// -/// Input file format (fasta or fastq) is detected automatically and -/// the files may be compressed in a -/// [DEFLATE-based](https://en.wikipedia.org/wiki/Deflate) format (.gz -/// files). -/// -/// [Bzip2](https://sourceware.org/bzip2/) and -/// [liblzma](https://tukaani.org/xz/) compression (.bz2 and .xz -/// files) can be enabled using the needletail features field in -/// kbo Cargo.toml if compiling from source. -/// -#[allow(clippy::needless_update)] -fn main() { - let cli = cli::Cli::parse(); - - // Subcommands: - match &cli.command { - Some(cli::Commands::Build { - seq_files, - output_prefix, - kmer_size, - prefix_precalc, - dedup_batches, - num_threads, - mem_gb, - temp_dir, - verbose, - }) => { - init_log(if *verbose { 2 } else { 1 }); - - let mut sbwt_build_options = kbo::index::BuildOpts::default(); - sbwt_build_options.k = *kmer_size; - sbwt_build_options.num_threads = *num_threads; - sbwt_build_options.prefix_precalc = *prefix_precalc; - sbwt_build_options.dedup_batches = *dedup_batches; - sbwt_build_options.mem_gb = *mem_gb; - sbwt_build_options.temp_dir = temp_dir.clone(); - - info!("Building SBWT index from {} files...", seq_files.len()); - let mut seq_data: Vec> = Vec::new(); - seq_files.iter().for_each(|file| { - seq_data.append(&mut read_fastx_file(file)); - }); - - let (sbwt, lcs) = kbo::build(&seq_data, sbwt_build_options); - - info!("Serializing SBWT index to {}.sbwt ...", output_prefix.as_ref().unwrap()); - info!("Serializing LCS array to {}.lcs ...", output_prefix.as_ref().unwrap()); - kbo::index::serialize_sbwt(output_prefix.as_ref().unwrap(), &sbwt, &lcs); - - }, - Some(cli::Commands::Find { - query_files, - ref_file, - index_prefix, - detailed, - min_len, - max_gap_len, - max_error_prob, - num_threads, - kmer_size, - prefix_precalc, - dedup_batches, - mem_gb, - temp_dir, - verbose, - }) => { - init_log(if *verbose { 2 } else { 1 }); - let mut sbwt_build_options = kbo::index::BuildOpts::default(); - sbwt_build_options.k = *kmer_size; - sbwt_build_options.num_threads = *num_threads; - sbwt_build_options.prefix_precalc = *prefix_precalc; - sbwt_build_options.dedup_batches = *dedup_batches; - sbwt_build_options.mem_gb = *mem_gb; - sbwt_build_options.temp_dir = temp_dir.clone(); - - let mut find_opts = kbo::FindOpts::default(); - find_opts.max_error_prob = *max_error_prob; - find_opts.max_gap_len = *max_gap_len as usize; - - let mut indexes: Vec<((sbwt::SbwtIndexVariant, sbwt::LcsArray), String, usize)> = Vec::new(); - - if index_prefix.is_some() && !ref_file.is_some() { - info!("Loading SBWT index..."); - let (sbwt, lcs) = kbo::index::load_sbwt(index_prefix.as_ref().unwrap()); - let n_kmers = match sbwt { - sbwt::SbwtIndexVariant::SubsetMatrix(ref index) => { - index.n_kmers() - }, - }; - indexes.push(((sbwt, lcs), index_prefix.clone().unwrap(), n_kmers + *kmer_size - 1)); - } else if !index_prefix.is_some() && ref_file.is_some() { - info!("Building SBWT from file {}...", ref_file.as_ref().unwrap()); - - if !*detailed { - let ref_data = read_fastx_file(ref_file.as_ref().unwrap()); - let n_bases = ref_data.iter().map(|x| x.len()).reduce(|a, b| a + b).unwrap(); - indexes.push((kbo::index::build_sbwt_from_vecs(&ref_data, &Some(sbwt_build_options)), ref_file.clone().unwrap(), n_bases)); - } else { - let mut reader = needletail::parse_fastx_file(ref_file.as_ref().unwrap()).expect("valid path/file"); - while let Some(seqrec) = read_from_fastx_parser(&mut *reader) { - let contig = seqrec.id(); - let contig_name = std::str::from_utf8(contig).expect("UTF-8"); - let seq = seqrec.normalize(true); - indexes.push((kbo::index::build_sbwt_from_vecs(&[seq.to_vec()], &Some(sbwt_build_options.clone())), contig_name.to_string(), seq.len())); - } - } - - } else { - panic!("Ambiguous reference, supply only one of `-r/--reference` and `-i/--index`"); - }; - - rayon::ThreadPoolBuilder::new() - .num_threads(*num_threads) - .thread_name(|i| format!("rayon-thread-{}", i)) - .build() - .unwrap(); - - info!("Querying SBWT index..."); - println!("query\tref\tq.start\tq.end\tstrand\tlength\tmismatches\tgap_opens\tidentity\tcoverage\tquery.contig\tref.contig"); - let stdout = std::io::stdout(); - query_files.iter().for_each(|file| { - let mut run_lengths: Vec<(kbo::format::RLE, char, String, String, usize)> = indexes.par_iter().map(|((sbwt, lcs), ref_contig, ref_bases)| { - let mut reader = needletail::parse_fastx_file(file).expect("valid path/file"); - let mut res: Vec<(kbo::format::RLE, char, String, String, usize)> = Vec::new(); - while let Some(seqrec) = read_from_fastx_parser(&mut *reader) { - let query_contig = std::str::from_utf8(seqrec.id()).expect("UTF-8"); - let seq = seqrec.normalize(true); - // Get local alignments for forward strand - res.append(&mut kbo::find(&seq, sbwt, lcs, find_opts) - .iter() - .map(|x| (*x, '+', - ref_contig.clone(), query_contig.to_string().clone(), - *ref_bases - )).collect()); - - // Add local alignments for reverse _complement - res.append(&mut kbo::find(&seq.reverse_complement(), sbwt, lcs, find_opts) - .iter() - .map(|x| (*x, '-', - ref_contig.clone(), query_contig.to_string().clone(), - *ref_bases - )).collect()); - } - res - }).flatten().collect(); - - // Sort by q.start - run_lengths.sort_by_key(|(aln, _, _, _, _)| aln.start); - - // Print results with query and ref name added - run_lengths.iter().filter(|x| x.0.end - x.0.start + 1 >= *min_len as usize) - .for_each(|(aln, strand, ref_contig, query_contig, ref_bases)| { - let aln_len = aln.end - aln.start + 1; - let coverage = (aln_len as f64)/(*ref_bases as f64) * 100_f64; - let identity = (aln.matches as f64)/(aln_len as f64) * 100_f64; - let _ = writeln!(&mut stdout.lock(), - "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{:.2}\t{:.2}\t{}\t{}", - file, ref_file.clone().unwrap(), - aln.start, - aln.end, - strand, - aln.end - aln.start + 1, - aln.mismatches, - aln.gap_opens, - identity, - coverage, - query_contig, - ref_contig); - }); - }); - }, - - Some(cli::Commands::Map { - query_files, - ref_file, - max_error_prob, - num_threads, - kmer_size, - prefix_precalc, - dedup_batches, - mem_gb, - temp_dir, - verbose, - }) => { - init_log(if *verbose { 2 } else { 1 }); - let mut sbwt_build_options = kbo::index::BuildOpts::default(); - // These are required for the subcommand to work correctly - sbwt_build_options.add_revcomp = true; - sbwt_build_options.build_select = true; - // These can be adjusted - sbwt_build_options.k = *kmer_size; - sbwt_build_options.num_threads = *num_threads; - sbwt_build_options.prefix_precalc = *prefix_precalc; - sbwt_build_options.dedup_batches = *dedup_batches; - sbwt_build_options.mem_gb = *mem_gb; - sbwt_build_options.temp_dir = temp_dir.clone(); - - let mut map_opts = kbo::MapOpts::default(); - map_opts.max_error_prob = *max_error_prob; - - rayon::ThreadPoolBuilder::new() - .num_threads(*num_threads) - .thread_name(|i| format!("rayon-thread-{}", i)) - .build() - .unwrap(); - - let ref_data = read_fastx_file(ref_file); - - let stdout = std::io::stdout(); - query_files.par_iter().for_each(|query_file| { - let query_data = read_fastx_file(query_file); - let (sbwt, lcs) = kbo::index::build_sbwt_from_vecs(&query_data, &Some(sbwt_build_options.clone())); - - let mut res: Vec = Vec::new(); - ref_data.iter().for_each(|ref_contig| { - res.append(&mut kbo::map(ref_contig, &sbwt, &lcs, map_opts)); - }); - - let _ = writeln!(&mut stdout.lock(), - ">{}\n{}", query_file, std::str::from_utf8(&res).expect("UTF-8")); - }); - }, - None => {} - } -} diff --git a/src/lib.rs b/src/lib.rs index f1b901c..6d588d8 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -34,7 +34,7 @@ //! fast retrieval of the _k_-bounded matching statistic for each _k_-mer match. //! //! # Installing the kbo executable -//! Run `cargo build --features cli --release`. +//! See installation instructions at [GitHub](https://github.com/tmaklin/kbo-cli). //! //! # Usage //! From 9e51504b9ab2c807889661b260d190f35575c870 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Mon, 18 Nov 2024 14:14:09 +0200 Subject: [PATCH 41/44] Remove CLI binary related workflows. --- .github/deploy/build_aarch64-apple-darwin.sh | 51 ---------- .github/deploy/build_x86_64-apple-darwin.sh | 51 ---------- .github/workflows/build_and_test.yml | 4 +- .github/workflows/build_artifacts.yml | 99 -------------------- .github/workflows/make_release.yml | 30 ------ 5 files changed, 2 insertions(+), 233 deletions(-) delete mode 100644 .github/deploy/build_aarch64-apple-darwin.sh delete mode 100644 .github/deploy/build_x86_64-apple-darwin.sh delete mode 100644 .github/workflows/build_artifacts.yml diff --git a/.github/deploy/build_aarch64-apple-darwin.sh b/.github/deploy/build_aarch64-apple-darwin.sh deleted file mode 100644 index 671b016..0000000 --- a/.github/deploy/build_aarch64-apple-darwin.sh +++ /dev/null @@ -1,51 +0,0 @@ -#!/bin/bash -## Build script for cross-compiling kbo for aarch64-apple-darwin. -## Run this inside https://github.com/shepherdjerred/macos-cross-compiler - -set -exo pipefail - -VER=$1 -if [[ -z $VER ]]; then - echo "Error: specify version" - exit; -fi - -rustup default stable - -mkdir /io/tmp -cd /io/tmp - -# Extract and enter source -git clone https://github.com/tmaklin/sablast.git -cd sablast -git checkout ${VER} - -mkdir -p .cargo -# Rust toolchain -rustup target add aarch64-apple-darwin - -echo "[build]" >> .cargo/config.toml -echo "target = \"aarch64-apple-darwin\"" >> .cargo/config.toml -echo "[target.aarch64-apple-darwin]" >> .cargo/config.toml -echo "linker = \"aarch64-apple-darwin22-gcc\"" >> .cargo/config.toml - -export CC="aarch64-apple-darwin22-gcc" -export CXX="aarch64-apple-darwin22-g++" - -RUSTFLAGS='-L /osxcross/SDK/MacOSX13.0.sdk/usr/lib' cargo build --all-features --release --target aarch64-apple-darwin - -## gather the stuff to distribute -target=kbo-candidate-aarch64-apple-darwin -path=/io/tmp/$target -mkdir $path -cp target/aarch64-apple-darwin/release/kbo $path/ -cp README.md $path/ -cp COPYRIGHT $path/ -cp LICENSE-APACHE $path/ -cp LICENSE-MIT $path/ -cd /io/tmp -tar -zcvf $target.tar.gz $target -sha256sum $target.tar.gz > $target".tar.gz.sha256sum" -mv $target.tar.gz /io/ -mv $target".tar.gz.sha256sum" /io/ -cd /io/ diff --git a/.github/deploy/build_x86_64-apple-darwin.sh b/.github/deploy/build_x86_64-apple-darwin.sh deleted file mode 100644 index f4056e4..0000000 --- a/.github/deploy/build_x86_64-apple-darwin.sh +++ /dev/null @@ -1,51 +0,0 @@ -#!/bin/bash -## Build script for cross-compiling kbo for aarch64-apple-darwin. -## Run this inside https://github.com/shepherdjerred/macos-cross-compiler - -set -exo pipefail - -VER=$1 -if [[ -z $VER ]]; then - echo "Error: specify version" - exit; -fi - -rustup default stable - -mkdir /io/tmp -cd /io/tmp - -# Extract and enter source -git clone https://github.com/tmaklin/sablast.git -cd sablast -git checkout ${VER} - -mkdir -p .cargo -# Rust toolchain -rustup target add x86_64-apple-darwin - -echo "[build]" >> .cargo/config.toml -echo "target = \"x86_64-apple-darwin\"" >> .cargo/config.toml -echo "[target.x86_64-apple-darwin]" >> .cargo/config.toml -echo "linker = \"x86_64-apple-darwin22-gcc\"" >> .cargo/config.toml - -export CC="x86_64-apple-darwin22-gcc" -export CXX="x86_64-apple-darwin22-g++" - -RUSTFLAGS='-L /osxcross/SDK/MacOSX13.0.sdk/usr/lib' cargo build --all-features --release --target x86_64-apple-darwin - -## gather the stuff to distribute -target=kbo-candidate-x86_64-apple-darwin -path=/io/tmp/$target -mkdir $path -cp target/x86_64-apple-darwin/release/kbo $path/ -cp README.md $path/ -cp COPYRIGHT $path/ -cp LICENSE-APACHE $path/ -cp LICENSE-MIT $path/ -cd /io/tmp -tar -zcvf $target.tar.gz $target -sha256sum $target.tar.gz > $target".tar.gz.sha256sum" -mv $target.tar.gz /io/ -mv $target".tar.gz.sha256sum" /io/ -cd /io/ diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 63b3a35..c27b3a7 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -25,7 +25,7 @@ jobs: run: rustup update ${{ matrix.toolchain }} && rustup default ${{ matrix.toolchain }} - name: Build binary - run: cargo build --features cli --verbose + run: cargo build --all-features --verbose - name: Run unit and integration tests - run: cargo test --no-fail-fast --verbose + run: cargo test --all-features --no-fail-fast --verbose diff --git a/.github/workflows/build_artifacts.yml b/.github/workflows/build_artifacts.yml deleted file mode 100644 index 39c1a88..0000000 --- a/.github/workflows/build_artifacts.yml +++ /dev/null @@ -1,99 +0,0 @@ -name: Build artifacts for release - -on: - pull_request: - branches: [main] - -jobs: - build_linux-x86_64: - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v3 - - name: Compile - id: compile - uses: rust-build/rust-build.action@v1.4.5 - with: - ARCHIVE_TYPES: tar.gz - ARCHIVE_NAME: kbo-candidate-x86_64-unknown-linux-musl - RUSTTARGET: x86_64-unknown-linux-musl - EXTRA_FEATURES: "cli" - EXTRA_FILES: "COPYRIGHT LICENSE-APACHE LICENSE-MIT README.md" - TOOLCHAIN_VERSION: stable-2024-09-05 - UPLOAD_MODE: none - - - name: Upload x86_64-unknown-linux-musl - uses: actions/upload-artifact@v4 - with: - name: kbo-candidate-x86_64-unknown-linux-musl - path: | - ${{ steps.compile.outputs.BUILT_ARCHIVE }} - ${{ steps.compile.outputs.BUILT_CHECKSUM }} - - build_macOS-x86_64: - runs-on: ubuntu-latest - container: ghcr.io/shepherdjerred/macos-cross-compiler:latest - steps: - - name: Install wget and git - id: install-wget - run: apt-get update && apt install -y wget git - - - name: Create io directory - id: mkdir-io - run: mkdir /io && cd /io - - - name: Download build script - id: dl-build-script - run: wget https://raw.githubusercontent.com/${{ github.repository }}/${{ github.head_ref }}/.github/deploy/build_x86_64-apple-darwin.sh - - - name: Compile in macOS Cross Compiler container - id: compile-in-container - run: chmod +x build_x86_64-apple-darwin.sh && ./build_x86_64-apple-darwin.sh ${{ github.head_ref }} arm64 - - - name: Upload x86_64-apple-darwin - if: success() - uses: actions/upload-artifact@v4 - with: - name: kbo-candidate-x86_64-apple-darwin - path: /io/kbo-candidate-x86_64-apple-darwin.tar.gz - - - name: Upload x86_64-apple-darwin sha256sum - if: success() - uses: actions/upload-artifact@v4 - with: - name: kbo-candidate-x86_64-apple-darwin-sha256sum - path: /io/kbo-candidate-x86_64-apple-darwin.tar.gz.sha256sum - - build_macOS-aarch64: - runs-on: ubuntu-latest - container: ghcr.io/shepherdjerred/macos-cross-compiler:latest - steps: - - name: Install wget and git - id: install-wget - run: apt-get update && apt install -y wget git - - - name: Create io directory - id: mkdir-io - run: mkdir /io && cd /io - - - name: Download build script - id: dl-build-script - run: wget https://raw.githubusercontent.com/${{ github.repository }}/${{ github.head_ref }}/.github/deploy/build_aarch64-apple-darwin.sh - - - name: Compile in macOS Cross Compiler container - id: compile-in-container - run: chmod +x build_aarch64-apple-darwin.sh && ./build_aarch64-apple-darwin.sh ${{ github.head_ref }} - - - name: Upload aarch64-apple-darwin - if: success() - uses: actions/upload-artifact@v4 - with: - name: kbo-candidate-aarch64-apple-darwin - path: /io/kbo-candidate-aarch64-apple-darwin.tar.gz - - - name: Upload aarch64-apple-darwin sha256sum - if: success() - uses: actions/upload-artifact@v4 - with: - name: kbo-candidate-aarch64-apple-darwin-sha256sum - path: /io/kbo-candidate-aarch64-apple-darwin.tar.gz.sha256sum diff --git a/.github/workflows/make_release.yml b/.github/workflows/make_release.yml index c27efb1..5c8558c 100644 --- a/.github/workflows/make_release.yml +++ b/.github/workflows/make_release.yml @@ -15,32 +15,6 @@ jobs: environment: Make kbo release steps: - - name: Download artifacts - uses: dawidd6/action-download-artifact@v6 - with: - workflow: build_artifacts.yml - workflow_conclusion: success - - - name: Organise files - shell: bash - run: | - mv kbo-candidate-x86_64-unknown-linux-musl/kbo-candidate-x86_64-unknown-linux-musl.tar.gz ./ - mv kbo-candidate-x86_64-apple-darwin/kbo-candidate-x86_64-apple-darwin.tar.gz ./ - mv kbo-candidate-aarch64-apple-darwin/kbo-candidate-aarch64-apple-darwin.tar.gz ./ - - - name: Rename mac artifacts - shell: bash - run: | - tar -zxvf kbo-candidate-x86_64-apple-darwin.tar.gz && mv kbo-candidate-x86_64-apple-darwin kbo-${{ github.ref_name }}-x86_64-apple-darwin && tar -zcvf kbo-candidate-x86_64-apple-darwin.tar.gz kbo-${{ github.ref_name }}-x86_64-apple-darwin - tar -zxvf kbo-candidate-aarch64-apple-darwin.tar.gz && mv kbo-candidate-aarch64-apple-darwin kbo-${{ github.ref_name }}-aarch64-apple-darwin && tar -zcvf kbo-candidate-aarch64-apple-darwin.tar.gz kbo-${{ github.ref_name }}-aarch64-apple-darwin - - - name: Designate version - shell: bash - run: | - mv kbo-candidate-x86_64-unknown-linux-musl.tar.gz kbo-${{ github.ref_name }}-x86_64-linux-musl.tar.gz - mv kbo-candidate-x86_64-apple-darwin.tar.gz kbo-${{ github.ref_name }}-x86_64-apple-darwin.tar.gz - mv kbo-candidate-aarch64-apple-darwin.tar.gz kbo-${{ github.ref_name }}-aarch64-apple-darwin.tar.gz - - name: Get current date id: date run: echo "::set-output name=date::$(date +'%d %B %Y')" @@ -54,7 +28,3 @@ jobs: prerelease: false fail_on_unmatched_files: true generate_release_notes: true - files: | - kbo-${{ github.ref_name }}-x86_64-linux-musl.tar.gz - kbo-${{ github.ref_name }}-x86_64-apple-darwin.tar.gz - kbo-${{ github.ref_name }}-aarch64-apple-darwin.tar.gz \ No newline at end of file From f4f3cbea919be84ae94d371cc7dde84646f04b6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Mon, 18 Nov 2024 14:19:50 +0200 Subject: [PATCH 42/44] Remove cleared TODO. --- src/format.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/format.rs b/src/format.rs index b5f5627..ed02dea 100644 --- a/src/format.rs +++ b/src/format.rs @@ -209,8 +209,6 @@ pub fn run_lengths_gapped( i += 1; } - // TODO If the match starts or ends with a gap, the gap should be removed. - let rle: RLE = if aln[std::cmp::min(i, aln.len() - 1)] == '-' { // Don't count gaps at the end of a a match From 954ec91ec41b2f8bd179627fb1b2f63772855c50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Mon, 18 Nov 2024 14:20:11 +0200 Subject: [PATCH 43/44] Bump minor version. --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index dcd0c56..9fa2a41 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "kbo" -version = "0.3.0" +version = "0.4.0" edition = "2021" rust-version = "1.77.0" authors = ["Tommi Mäklin "] From fe64a38da4913d345a06ac144fc6bb8af3f47338 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Mon, 18 Nov 2024 14:29:04 +0200 Subject: [PATCH 44/44] Update README. --- README.md | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/README.md b/README.md index b48b344..0d4be61 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,35 @@ # kbo Spectral Burrows-Wheeler transform accelerated local alignment search. +kbo is an approximate local aligner based on converting [_k_-bounded matching +statistics](https://www.biorxiv.org/content/10.1101/2024.02.19.580943v1) +into a character representation of the underlying alignment sequence. + +Documentation is available at [https://docs.rs/kbo](https://docs.rs/kbo). + +## Installation +kbo is distributed as three separate Rust packages: +- [kbo](https://github.com/tmaklin/kbo) contains a Rust library implementing the core algorithm (this repository). +- [kbo-cli](https://github.com/tmaklin/kbo-cli) provides a command-line interface for `kbo find` and `kbo map`. +- [kbo-gui](https://github.com/tmaklin/kbo-cli) provides a WebAssembly graphical user interface for running kbo in the browser. + +## About +kbo supports two main operations: + +- `kbo find` matches the _k_-mers in a query sequence with the + reference and reports the local alignment segments found within the + reference. Find is useful for problems that can be solved with + [blast](https://blast.ncbi.nlm.nih.gov/Blast.cgi). +- `kbo map` maps the query sequence against a reference + sequence, and reports the nucleotide sequence of the alignment relative to + the reference. Map solves the same problem as + [snippy](https://github.com/tseemann/snippy) and [ska + map](https://docs.rs/ska/latest/ska/#ska-map). + +kbo uses the [Spectral Burrows-Wheeler +Transform](https://docs.rs/sbwt/latest/sbwt/) data structure that allows +efficient _k_-mer matching between a target and a query sequence and +fast retrieval of the _k_-bounded matching statistic for each _k_-mer match. + ## License kbo is dual-licensed under the [MIT](LICENSE-MIT) and [Apache 2.0](LICENSE-APACHE) licenses.