From 8b1adb42d15dfd2556baa4861dd0849dde955120 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Tue, 1 Oct 2024 22:21:56 +0300 Subject: [PATCH 01/21] Gate building the command line interface behind feature cli. --- Cargo.toml | 30 +++++++++++++++++++++--------- src/cli.rs | 2 ++ 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index c4339af..7cb2a8b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,24 +12,36 @@ license = "MIT OR Apache-2.0" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html +[features] +cli = ["dep:needletail", "dep:clap", "dep:rayon", "dep:log", "dep:stderrlog"] + +[[bin]] +name = "sablast" +path = "src/main.rs" +required-features = ["cli"] + [dependencies] ## core -# TODO Re-enable reading compressed sequences in needletail -# This requires resolving the libllzma linker issue in build_artifacts.yml -needletail = { version = "0.5.1", default-features = false } -rayon = "1" sbwt = "0.3.1" ## cli -clap = { version = "4", features = ["derive"] } +clap = { version = "4", features = ["derive"], optional = true} + +### io +#### TODO Re-enable reading compressed sequences in needletail +##### This requires resolving the libllzma linker issue in build_artifacts.yml +needletail = { version = "0.5.1", default-features = false, optional = true } + +### logging +log = { version = "0.4.20", optional = true } +stderrlog = { version = "0.6.0", optional = true } + +### threading +rayon = { version = "1", optional = true } ## docs embed-doc-image="0.1.4" -## logging -log = "0.4.20" -stderrlog = "0.6.0" - [dev-dependencies] ## tests assert_approx_eq = "1" diff --git a/src/cli.rs b/src/cli.rs index 0d2fd8c..758f912 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -16,12 +16,14 @@ use clap::{Parser, Subcommand}; #[derive(Parser)] #[command(version)] #[command(propagate_version = true)] +#[cfg(feature = "cli")] pub struct Cli { #[command(subcommand)] pub command: Option, } #[derive(Subcommand)] +#[cfg(feature = "cli")] pub enum Commands { // Build SBWT index Build { From cefff2a6ab4162ddcb024487b576b989c1da7ad2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Tue, 1 Oct 2024 22:22:15 +0300 Subject: [PATCH 02/21] Build command line interface for testing and packaging. --- .github/workflows/build_and_test.yml | 5 +---- .github/workflows/build_artifacts.yml | 1 + 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 9fcc6c8..63b3a35 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -25,10 +25,7 @@ jobs: run: rustup update ${{ matrix.toolchain }} && rustup default ${{ matrix.toolchain }} - name: Build binary - run: cargo build --verbose + run: cargo build --features cli --verbose - name: Run unit and integration tests run: cargo test --no-fail-fast --verbose - - - name: Run documenation examples as tests - run: cargo test --doc --verbose diff --git a/.github/workflows/build_artifacts.yml b/.github/workflows/build_artifacts.yml index 4108727..6e45b97 100644 --- a/.github/workflows/build_artifacts.yml +++ b/.github/workflows/build_artifacts.yml @@ -17,6 +17,7 @@ jobs: ARCHIVE_TYPES: tar.gz ARCHIVE_NAME: sablast-candidate-x86_64-unknown-linux-musl RUSTTARGET: x86_64-unknown-linux-musl + EXTRA_FEATURES: "cli" EXTRA_FILES: "COPYRIGHT LICENSE-APACHE LICENSE-MIT README.md" TOOLCHAIN_VERSION: stable-2024-09-05 UPLOAD_MODE: none From b8fae8807a208480a4ad49682e7e5f4883773c9d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Tue, 1 Oct 2024 22:30:40 +0300 Subject: [PATCH 03/21] Move CLI sources to src/cli/. --- Cargo.toml | 2 +- src/{ => cli}/cli.rs | 0 src/{ => cli}/main.rs | 0 3 files changed, 1 insertion(+), 1 deletion(-) rename src/{ => cli}/cli.rs (100%) rename src/{ => cli}/main.rs (100%) diff --git a/Cargo.toml b/Cargo.toml index 7cb2a8b..d423f32 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,7 +17,7 @@ cli = ["dep:needletail", "dep:clap", "dep:rayon", "dep:log", "dep:stderrlog"] [[bin]] name = "sablast" -path = "src/main.rs" +path = "src/cli/main.rs" required-features = ["cli"] [dependencies] diff --git a/src/cli.rs b/src/cli/cli.rs similarity index 100% rename from src/cli.rs rename to src/cli/cli.rs diff --git a/src/main.rs b/src/cli/main.rs similarity index 100% rename from src/main.rs rename to src/cli/main.rs From fde641c3b7bc24c15f4f36afcbf18cb239577b44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Tue, 1 Oct 2024 22:36:34 +0300 Subject: [PATCH 04/21] Fix indent --- .github/workflows/build_artifacts.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_artifacts.yml b/.github/workflows/build_artifacts.yml index 6e45b97..9bdf339 100644 --- a/.github/workflows/build_artifacts.yml +++ b/.github/workflows/build_artifacts.yml @@ -17,7 +17,7 @@ jobs: ARCHIVE_TYPES: tar.gz ARCHIVE_NAME: sablast-candidate-x86_64-unknown-linux-musl RUSTTARGET: x86_64-unknown-linux-musl - EXTRA_FEATURES: "cli" + EXTRA_FEATURES: "cli" EXTRA_FILES: "COPYRIGHT LICENSE-APACHE LICENSE-MIT README.md" TOOLCHAIN_VERSION: stable-2024-09-05 UPLOAD_MODE: none From 45477b6986d1f21289926e9f6f87961a260ceca7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Thu, 17 Oct 2024 15:35:50 +0300 Subject: [PATCH 05/21] Add extra warnings. --- src/lib.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/lib.rs b/src/lib.rs index 4465e80..b3be58c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -11,6 +11,13 @@ // the MIT license, or , // at your option. // +#![warn(missing_docs, + missing_debug_implementations, missing_copy_implementations, + trivial_casts, trivial_numeric_casts, + unsafe_code, + unstable_features, + unused_import_braces, unused_qualifications)] + use sbwt::SbwtIndexVariant; pub mod derandomize; From 6931301804f54cadb996724b8c34d956032af89b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Thu, 17 Oct 2024 15:35:59 +0300 Subject: [PATCH 06/21] Document fields and default(). --- src/index.rs | 70 ++++++++++++++++++++++++---------------------------- 1 file changed, 32 insertions(+), 38 deletions(-) diff --git a/src/index.rs b/src/index.rs index 6446952..350093a 100644 --- a/src/index.rs +++ b/src/index.rs @@ -23,56 +23,50 @@ use sbwt::SbwtIndexVariant; /// Controls the parameters and resources available to the SBWT construction algorithm. /// -/// Used to specify values for: -/// - _k_-mer size `k`. -/// - Reverse complement input sequences `add_revcomp`. -/// - Number of threads `num_threads` to use. -/// - Size of the precalculated lookup table `prefix_precalc`. -/// - Build select support `build_select` (required for [map]). -/// - RAM available (in GB) to construction algorithm `mem_gb`. -/// - Deduplicate _k_-mer batches in construction algorithm `dedup_batches`. -/// - Temporary directory path `temp_dir`. -/// -/// Note that this struct is marked non_exhaustive, meaning that new -/// public fields may be added in the future. -/// -/// Same as initializing manually with these values: -/// ```rust -/// let mut opts = sablast::index::BuildOpts::default(); -/// opts.k = 31; -/// opts.add_revcomp = false; -/// opts.num_threads = 1; -/// opts.prefix_precalc = 8; -/// opts.build_select = false; -/// opts.mem_gb = 4; -/// opts.dedup_batches = false; -/// opts.temp_dir = None; -/// -/// # let expected = sablast::index::BuildOpts::default(); -/// # assert_eq!(opts.k, expected.k); -/// # assert_eq!(opts.add_revcomp, expected.add_revcomp); -/// # assert_eq!(opts.num_threads, expected.num_threads); -/// # assert_eq!(opts.prefix_precalc, expected.prefix_precalc); -/// # assert_eq!(opts.build_select, expected.build_select); -/// # assert_eq!(opts.mem_gb, expected.mem_gb); -/// # assert_eq!(opts.dedup_batches, expected.dedup_batches); -/// # assert_eq!(opts.temp_dir, expected.temp_dir); -/// ``` -/// -#[derive(Clone)] +#[derive(Clone, Debug)] #[non_exhaustive] pub struct BuildOpts { + /// - _k_-mer size `k`. pub k: usize, + /// - Reverse complement input sequences `add_revcomp`. pub add_revcomp: bool, + /// - Number of threads `num_threads` to use. pub num_threads: usize, + /// - Size of the precalculated lookup table `prefix_precalc`. pub prefix_precalc: usize, + /// - Build select support `build_select` (required for [map]). pub build_select: bool, + /// - RAM available (in GB) to construction algorithm `mem_gb`. pub mem_gb: usize, + /// - Deduplicate _k_-mer batches in construction algorithm `dedup_batches`. pub dedup_batches: bool, + /// - Temporary directory path `temp_dir`. pub temp_dir: Option, } -// Defaults + impl Default for BuildOpts { + /// Default to these values: + /// ```rust + /// let mut opts = sablast::index::BuildOpts::default(); + /// opts.k = 31; + /// opts.add_revcomp = false; + /// opts.num_threads = 1; + /// opts.prefix_precalc = 8; + /// opts.build_select = false; + /// opts.mem_gb = 4; + /// opts.dedup_batches = false; + /// opts.temp_dir = None; + /// # let expected = sablast::index::BuildOpts::default(); + /// # assert_eq!(opts.k, expected.k); + /// # assert_eq!(opts.add_revcomp, expected.add_revcomp); + /// # assert_eq!(opts.num_threads, expected.num_threads); + /// # assert_eq!(opts.prefix_precalc, expected.prefix_precalc); + /// # assert_eq!(opts.build_select, expected.build_select); + /// # assert_eq!(opts.mem_gb, expected.mem_gb); + /// # assert_eq!(opts.dedup_batches, expected.dedup_batches); + /// # assert_eq!(opts.temp_dir, expected.temp_dir); + /// ``` + /// fn default() -> BuildOpts { BuildOpts { k: 31, From 89d6f82b83d1f62e235ccc45d66d3de4c2484736 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Thu, 17 Oct 2024 15:38:15 +0300 Subject: [PATCH 07/21] Remove qualifications from sbwt::SbwtIndexVariant. --- src/index.rs | 12 ++++++------ src/lib.rs | 10 +++++----- src/translate.rs | 6 +++--- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/index.rs b/src/index.rs index 350093a..d706d36 100644 --- a/src/index.rs +++ b/src/index.rs @@ -112,7 +112,7 @@ impl Default for BuildOpts { pub fn build_sbwt_from_vecs( slices: &[Vec], build_options: &Option, -) -> (sbwt::SbwtIndexVariant, sbwt::LcsArray) { +) -> (SbwtIndexVariant, sbwt::LcsArray) { assert!(!slices.is_empty()); let build_opts = if build_options.is_some() { build_options.clone().unwrap() } else { BuildOpts::default() }; @@ -164,7 +164,7 @@ pub fn build_sbwt_from_vecs( /// pub fn serialize_sbwt( outfile_prefix: &str, - sbwt: &sbwt::SbwtIndexVariant, + sbwt: &SbwtIndexVariant, lcs: &sbwt::LcsArray, ) { let sbwt_outfile = format!("{}.sbwt", outfile_prefix); @@ -218,9 +218,9 @@ pub fn serialize_sbwt( /// let (sbwt_loaded, lcs_loaded) = load_sbwt(&index_prefix); /// # assert_eq!(lcs, lcs_loaded); /// # match sbwt_loaded { -/// # sbwt::SbwtIndexVariant::SubsetMatrix(ref loaded) => { +/// # SbwtIndexVariant::SubsetMatrix(ref loaded) => { /// # match sbwt_loaded { -/// # sbwt::SbwtIndexVariant::SubsetMatrix(ref built) => { +/// # SbwtIndexVariant::SubsetMatrix(ref built) => { /// # assert_eq!(built, loaded); /// # }, /// # }; @@ -230,7 +230,7 @@ pub fn serialize_sbwt( /// pub fn load_sbwt( index_prefix: &str, -) -> (sbwt::SbwtIndexVariant, sbwt::LcsArray) { +) -> (SbwtIndexVariant, sbwt::LcsArray) { let indexfile = format!("{}.sbwt", index_prefix); let lcsfile = format!("{}.lcs", index_prefix); @@ -277,7 +277,7 @@ pub fn load_sbwt( /// pub fn query_sbwt( query: &[u8], - sbwt: &sbwt::SbwtIndexVariant, + sbwt: &SbwtIndexVariant, lcs: &sbwt::LcsArray, ) -> Vec<(usize, Range)> { assert!(!query.is_empty()); diff --git a/src/lib.rs b/src/lib.rs index b3be58c..baf0338 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -40,7 +40,7 @@ pub mod translate; /// sequences the alignments are for. /// /// Returns a tuple containing the built -/// [sbwt::SbwtIndexVariant](https://docs.rs/sbwt/latest/sbwt/enum.SbwtIndexVariant.html) +/// [SbwtIndexVariant](https://docs.rs/sbwt/latest/sbwt/enum.SbwtIndexVariant.html) /// and /// [sbwt::LcsArray](https://docs.rs/sbwt/latest/sbwt/struct.LcsArray.html). /// @@ -61,7 +61,7 @@ pub mod translate; pub fn build( seq_data: &[Vec], build_opts: index::BuildOpts, -) -> (sbwt::SbwtIndexVariant, sbwt::LcsArray) { +) -> (SbwtIndexVariant, sbwt::LcsArray) { index::build_sbwt_from_vecs(seq_data, &Some(build_opts)) } @@ -103,7 +103,7 @@ pub fn build( /// pub fn matches( query_seq: &[u8], - sbwt: &sbwt::SbwtIndexVariant, + sbwt: &SbwtIndexVariant, lcs: &sbwt::LcsArray, ) -> Vec { let (k, threshold) = match sbwt { @@ -148,7 +148,7 @@ pub fn matches( /// pub fn map( ref_seq: &[u8], - query_sbwt: &sbwt::SbwtIndexVariant, + query_sbwt: &SbwtIndexVariant, query_lcs: &sbwt::LcsArray, ) -> Vec { let (k, threshold) = match query_sbwt { @@ -200,7 +200,7 @@ pub fn map( /// pub fn find( query_seq: &[u8], - sbwt: &sbwt::SbwtIndexVariant, + sbwt: &SbwtIndexVariant, lcs: &sbwt::LcsArray, ) -> Vec<(usize, usize, usize, usize)> { let aln = matches(query_seq, sbwt, lcs); diff --git a/src/translate.rs b/src/translate.rs index 14c138a..4010a7f 100644 --- a/src/translate.rs +++ b/src/translate.rs @@ -322,7 +322,7 @@ pub fn translate_ms_vec( /// use sablast::derandomize::random_match_threshold; /// use sablast::translate::translate_ms_vec; /// use sablast::translate::refine_translation; -/// use sbwt::SbwtIndexVariant; +/// use SbwtIndexVariant; /// /// // Parameters : k = 4, threshold = 3 /// // @@ -363,7 +363,7 @@ pub fn translate_ms_vec( pub fn refine_translation( translation: &[char], noisy_ms: &[(usize, Range)], - query_sbwt: &sbwt::SbwtIndexVariant, + query_sbwt: &SbwtIndexVariant, threshold: usize, ) -> Vec { let n_elements = translation.len(); @@ -549,7 +549,7 @@ mod tests { use crate::derandomize::random_match_threshold; use super::translate_ms_vec; use super::refine_translation; - use sbwt::SbwtIndexVariant; + use SbwtIndexVariant; // Parameters : k = 4, threshold = 3 // From 49c33fb7972bd4e71e3720d5551b95b1a82d8023 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Thu, 17 Oct 2024 15:51:59 +0300 Subject: [PATCH 08/21] Add qualifications to tests that broke. --- src/index.rs | 4 ++-- src/translate.rs | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/index.rs b/src/index.rs index d706d36..9b2881f 100644 --- a/src/index.rs +++ b/src/index.rs @@ -218,9 +218,9 @@ pub fn serialize_sbwt( /// let (sbwt_loaded, lcs_loaded) = load_sbwt(&index_prefix); /// # assert_eq!(lcs, lcs_loaded); /// # match sbwt_loaded { -/// # SbwtIndexVariant::SubsetMatrix(ref loaded) => { +/// # sbwt::SbwtIndexVariant::SubsetMatrix(ref loaded) => { /// # match sbwt_loaded { -/// # SbwtIndexVariant::SubsetMatrix(ref built) => { +/// # sbwt::SbwtIndexVariant::SubsetMatrix(ref built) => { /// # assert_eq!(built, loaded); /// # }, /// # }; diff --git a/src/translate.rs b/src/translate.rs index 4010a7f..e216ced 100644 --- a/src/translate.rs +++ b/src/translate.rs @@ -322,7 +322,7 @@ pub fn translate_ms_vec( /// use sablast::derandomize::random_match_threshold; /// use sablast::translate::translate_ms_vec; /// use sablast::translate::refine_translation; -/// use SbwtIndexVariant; +/// use sbwt::SbwtIndexVariant; /// /// // Parameters : k = 4, threshold = 3 /// // @@ -549,7 +549,7 @@ mod tests { use crate::derandomize::random_match_threshold; use super::translate_ms_vec; use super::refine_translation; - use SbwtIndexVariant; + use sbwt::SbwtIndexVariant; // Parameters : k = 4, threshold = 3 // From 148b55ba08590d5b70ccd3247f734f118f2fa84c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Tue, 29 Oct 2024 15:21:22 +0200 Subject: [PATCH 09/21] Fix links in docs. --- src/index.rs | 2 +- src/lib.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/index.rs b/src/index.rs index 9b2881f..02f919b 100644 --- a/src/index.rs +++ b/src/index.rs @@ -34,7 +34,7 @@ pub struct BuildOpts { pub num_threads: usize, /// - Size of the precalculated lookup table `prefix_precalc`. pub prefix_precalc: usize, - /// - Build select support `build_select` (required for [map]). + /// - Build select support `build_select` (required for [map](super::map)). pub build_select: bool, /// - RAM available (in GB) to construction algorithm `mem_gb`. pub mem_gb: usize, diff --git a/src/lib.rs b/src/lib.rs index baf0338..3378c62 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -169,7 +169,7 @@ pub fn map( /// Finds the _k_-mers from an SBWT index in a query fasta or fastq file. /// /// Aligns the sequence data in `query_seq` against the SBWT index -/// `sbwt` and its LCS array `lcs` using [matches]. Then uses +/// `sbwt` and its LCS array `lcs` using [matches][matches()]. Then uses /// [format::run_lengths] to extract the local alignments from the /// matching statistics. /// From c265018a725df5d138d8045c1f356ccd4a6f5fb1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Tue, 29 Oct 2024 15:22:18 +0200 Subject: [PATCH 10/21] Tabs -> spaces. --- src/format.rs | 58 +++++++++++++++++++++++++-------------------------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/src/format.rs b/src/format.rs index 199e781..92947fe 100644 --- a/src/format.rs +++ b/src/format.rs @@ -22,19 +22,19 @@ pub fn run_lengths( let mut i = 0; let mut match_start: bool = false; while i < aln.len() { - match_start = (aln[i] != '-' && aln[i] != ' ') && !match_start; - if match_start { - let start = i; - let mut matches: usize = 0; - while i < aln.len() && (aln[i] != '-' && aln[i] != ' ') { - matches += (aln[i] == 'M' || aln[i] == 'R') as usize; - i += 1; - } - encodings.push((start + 1, i, matches, i - start - matches)); - match_start = false; - } else { - i += 1; - } + match_start = (aln[i] != '-' && aln[i] != ' ') && !match_start; + if match_start { + let start = i; + let mut matches: usize = 0; + while i < aln.len() && (aln[i] != '-' && aln[i] != ' ') { + matches += (aln[i] == 'M' || aln[i] == 'R') as usize; + i += 1; + } + encodings.push((start + 1, i, matches, i - start - matches)); + match_start = false; + } else { + i += 1; + } } encodings } @@ -44,18 +44,18 @@ pub fn relative_to_ref( alignment: &[char], ) -> Vec { ref_seq.iter().zip(alignment.iter()).map(|x| { - if *x.1 == 'M' || *x.1 == 'R' { - *x.0 - } else if *x.1 == 'X' { - // 'X' is an unresolved SNP - b'-' - } else if *x.1 != '-' { - // Other possible values are resolved SNPs (A,C,G,T) - *x.1 as u8 - } else { - b'-' - } - }).collect() + if *x.1 == 'M' || *x.1 == 'R' { + *x.0 + } else if *x.1 == 'X' { + // 'X' is an unresolved SNP + b'-' + } else if *x.1 != '-' { + // Other possible values are resolved SNPs (A,C,G,T) + *x.1 as u8 + } else { + b'-' + } + }).collect() } //////////////////////////////////////////////////////////////////////////////// @@ -65,9 +65,9 @@ pub fn relative_to_ref( mod tests { #[test] fn run_lengths() { - let expected: Vec<(usize, usize, usize, usize)> = vec![(6,33,28,0),(82,207,126,0),(373,423,51,0),(488,512,25,0)]; - let input = vec!['-','-','-','-','-','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M']; - let got = super::run_lengths(&input); - assert_eq!(got, expected); + let expected: Vec<(usize, usize, usize, usize)> = vec![(6,33,28,0),(82,207,126,0),(373,423,51,0),(488,512,25,0)]; + let input = vec!['-','-','-','-','-','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M']; + let got = super::run_lengths(&input); + assert_eq!(got, expected); } } From 1d4fc175699e33cc653d7637b4a1fb6e4e4c0639 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Tue, 29 Oct 2024 15:44:50 +0200 Subject: [PATCH 11/21] Document run_lengths. --- src/format.rs | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/src/format.rs b/src/format.rs index 92947fe..4c8705b 100644 --- a/src/format.rs +++ b/src/format.rs @@ -13,6 +13,43 @@ // //! Converting alignment representations into various output formats. +/// Extracts run length encodings from a translated alignment. +/// +/// Traverses the character representation of the alignment stored in `aln` and +/// counts the consecutive run lengths of sections that align to the reference. +/// A base is counted as aligned if its character representation is not '-' or ' '. +/// +/// This function can be used for both plain and refined translations. +/// +/// Returns a vector where each element contains 4 elements with the contents: +/// - Start position of a run of consecutive characters (indexing starts from 1). +/// - End position of a run of consecutive characters (indexing starts from 1). +/// - Number matching bases in the run (character representation is 'M' or 'R'). +/// - Number mismatching bases in the run (character representation is not 'M', '-', or ' '). +/// +/// The run length is the sum of the matching and mismatching bases. +/// +/// # Examples +/// ## Extract run lengths from a character representation +/// ```rust +/// use sablast::format::run_lengths; +/// +/// // Parameters : k = 3, threshold = 2 +/// // +/// // Ref sequence : A,A,A,G,A,A,C,C,A,-,T,C,A, -,-,G,G,G, C,G +/// // Query sequence : C,A,A,G,-,-,C,C,A,C,T,C,A, T,T,G,G,G, T,C +/// // Input MS : 0,1,2,3, 1,2,3,0,1,2,3,-1,0,1,2,3,-1,0 +/// // Translation : X,M,M,R, , ,R,M,M,X,M,M,M, , ,M,M,M, , +/// +/// // Expected output: [(1, 11, 9, 2), +/// // (14, 16, 3, 0)] +/// +/// let input: Vec = vec!['X','M','M','R','R','M','M','X','M','M','M','-','-','M','M','M','-','-']; +/// let run_lengths = run_lengths(&input); +/// # let expected = vec![(1,11,9,2),(14,16,3,0)]; +/// # assert_eq!(run_lengths, expected); +/// ``` +/// pub fn run_lengths( aln: &[char], ) -> Vec<(usize, usize, usize, usize)> { From 9c931661117f6d63123af0e2053b28c08c9d4fde Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Tue, 29 Oct 2024 16:16:27 +0200 Subject: [PATCH 12/21] Partial documentation for format.rs --- src/format.rs | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/src/format.rs b/src/format.rs index 4c8705b..e22bd2c 100644 --- a/src/format.rs +++ b/src/format.rs @@ -76,6 +76,36 @@ pub fn run_lengths( encodings } +/// Format a refined translation relative to the reference sequence. +/// +/// Jointly reads nucleotides from the reference sequence `ref_seq` and the +/// character representation of an alignment `alignment` to determine the +/// nucleotide sequence of the alignment relative to the reference. +/// +/// Only works with refined translations as the input (TODO verify). +/// +/// Returns a vector containing alignment of the query against `ref_seq` in a +/// gapped alignment format. +/// +/// Valid characters in the return format are: +/// - 'A', 'C', 'G', 'T': the nucleotide in the query sequence. +/// - '-': gap in the query. +/// +/// If `alignment` is a refined translation, the nucleotides ACGT in the return +/// value may differ from the reference sequence and the gaps '-' represent +/// sections absent from the query. +/// +/// If `alignment` is an unrefined translation, the nucleotides ACGT are always +/// the same as in the reference sequence. Gaps '-' may represent either +/// unresolved SNPs, insertions, or absent sections. +/// +/// # Examples +/// ## Format a refined translation +/// +/// TODO Add test to relative_to_ref documentation. +/// +/// ``` +/// pub fn relative_to_ref( ref_seq: &[u8], alignment: &[char], From cf7a786c29103e814520764fdefe82e22a1c69ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Thu, 31 Oct 2024 13:53:38 +0200 Subject: [PATCH 13/21] Fix indentation that broke at some point. --- src/cli/main.rs | 270 ++++++++++++++++++++++++------------------------ 1 file changed, 135 insertions(+), 135 deletions(-) diff --git a/src/cli/main.rs b/src/cli/main.rs index a89356e..f548885 100644 --- a/src/cli/main.rs +++ b/src/cli/main.rs @@ -74,152 +74,152 @@ fn main() { // Subcommands: match &cli.command { Some(cli::Commands::Build { - seq_files, - output_prefix, + seq_files, + output_prefix, kmer_size, - prefix_precalc, - dedup_batches, - num_threads, - mem_gb, - temp_dir, - verbose, + prefix_precalc, + dedup_batches, + num_threads, + mem_gb, + temp_dir, + verbose, }) => { - init_log(if *verbose { 2 } else { 1 }); + init_log(if *verbose { 2 } else { 1 }); let mut sbwt_build_options = sablast::index::BuildOpts::default(); - sbwt_build_options.k = *kmer_size; - sbwt_build_options.num_threads = *num_threads; - sbwt_build_options.prefix_precalc = *prefix_precalc; - sbwt_build_options.dedup_batches = *dedup_batches; - sbwt_build_options.mem_gb = *mem_gb; - sbwt_build_options.temp_dir = temp_dir.clone(); - - info!("Building SBWT index from {} files...", seq_files.len()); - let mut seq_data: Vec> = Vec::new(); - seq_files.iter().for_each(|file| { - seq_data.append(&mut read_fastx_file(file)); - }); - - let (sbwt, lcs) = sablast::build(&seq_data, sbwt_build_options); - - info!("Serializing SBWT index to {}.sbwt ...", output_prefix.as_ref().unwrap()); - info!("Serializing LCS array to {}.lcs ...", output_prefix.as_ref().unwrap()); - sablast::index::serialize_sbwt(output_prefix.as_ref().unwrap(), &sbwt, &lcs); - - }, + sbwt_build_options.k = *kmer_size; + sbwt_build_options.num_threads = *num_threads; + sbwt_build_options.prefix_precalc = *prefix_precalc; + sbwt_build_options.dedup_batches = *dedup_batches; + sbwt_build_options.mem_gb = *mem_gb; + sbwt_build_options.temp_dir = temp_dir.clone(); + + info!("Building SBWT index from {} files...", seq_files.len()); + let mut seq_data: Vec> = Vec::new(); + seq_files.iter().for_each(|file| { + seq_data.append(&mut read_fastx_file(file)); + }); + + let (sbwt, lcs) = sablast::build(&seq_data, sbwt_build_options); + + info!("Serializing SBWT index to {}.sbwt ...", output_prefix.as_ref().unwrap()); + info!("Serializing LCS array to {}.lcs ...", output_prefix.as_ref().unwrap()); + sablast::index::serialize_sbwt(output_prefix.as_ref().unwrap(), &sbwt, &lcs); + + }, Some(cli::Commands::Find { - query_files, - ref_file, - index_prefix, - num_threads, + query_files, + ref_file, + index_prefix, + num_threads, kmer_size, - prefix_precalc, - dedup_batches, - mem_gb, - temp_dir, - verbose, + prefix_precalc, + dedup_batches, + mem_gb, + temp_dir, + verbose, }) => { - init_log(if *verbose { 2 } else { 1 }); + init_log(if *verbose { 2 } else { 1 }); let mut sbwt_build_options = sablast::index::BuildOpts::default(); - sbwt_build_options.k = *kmer_size; - sbwt_build_options.num_threads = *num_threads; - sbwt_build_options.prefix_precalc = *prefix_precalc; - sbwt_build_options.dedup_batches = *dedup_batches; - sbwt_build_options.mem_gb = *mem_gb; - sbwt_build_options.temp_dir = temp_dir.clone(); - - let ((sbwt, lcs), ref_name) = if index_prefix.is_some() && !ref_file.is_some() { - info!("Loading SBWT index..."); - (sablast::index::load_sbwt(index_prefix.as_ref().unwrap()), index_prefix.as_ref().unwrap()) - } else if !index_prefix.is_some() && ref_file.is_some() { - info!("Building SBWT from file {}...", ref_file.as_ref().unwrap()); - let ref_data = read_fastx_file(ref_file.as_ref() .unwrap()); - (sablast::index::build_sbwt_from_vecs(&ref_data, &Some(sbwt_build_options)), ref_file.as_ref().unwrap()) - } else { - panic!("Ambiguous reference, supply only one of `-r/--reference` and `-i/--index`"); - }; - - rayon::ThreadPoolBuilder::new() - .num_threads(*num_threads) - .thread_name(|i| format!("rayon-thread-{}", i)) - .build_global() - .unwrap(); - - info!("Querying SBWT index..."); - println!("query\tref\tq.start\tq.end\tstrand\tlength\tmismatches\tin.contig"); - let stdout = std::io::stdout(); - query_files.par_iter().for_each(|file| { - - let mut reader = needletail::parse_fastx_file(file).expect("valid path/file"); - while let Some(rec) = reader.next() { - let seqrec = rec.expect("Valid fastX record"); - let contig = seqrec.id(); - let seq = seqrec.normalize(true); - - // Get local alignments for forward strand - let mut run_lengths: Vec<(usize, usize, char, usize, usize)> = sablast::find(&seq, &sbwt, &lcs).iter().map(|x| (x.0, x.1, '+', x.2 + x.3, x.3)).collect(); - - // Add local alignments for reverse _complement - run_lengths.append(&mut sablast::find(&seq.reverse_complement(), &sbwt, &lcs).iter().map(|x| (x.0, x.1, '-', x.2 + x.3, x.3)).collect()); - - // Sort by q.start - run_lengths.sort_by_key(|x| x.0); - - // Print results with query and ref name added - run_lengths.iter().for_each(|x| { - let _ = writeln!(&mut stdout.lock(), - "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}", - file, ref_name, x.0, x.1, x.2, x.3, x.4, std::str::from_utf8(contig).expect("UTF-8")); - }); - } - }); - }, + sbwt_build_options.k = *kmer_size; + sbwt_build_options.num_threads = *num_threads; + sbwt_build_options.prefix_precalc = *prefix_precalc; + sbwt_build_options.dedup_batches = *dedup_batches; + sbwt_build_options.mem_gb = *mem_gb; + sbwt_build_options.temp_dir = temp_dir.clone(); + + let ((sbwt, lcs), ref_name) = if index_prefix.is_some() && !ref_file.is_some() { + info!("Loading SBWT index..."); + (sablast::index::load_sbwt(index_prefix.as_ref().unwrap()), index_prefix.as_ref().unwrap()) + } else if !index_prefix.is_some() && ref_file.is_some() { + info!("Building SBWT from file {}...", ref_file.as_ref().unwrap()); + let ref_data = read_fastx_file(ref_file.as_ref() .unwrap()); + (sablast::index::build_sbwt_from_vecs(&ref_data, &Some(sbwt_build_options)), ref_file.as_ref().unwrap()) + } else { + panic!("Ambiguous reference, supply only one of `-r/--reference` and `-i/--index`"); + }; + + rayon::ThreadPoolBuilder::new() + .num_threads(*num_threads) + .thread_name(|i| format!("rayon-thread-{}", i)) + .build_global() + .unwrap(); + + info!("Querying SBWT index..."); + println!("query\tref\tq.start\tq.end\tstrand\tlength\tmismatches\tin.contig"); + let stdout = std::io::stdout(); + query_files.par_iter().for_each(|file| { + + let mut reader = needletail::parse_fastx_file(file).expect("valid path/file"); + while let Some(rec) = reader.next() { + let seqrec = rec.expect("Valid fastX record"); + let contig = seqrec.id(); + let seq = seqrec.normalize(true); + + // Get local alignments for forward strand + let mut run_lengths: Vec<(usize, usize, char, usize, usize)> = sablast::find(&seq, &sbwt, &lcs).iter().map(|x| (x.0, x.1, '+', x.2 + x.3, x.3)).collect(); + + // Add local alignments for reverse _complement + run_lengths.append(&mut sablast::find(&seq.reverse_complement(), &sbwt, &lcs).iter().map(|x| (x.0, x.1, '-', x.2 + x.3, x.3)).collect()); + + // Sort by q.start + run_lengths.sort_by_key(|x| x.0); + + // Print results with query and ref name added + run_lengths.iter().for_each(|x| { + let _ = writeln!(&mut stdout.lock(), + "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}", + file, ref_name, x.0, x.1, x.2, x.3, x.4, std::str::from_utf8(contig).expect("UTF-8")); + }); + } + }); + }, Some(cli::Commands::Map { - query_files, - ref_file, - num_threads, + query_files, + ref_file, + num_threads, kmer_size, - prefix_precalc, - dedup_batches, - mem_gb, - temp_dir, - verbose, + prefix_precalc, + dedup_batches, + mem_gb, + temp_dir, + verbose, }) => { - init_log(if *verbose { 2 } else { 1 }); + init_log(if *verbose { 2 } else { 1 }); let mut sbwt_build_options = sablast::index::BuildOpts::default(); - // These are required for the subcommand to work correctly - sbwt_build_options.add_revcomp = true; - sbwt_build_options.build_select = true; - // These can be adjusted - sbwt_build_options.k = *kmer_size; - sbwt_build_options.num_threads = *num_threads; - sbwt_build_options.prefix_precalc = *prefix_precalc; - sbwt_build_options.dedup_batches = *dedup_batches; - sbwt_build_options.mem_gb = *mem_gb; - sbwt_build_options.temp_dir = temp_dir.clone(); - - rayon::ThreadPoolBuilder::new() - .num_threads(*num_threads) - .thread_name(|i| format!("rayon-thread-{}", i)) - .build_global() - .unwrap(); - - let ref_data = read_fastx_file(ref_file); - - let stdout = std::io::stdout(); - query_files.par_iter().for_each(|query_file| { - let query_data = read_fastx_file(query_file); - let (sbwt, lcs) = sablast::index::build_sbwt_from_vecs(&query_data, &Some(sbwt_build_options.clone())); - - let mut res: Vec = Vec::new(); - ref_data.iter().for_each(|ref_contig| { - res.append(&mut sablast::map(ref_contig, &sbwt, &lcs)); - }); - - let _ = writeln!(&mut stdout.lock(), - ">{}\n{}", query_file, std::str::from_utf8(&res).expect("UTF-8")); - }); - }, - None => {} + // These are required for the subcommand to work correctly + sbwt_build_options.add_revcomp = true; + sbwt_build_options.build_select = true; + // These can be adjusted + sbwt_build_options.k = *kmer_size; + sbwt_build_options.num_threads = *num_threads; + sbwt_build_options.prefix_precalc = *prefix_precalc; + sbwt_build_options.dedup_batches = *dedup_batches; + sbwt_build_options.mem_gb = *mem_gb; + sbwt_build_options.temp_dir = temp_dir.clone(); + + rayon::ThreadPoolBuilder::new() + .num_threads(*num_threads) + .thread_name(|i| format!("rayon-thread-{}", i)) + .build_global() + .unwrap(); + + let ref_data = read_fastx_file(ref_file); + + let stdout = std::io::stdout(); + query_files.par_iter().for_each(|query_file| { + let query_data = read_fastx_file(query_file); + let (sbwt, lcs) = sablast::index::build_sbwt_from_vecs(&query_data, &Some(sbwt_build_options.clone())); + + let mut res: Vec = Vec::new(); + ref_data.iter().for_each(|ref_contig| { + res.append(&mut sablast::map(ref_contig, &sbwt, &lcs)); + }); + + let _ = writeln!(&mut stdout.lock(), + ">{}\n{}", query_file, std::str::from_utf8(&res).expect("UTF-8")); + }); + }, + None => {} } } From 8d55543ec9964a7c416b7d5f059dce80b5d1cc62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Thu, 31 Oct 2024 14:07:50 +0200 Subject: [PATCH 14/21] Remove hardcoded max_error_prob, add as an option. --- src/derandomize.rs | 26 ++++++++++++++++++++++++++ src/lib.rs | 18 ++++++++++++------ 2 files changed, 38 insertions(+), 6 deletions(-) diff --git a/src/derandomize.rs b/src/derandomize.rs index 5e9ab1c..764431b 100644 --- a/src/derandomize.rs +++ b/src/derandomize.rs @@ -15,6 +15,32 @@ use embed_doc_image::embed_doc_image; +/// Controls parameters of the derandomization algorithm. +/// +#[derive(Clone, Debug)] +#[non_exhaustive] +pub struct DerandomizeOpts { + /// Prefix match lengths with probability less than `max_error_prob` to + /// happen at random are considered noise. + pub max_error_prob: f64, +} + +impl Default for DerandomizeOpts { + /// Default to these values: + /// ```rust + /// let mut opts = sablast::derandomize::DerandomizeOpts::default(); + /// opts.max_error_prob = 0.0000001; + /// # let expected = sablast::derandomize::DerandomizeOpts::default(); + /// # assert_eq!(opts.max_error_prob, expected.max_error_prob); + /// ``` + /// + fn default() -> DerandomizeOpts { + DerandomizeOpts { + max_error_prob: 0.0000001, + } + } +} + /// Evaluates the CDF of _k_-bounded matching statistics random match distribution. /// /// Computes the log-probability that a matching statistic with value diff --git a/src/lib.rs b/src/lib.rs index 3378c62..1be7ce9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -88,6 +88,7 @@ pub fn build( /// use sablast::build; /// use sablast::matches; /// use sablast::index::BuildOpts; +/// use sablast::derandomize::DerandomizeOpts; /// /// let reference: Vec> = vec![vec![b'A',b'A',b'A',b'G',b'A',b'A',b'C',b'C',b'A',b'-',b'T',b'C',b'A',b'G',b'G',b'G',b'C',b'G']]; /// let mut opts = BuildOpts::default(); @@ -96,7 +97,7 @@ pub fn build( /// /// let query = vec![b'G',b'T',b'G',b'A',b'C',b'T',b'A',b'T',b'G',b'A',b'G',b'G',b'A',b'T']; /// -/// let ms_vectors = matches(&query, &sbwt, &lcs); +/// let ms_vectors = matches(&query, &sbwt, &lcs, DerandomizeOpts::default()); /// // `ms_vectors` has ['-','-','-','-','-','-','-','-','-','M','M','M','-','-'] /// # assert_eq!(ms_vectors, vec!['-','-','-','-','-','-','-','-','-','M','M','M','-','-']); /// ``` @@ -105,10 +106,11 @@ pub fn matches( query_seq: &[u8], sbwt: &SbwtIndexVariant, lcs: &sbwt::LcsArray, + derand_opts: derandomize::DerandomizeOpts, ) -> Vec { let (k, threshold) = match sbwt { SbwtIndexVariant::SubsetMatrix(ref sbwt) => { - (sbwt.k(), derandomize::random_match_threshold(sbwt.k(), sbwt.n_kmers(), 4_usize, 0.0000001_f64)) + (sbwt.k(), derandomize::random_match_threshold(sbwt.k(), sbwt.n_kmers(), 4_usize, derand_opts.max_error_prob)) }, }; @@ -132,6 +134,7 @@ pub fn matches( /// use sablast::build; /// use sablast::map; /// use sablast::index::BuildOpts; +/// use sablast::derandomize::DerandomizeOpts; /// /// let query: Vec> = vec![vec![b'A',b'A',b'A',b'G',b'A',b'A',b'C',b'C',b'A',b'-',b'T',b'C',b'A',b'G',b'G',b'G',b'C',b'G']]; /// let mut opts = BuildOpts::default(); @@ -141,7 +144,7 @@ pub fn matches( /// /// let reference = vec![b'G',b'T',b'G',b'A',b'C',b'T',b'A',b'T',b'G',b'A',b'G',b'G',b'A',b'T']; /// -/// let alignment = map(&reference, &sbwt_query, &lcs_query); +/// let alignment = map(&reference, &sbwt_query, &lcs_query, DerandomizeOpts::default()); /// // `ms_vectors` has [45,45,45,45,45,45,45,45,45,65,71,71,45,45] /// # assert_eq!(alignment, vec![45,45,45,45,45,45,45,45,45,65,71,71,45,45]); /// ``` @@ -150,10 +153,11 @@ pub fn map( ref_seq: &[u8], query_sbwt: &SbwtIndexVariant, query_lcs: &sbwt::LcsArray, + derand_opts: derandomize::DerandomizeOpts, ) -> Vec { let (k, threshold) = match query_sbwt { SbwtIndexVariant::SubsetMatrix(ref sbwt) => { - (sbwt.k(), derandomize::random_match_threshold(sbwt.k(), sbwt.n_kmers(), 4_usize, 0.0000001_f64)) + (sbwt.k(), derandomize::random_match_threshold(sbwt.k(), sbwt.n_kmers(), 4_usize, derand_opts.max_error_prob)) }, }; @@ -185,6 +189,7 @@ pub fn map( /// use sablast::build; /// use sablast::find; /// use sablast::index::BuildOpts; +/// use sablast::derandomize::DerandomizeOpts; /// /// let reference: Vec> = vec![vec![b'A',b'A',b'A',b'G',b'A',b'A',b'C',b'C',b'A',b'-',b'T',b'C',b'A',b'G',b'G',b'G',b'C',b'G']]; /// let mut opts = BuildOpts::default(); @@ -193,7 +198,7 @@ pub fn map( /// /// let query = vec![b'G',b'T',b'G',b'A',b'C',b'T',b'A',b'T',b'G',b'A',b'G',b'G',b'A',b'T']; /// -/// let local_alignments = find(&query, &sbwt, &lcs); +/// let local_alignments = find(&query, &sbwt, &lcs, DerandomizeOpts::default()); /// // `local_alignments` has [(10, 12, 3, 0)] /// # assert_eq!(local_alignments, vec![(10, 12, 3, 0)]); /// ``` @@ -202,7 +207,8 @@ pub fn find( query_seq: &[u8], sbwt: &SbwtIndexVariant, lcs: &sbwt::LcsArray, + derand_opts: derandomize::DerandomizeOpts, ) -> Vec<(usize, usize, usize, usize)> { - let aln = matches(query_seq, sbwt, lcs); + let aln = matches(query_seq, sbwt, lcs, derand_opts); format::run_lengths(&aln) } From 9f59cde7a05b02410fa5a05a00935e7352cfe6be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Thu, 31 Oct 2024 14:09:23 +0200 Subject: [PATCH 15/21] Tabs -> spaces --- src/cli/cli.rs | 78 +++++++++++++++++++++++++------------------------- 1 file changed, 39 insertions(+), 39 deletions(-) diff --git a/src/cli/cli.rs b/src/cli/cli.rs index 758f912..9e8cdbd 100644 --- a/src/cli/cli.rs +++ b/src/cli/cli.rs @@ -31,112 +31,112 @@ pub enum Commands { #[arg(group = "input", required = true)] seq_files: Vec, - // Outputs + // Outputs #[arg(short = 'o', long = "output-prefix", required = false, help_heading = "Output")] output_prefix: Option, - // Build parameters - // // k-mer size + // Build parameters + // // k-mer size #[arg(short = 'k', default_value_t = 31, help_heading = "Build options")] kmer_size: usize, - // // prefix precalc - #[arg(short = 'p', long = "prefix-precalc", default_value_t = 8, help_heading = "Build options")] + // // prefix precalc + #[arg(short = 'p', long = "prefix-precalc", default_value_t = 8, help_heading = "Build options")] prefix_precalc: usize, - // // deduplicate k-mer batches - #[arg(short = 'd', long = "dedup-batches", default_value_t = false, help_heading = "Build options")] + // // deduplicate k-mer batches + #[arg(short = 'd', long = "dedup-batches", default_value_t = false, help_heading = "Build options")] dedup_batches: bool, // Resources - // // Threads + // // Threads #[arg(short = 't', long = "threads", default_value_t = 1)] num_threads: usize, - // // Memory in GB + // // Memory in GB #[arg(short = 'm', long = "mem-gb", default_value_t = 4, help_heading = "Build options")] mem_gb: usize, - // // Temporary directory + // // Temporary directory #[arg(long = "temp-dir", required = false, help_heading = "Build options")] temp_dir: Option, - // Verbosity + // Verbosity #[arg(long = "verbose", default_value_t = false)] verbose: bool, }, // Find indexed k-mers in a query Find { - // Input fasta or fastq query file(s) + // Input fasta or fastq query file(s) #[arg(group = "input", required = true)] query_files: Vec, - // Reference - // // Sequence file + // Reference + // // Sequence file #[arg(short = 'r', long = "reference", group = "reference", help_heading = "Input")] ref_file: Option, - // // ... or a prebuilt index + // // ... or a prebuilt index #[arg(short = 'i', long = "index", group = "reference", help_heading = "Input")] index_prefix: Option, - // Resources - // // Threads + // Resources + // // Threads #[arg(short = 't', long = "threads", default_value_t = 1)] num_threads: usize, - // Build parameters - // // k-mer size + // Build parameters + // // k-mer size #[arg(short = 'k', default_value_t = 31, help_heading = "Build options")] kmer_size: usize, - // // prefix precalc - #[arg(short = 'p', long = "prefix-precalc", default_value_t = 8, help_heading = "Build options")] + // // prefix precalc + #[arg(short = 'p', long = "prefix-precalc", default_value_t = 8, help_heading = "Build options")] prefix_precalc: usize, - // // deduplicate k-mer batches - #[arg(short = 'd', long = "dedup-batches", default_value_t = false, help_heading = "Build options")] + // // deduplicate k-mer batches + #[arg(short = 'd', long = "dedup-batches", default_value_t = false, help_heading = "Build options")] dedup_batches: bool, - // // Memory in GB + // // Memory in GB #[arg(short = 'm', long = "mem-gb", default_value_t = 4, help_heading = "Build options")] mem_gb: usize, - // // Temporary directory + // // Temporary directory #[arg(long = "temp-dir", required = false, help_heading = "Build options")] temp_dir: Option, - // Verbosity + // Verbosity #[arg(long = "verbose", default_value_t = false)] verbose: bool, }, // Map a query or queries to a reference and return the alignment Map { - // Input fasta or fastq query file(s) + // Input fasta or fastq query file(s) #[arg(group = "input", required = true)] query_files: Vec, - // Reference fasta + // Reference fasta #[arg(short = 'r', long = "reference", required = true, help_heading = "Input")] ref_file: String, - // Resources - // // Threads + // Resources + // // Threads #[arg(short = 't', long = "threads", default_value_t = 1)] num_threads: usize, - // Build parameters - // // k-mer size + // Build parameters + // // k-mer size #[arg(short = 'k', default_value_t = 31, help_heading = "Build options")] kmer_size: usize, - // // prefix precalc - #[arg(short = 'p', long = "prefix-precalc", default_value_t = 8, help_heading = "Build options")] + // // prefix precalc + #[arg(short = 'p', long = "prefix-precalc", default_value_t = 8, help_heading = "Build options")] prefix_precalc: usize, - // // deduplicate k-mer batches - #[arg(short = 'd', long = "dedup-batches", default_value_t = false, help_heading = "Build options")] + // // deduplicate k-mer batches + #[arg(short = 'd', long = "dedup-batches", default_value_t = false, help_heading = "Build options")] dedup_batches: bool, - // // Memory in GB + // // Memory in GB #[arg(short = 'm', long = "mem-gb", default_value_t = 4, help_heading = "Build options")] mem_gb: usize, - // // Temporary directory + // // Temporary directory #[arg(long = "temp-dir", required = false, help_heading = "Build options")] temp_dir: Option, - // Verbosity + // Verbosity #[arg(long = "verbose", default_value_t = false)] verbose: bool, }, From be7cef4c3137a2bf50718c77cb2acd586d5a3759 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Thu, 31 Oct 2024 14:15:08 +0200 Subject: [PATCH 16/21] Add max_error_prob to CLI. --- src/cli/cli.rs | 10 ++++++++++ src/cli/main.rs | 14 +++++++++++--- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/src/cli/cli.rs b/src/cli/cli.rs index 9e8cdbd..7a7d5be 100644 --- a/src/cli/cli.rs +++ b/src/cli/cli.rs @@ -76,6 +76,11 @@ pub enum Commands { #[arg(short = 'i', long = "index", group = "reference", help_heading = "Input")] index_prefix: Option, + // Parameters + // // Upper bound for random match probability + #[arg(long = "max-error-prob", default_value_t = 0.0000001, help_heading = "Algorithm")] + max_error_prob: f64, + // Resources // // Threads #[arg(short = 't', long = "threads", default_value_t = 1)] @@ -114,6 +119,11 @@ pub enum Commands { #[arg(short = 'r', long = "reference", required = true, help_heading = "Input")] ref_file: String, + // Parameters + // // Upper bound for random match probability + #[arg(long = "max-error-prob", default_value_t = 0.0000001, help_heading = "Algorithm")] + max_error_prob: f64, + // Resources // // Threads #[arg(short = 't', long = "threads", default_value_t = 1)] diff --git a/src/cli/main.rs b/src/cli/main.rs index f548885..fdf4b3d 100644 --- a/src/cli/main.rs +++ b/src/cli/main.rs @@ -111,6 +111,7 @@ fn main() { query_files, ref_file, index_prefix, + max_error_prob, num_threads, kmer_size, prefix_precalc, @@ -128,6 +129,9 @@ fn main() { sbwt_build_options.mem_gb = *mem_gb; sbwt_build_options.temp_dir = temp_dir.clone(); + let mut derand_opts = sablast::derandomize::DerandomizeOpts::default(); + derand_opts.max_error_prob = *max_error_prob; + let ((sbwt, lcs), ref_name) = if index_prefix.is_some() && !ref_file.is_some() { info!("Loading SBWT index..."); (sablast::index::load_sbwt(index_prefix.as_ref().unwrap()), index_prefix.as_ref().unwrap()) @@ -157,10 +161,10 @@ fn main() { let seq = seqrec.normalize(true); // Get local alignments for forward strand - let mut run_lengths: Vec<(usize, usize, char, usize, usize)> = sablast::find(&seq, &sbwt, &lcs).iter().map(|x| (x.0, x.1, '+', x.2 + x.3, x.3)).collect(); + let mut run_lengths: Vec<(usize, usize, char, usize, usize)> = sablast::find(&seq, &sbwt, &lcs, derand_opts.clone()).iter().map(|x| (x.0, x.1, '+', x.2 + x.3, x.3)).collect(); // Add local alignments for reverse _complement - run_lengths.append(&mut sablast::find(&seq.reverse_complement(), &sbwt, &lcs).iter().map(|x| (x.0, x.1, '-', x.2 + x.3, x.3)).collect()); + run_lengths.append(&mut sablast::find(&seq.reverse_complement(), &sbwt, &lcs, derand_opts.clone()).iter().map(|x| (x.0, x.1, '-', x.2 + x.3, x.3)).collect()); // Sort by q.start run_lengths.sort_by_key(|x| x.0); @@ -177,6 +181,7 @@ fn main() { Some(cli::Commands::Map { query_files, ref_file, + max_error_prob, num_threads, kmer_size, prefix_precalc, @@ -198,6 +203,9 @@ fn main() { sbwt_build_options.mem_gb = *mem_gb; sbwt_build_options.temp_dir = temp_dir.clone(); + let mut derand_opts = sablast::derandomize::DerandomizeOpts::default(); + derand_opts.max_error_prob = *max_error_prob; + rayon::ThreadPoolBuilder::new() .num_threads(*num_threads) .thread_name(|i| format!("rayon-thread-{}", i)) @@ -213,7 +221,7 @@ fn main() { let mut res: Vec = Vec::new(); ref_data.iter().for_each(|ref_contig| { - res.append(&mut sablast::map(ref_contig, &sbwt, &lcs)); + res.append(&mut sablast::map(ref_contig, &sbwt, &lcs, derand_opts.clone())); }); let _ = writeln!(&mut stdout.lock(), From 2a54855367e85bb6a9525ea45dd711417fc08d1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Thu, 31 Oct 2024 14:16:42 +0200 Subject: [PATCH 17/21] Less is more in documentation. --- src/derandomize.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/derandomize.rs b/src/derandomize.rs index 764431b..af68f6c 100644 --- a/src/derandomize.rs +++ b/src/derandomize.rs @@ -20,7 +20,7 @@ use embed_doc_image::embed_doc_image; #[derive(Clone, Debug)] #[non_exhaustive] pub struct DerandomizeOpts { - /// Prefix match lengths with probability less than `max_error_prob` to + /// Prefix match lengths with probability higher than `max_error_prob` to /// happen at random are considered noise. pub max_error_prob: f64, } From 7b9d47bd4971a08794f521d9e547e852bf64210e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Thu, 31 Oct 2024 14:17:23 +0200 Subject: [PATCH 18/21] Placeholder documentation for crate. --- src/lib.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/lib.rs b/src/lib.rs index 1be7ce9..798cd94 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -11,6 +11,8 @@ // the MIT license, or , // at your option. // +//! Crate documentation +//! #![warn(missing_docs, missing_debug_implementations, missing_copy_implementations, trivial_casts, trivial_numeric_casts, From 2100b40ce7f03279fb95c673695d4080788456c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Thu, 31 Oct 2024 14:39:09 +0200 Subject: [PATCH 19/21] Bump version --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index d423f32..d856535 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "sablast" -version = "0.2.0" +version = "0.3.0" edition = "2021" rust-version = "1.77.0" authors = ["Tommi Mäklin "] From dabc1abc687c977524d0e526bdc73fdd7ad23d43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Thu, 31 Oct 2024 14:43:18 +0200 Subject: [PATCH 20/21] Build CLI in deployment scripts. --- .github/deploy/build_aarch64-apple-darwin.sh | 2 +- .github/deploy/build_x86_64-apple-darwin.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/deploy/build_aarch64-apple-darwin.sh b/.github/deploy/build_aarch64-apple-darwin.sh index 964f0d0..dd4af11 100644 --- a/.github/deploy/build_aarch64-apple-darwin.sh +++ b/.github/deploy/build_aarch64-apple-darwin.sh @@ -32,7 +32,7 @@ echo "linker = \"aarch64-apple-darwin22-gcc\"" >> .cargo/config.toml export CC="aarch64-apple-darwin22-gcc" export CXX="aarch64-apple-darwin22-g++" -RUSTFLAGS='-L /osxcross/SDK/MacOSX13.0.sdk/usr/lib' cargo build --release --target aarch64-apple-darwin +RUSTFLAGS='-L /osxcross/SDK/MacOSX13.0.sdk/usr/lib' cargo build --all-features --release --target aarch64-apple-darwin ## gather the stuff to distribute target=sablast-candidate-aarch64-apple-darwin diff --git a/.github/deploy/build_x86_64-apple-darwin.sh b/.github/deploy/build_x86_64-apple-darwin.sh index 346ffff..817c7af 100644 --- a/.github/deploy/build_x86_64-apple-darwin.sh +++ b/.github/deploy/build_x86_64-apple-darwin.sh @@ -32,7 +32,7 @@ echo "linker = \"x86_64-apple-darwin22-gcc\"" >> .cargo/config.toml export CC="x86_64-apple-darwin22-gcc" export CXX="x86_64-apple-darwin22-g++" -RUSTFLAGS='-L /osxcross/SDK/MacOSX13.0.sdk/usr/lib' cargo build --release --target x86_64-apple-darwin +RUSTFLAGS='-L /osxcross/SDK/MacOSX13.0.sdk/usr/lib' cargo build --all-features --release --target x86_64-apple-darwin ## gather the stuff to distribute target=sablast-candidate-x86_64-apple-darwin From 43e0335a1eddb8170178cea05fc20cb2247413ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Thu, 31 Oct 2024 14:52:06 +0200 Subject: [PATCH 21/21] Fix sha256sum path --- .github/deploy/build_aarch64-apple-darwin.sh | 4 ++-- .github/deploy/build_x86_64-apple-darwin.sh | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/deploy/build_aarch64-apple-darwin.sh b/.github/deploy/build_aarch64-apple-darwin.sh index dd4af11..a8c99c5 100644 --- a/.github/deploy/build_aarch64-apple-darwin.sh +++ b/.github/deploy/build_aarch64-apple-darwin.sh @@ -45,7 +45,7 @@ cp LICENSE-APACHE $path/ cp LICENSE-MIT $path/ cd /io/tmp tar -zcvf $target.tar.gz $target -sha256sum $target.tar.gz > $target".sha256sum" +sha256sum $target.tar.gz > $target".tar.gz.sha256sum" mv $target.tar.gz /io/ -mv $target".sha256sum" /io/ +mv $target".tar.gz.sha256sum" /io/ cd /io/ diff --git a/.github/deploy/build_x86_64-apple-darwin.sh b/.github/deploy/build_x86_64-apple-darwin.sh index 817c7af..d90e662 100644 --- a/.github/deploy/build_x86_64-apple-darwin.sh +++ b/.github/deploy/build_x86_64-apple-darwin.sh @@ -45,7 +45,7 @@ cp LICENSE-APACHE $path/ cp LICENSE-MIT $path/ cd /io/tmp tar -zcvf $target.tar.gz $target -sha256sum $target.tar.gz > $target".sha256sum" +sha256sum $target.tar.gz > $target".tar.gz.sha256sum" mv $target.tar.gz /io/ -mv $target".sha256sum" /io/ +mv $target".tar.gz.sha256sum" /io/ cd /io/