From d1e7eb31ab5ad7774d08a9d85332559466b3f117 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Sat, 21 Sep 2024 19:39:46 +0300 Subject: [PATCH 001/103] Need rust >= v1.77.0 to build sbwt --- Cargo.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/Cargo.toml b/Cargo.toml index 9f5104f..86afc89 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,6 +2,7 @@ name = "sablast" version = "0.1.0" edition = "2021" +rust-version = "1.77.0" authors = ["Tommi Mäklin "] description = "Spectral Burrows-Wheeler transform accelerated local alignment search" readme = "README.md" From a92fd0e163b25ee46eb6f37fd654b8b5ea5dc299 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Sat, 21 Sep 2024 20:52:53 +0300 Subject: [PATCH 002/103] Add integration test for sablast::map. --- tests/data/NZ_CP058217.1_clbS.fna.gz | Bin 0 -> 400 bytes tests/data/clbS.fna.gz | Bin 0 -> 283 bytes tests/map_clbs.rs | 9 +++++++++ 3 files changed, 9 insertions(+) create mode 100644 tests/data/NZ_CP058217.1_clbS.fna.gz create mode 100644 tests/data/clbS.fna.gz create mode 100644 tests/map_clbs.rs diff --git a/tests/data/NZ_CP058217.1_clbS.fna.gz b/tests/data/NZ_CP058217.1_clbS.fna.gz new file mode 100644 index 0000000000000000000000000000000000000000..91d73020e7cba8765982d2000f8c1d4c9d07a9b7 GIT binary patch literal 400 zcmV;B0dM{viwFqC0`F!515R3BLr^d^I5IIeE-_zYY+_R`W^Q2svh$0IcMdQxwJ<~jK@KIMSUFO8YHW-N z$egn1BC0LlNu!@l0!NiFGz~2>VM8V*bYo2*3^fG8GJ{176bB)ZE^DI~K#;Sb02c*e z(6fpH0>OpQqL|wTYi^aWTZ>Xq$DE+bVp2H62`#O*Xi4c(MO>a)jHM&lKy(1gvnYDD zL_w*56cS23XN6-3)M7JPBuW%Rj0sjzI;I6EKpgI6&2*KNFsWfMNnFJ=U@MIvAZ44J uHY2G)1+lp`w<4ZgO%*L5U^xV`l{sAJBPvX~UkJJ8bVZPAj_rHY)&Vk{lW2BHH{02W16TjKLr z1*DKrsPEHOojR#w41OX}AM4OS+ hpn}-kx)TAAtEr-|fPm!?$W~_v-xn3`B8+kZ006^ecntsm literal 0 HcmV?d00001 diff --git a/tests/map_clbs.rs b/tests/map_clbs.rs new file mode 100644 index 0000000..fb3543f --- /dev/null +++ b/tests/map_clbs.rs @@ -0,0 +1,9 @@ +#[test] +fn map_nissle_against_clbs() { + let (sbwt, lcs) = sablast::index::build_sbwt(&"tests/data/clbS.fna.gz".to_string(), &None); + + let expected = vec![(455, 967, 512, 1, '+'),(996, 1001, 4, 2, '+'),(998, 1000, 3, 0, '-')]; + let got = sablast::map(&"tests/data/NZ_CP058217.1_clbS.fna.gz".to_string(), &sbwt::SbwtIndexVariant::SubsetMatrix(sbwt), &lcs.unwrap()); + + assert_eq!(got, expected); +} From 240776cd6f9af460d4d65107fc136ae7059addb2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Sun, 22 Sep 2024 14:07:50 +0300 Subject: [PATCH 003/103] Remove overlapping cases from ms_to_run, write documentation. --- src/map.rs | 54 +++++++++++++++++++++++++++++++++--------------------- 1 file changed, 33 insertions(+), 21 deletions(-) diff --git a/src/map.rs b/src/map.rs index 4447d07..aa10197 100644 --- a/src/map.rs +++ b/src/map.rs @@ -40,30 +40,42 @@ pub fn random_match_threshold( return k; } -fn ms_to_run( - curr: usize, - next: usize, +/// Returns the derandomized _k_-bounded matching statistic (a "run"). +/// +/// Derandomizes the `current_ms` matching statistic (MS) based on the +/// `next_run` value obtained from the output of this function for the +/// next MS when read left-to-right, the _k_-mer size `k`, and the +/// `threshold` which specifies a lower bound to consider the MS a +/// non-random match. +/// +/// Positive values of the output i64 value mean that i64 bases from +/// the beginning of the k-mer match the reference, ie. same as the +/// MS, while negative values denote distance from the last base in +/// the last _k_-mer that produced a match. +/// +/// # Examples +/// +/// TODO Add examples to ms_to_run documentation +/// +pub fn ms_to_run( + current_ms: usize, next_run: i64, threshold: usize, k: usize, ) -> i64 { - let run: i64 = if curr == k && next == k { - k as i64 - } else if curr == k && next_run == 1 { - k as i64 - } else if curr == k && next_run < 0 { - k as i64 - } else if curr < threshold { - next_run - 1 - } else if curr > threshold && next_run <= 0 { - curr as i64 - } else if curr > threshold && next_run == 1 { - curr as i64 - } else if curr > threshold && next_run < curr as i64 { - curr as i64 - } else { - next_run - 1 - }; + // Default is to decrease MS by 1. + let mut run: i64 = next_run - 1; + + if current_ms == k { + // Beginning of a full k-mer match + run = k as i64; + } + + if current_ms > threshold && next_run < current_ms as i64 { + // Beginning of a partial k-mer match + // Only works if threshold > 1 + run = current_ms as i64; + } return run; } @@ -123,7 +135,7 @@ pub fn derandomize_ms( // Traverse the matching statistics in reverse runs[len - 1] = ms[len - 1] as i64; for i in 2..len { - runs[len - i] = ms_to_run(ms[len - i], ms[len - i + 1], runs[len - i + 1], params.threshold, params.k); + runs[len - i] = ms_to_run(ms[len - i], runs[len - i + 1], params.threshold, params.k); } return runs; From 972f1d8e592145352d92dd66d4f7730a48d3f2ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Sun, 22 Sep 2024 15:05:52 +0300 Subject: [PATCH 004/103] Add docs to random_match_threshold --- src/map.rs | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/src/map.rs b/src/map.rs index aa10197..6ab6a64 100644 --- a/src/map.rs +++ b/src/map.rs @@ -26,6 +26,26 @@ fn log_rm_max_cdf( n_kmers as f64 * (- ((1.0_f64.ln() - (alphabet_size as f64).ln()).exp()).powi(t as i32 + 1)).ln_1p() } +/// Determines a lower bound for non-random matching statistic values. +/// +/// Computes the probabilities that the possible values for the +/// _k_-bounded matching statistics (MS) of a _k_-mer with size `k` +/// mapped against an index with `n_kmers` total _k_-mers and +/// `alphabet_size` possible values at each character are random +/// matches. Computation terminates when the MS value that produces a +/// random match probability below `max_error_prob` is found and +/// returned. +/// +/// If no MS value passes the check, the function returns `k` instead. +/// +/// # Examples +/// TODO Add examples to random_match_threshold documentation +/// +/// # Distribution of random matches in _k_-bounded matching statistics +/// TODO Add the maths +/// +/// Credit to Jarno N. Alanko for calculating the random match distribution. +/// pub fn random_match_threshold( k: usize, n_kmers: usize, From 4895729324234c7fb35f4127468efad0aabac082 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Sun, 22 Sep 2024 15:06:10 +0300 Subject: [PATCH 005/103] Add bounds checking --- src/map.rs | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/map.rs b/src/map.rs index 6ab6a64..5207091 100644 --- a/src/map.rs +++ b/src/map.rs @@ -23,6 +23,9 @@ fn log_rm_max_cdf( alphabet_size: usize, n_kmers: usize, ) -> f64 { + assert!(n_kmers > 0); + assert!(alphabet_size > 0); + n_kmers as f64 * (- ((1.0_f64.ln() - (alphabet_size as f64).ln()).exp()).powi(t as i32 + 1)).ln_1p() } @@ -52,6 +55,12 @@ pub fn random_match_threshold( alphabet_size: usize, max_error_prob: f64, ) -> usize { + assert!(k > 0); + assert!(n_kmers > 0); + assert!(alphabet_size > 0); + assert!(max_error_prob <= 1 as f64); + assert!(max_error_prob > 0 as f64); + for i in 1..k { if log_rm_max_cdf(i, alphabet_size, n_kmers) > (-max_error_prob).ln_1p() { return i; @@ -83,6 +92,10 @@ pub fn ms_to_run( threshold: usize, k: usize, ) -> i64 { + assert!(threshold > 1); + assert!(current_ms <= k); + assert!(next_run <= k as i64); + // Default is to decrease MS by 1. let mut run: i64 = next_run - 1; From 09f1fc053d1fcb83eba63be7c969502e94b2a114 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Sun, 22 Sep 2024 15:06:39 +0300 Subject: [PATCH 006/103] Characters -> bases in documentation. --- src/map.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/map.rs b/src/map.rs index 5207091..ca582a4 100644 --- a/src/map.rs +++ b/src/map.rs @@ -77,10 +77,10 @@ pub fn random_match_threshold( /// `threshold` which specifies a lower bound to consider the MS a /// non-random match. /// -/// Positive values of the output i64 value mean that i64 bases from -/// the beginning of the k-mer match the reference, ie. same as the -/// MS, while negative values denote distance from the last base in -/// the last _k_-mer that produced a match. +/// Positive values of the output i64 value mean that i64 characters +/// from the beginning of the k-mer match the reference, ie. same as +/// the MS, while negative values denote distance from the last +/// character in the last _k_-mer that produced a match. /// /// # Examples /// From ff2847b58b24c1ddea16c23f57250bdd4e41880a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Sun, 22 Sep 2024 15:08:15 +0300 Subject: [PATCH 007/103] Add k-bounded. --- src/map.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/map.rs b/src/map.rs index ca582a4..516dcb6 100644 --- a/src/map.rs +++ b/src/map.rs @@ -29,7 +29,7 @@ fn log_rm_max_cdf( n_kmers as f64 * (- ((1.0_f64.ln() - (alphabet_size as f64).ln()).exp()).powi(t as i32 + 1)).ln_1p() } -/// Determines a lower bound for non-random matching statistic values. +/// Determines a lower bound for non-random _k_-bounded matching statistic values. /// /// Computes the probabilities that the possible values for the /// _k_-bounded matching statistics (MS) of a _k_-mer with size `k` From 24efa30f3916f3abaac40c5dbca2afc6f7dc3b15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Sun, 22 Sep 2024 19:06:20 +0300 Subject: [PATCH 008/103] Clarify documentation --- src/map.rs | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/src/map.rs b/src/map.rs index 516dcb6..3a77e02 100644 --- a/src/map.rs +++ b/src/map.rs @@ -18,7 +18,22 @@ pub struct TranslateParams { pub threshold: usize, } -fn log_rm_max_cdf( +/// Evaluates the CDF of _k_-bounded matching statistics random match distribution. +/// +/// Computes the log-probability that a matching statistic with value +/// `t` or less that is the result of mapping a _k_-mer with +/// `alphabet_size` possible characters against an index containing +/// `n_kmers` _k_-mers was generated by chance. +/// +/// # Examples +/// TODO Add examples to log_rm_max_cdf +/// +/// # Distribution of random matches in _k_-bounded matching statistics +/// TODO Add the maths +/// +/// Credit to Jarno N. Alanko for deriving the random match distribution. +/// +pub fn log_rm_max_cdf( t: usize, alphabet_size: usize, n_kmers: usize, @@ -44,11 +59,6 @@ fn log_rm_max_cdf( /// # Examples /// TODO Add examples to random_match_threshold documentation /// -/// # Distribution of random matches in _k_-bounded matching statistics -/// TODO Add the maths -/// -/// Credit to Jarno N. Alanko for calculating the random match distribution. -/// pub fn random_match_threshold( k: usize, n_kmers: usize, @@ -69,12 +79,12 @@ pub fn random_match_threshold( return k; } -/// Returns the derandomized _k_-bounded matching statistic (a "run"). +/// Derandomizes a noisy _k_-bounded matching statistic. /// /// Derandomizes the `current_ms` matching statistic (MS) based on the /// `next_run` value obtained from the output of this function for the -/// next MS when read left-to-right, the _k_-mer size `k`, and the -/// `threshold` which specifies a lower bound to consider the MS a +/// next noisy MS when read left-to-right, the _k_-mer size `k`, and +/// the `threshold` which specifies a lower bound to consider the MS a /// non-random match. /// /// Positive values of the output i64 value mean that i64 characters From 529f9c29763d3ee519ed7297a637a2bea94e094f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Sun, 22 Sep 2024 19:10:51 +0300 Subject: [PATCH 009/103] Merge run_to_aln and translate_runs. --- src/map.rs | 86 +++++++++++++++++++++++++----------------------------- 1 file changed, 39 insertions(+), 47 deletions(-) diff --git a/src/map.rs b/src/map.rs index 3a77e02..6d0b7e8 100644 --- a/src/map.rs +++ b/src/map.rs @@ -123,49 +123,6 @@ pub fn ms_to_run( return run; } -fn run_to_aln( - runs: &Vec, - curr_ms: usize, - threshold: usize, - k: usize, - res: &mut Vec, - pos: &mut usize, -) { - let prev: i64 = runs[*pos - 1]; - let curr: i64 = runs[*pos]; - let next: i64 = runs[*pos + 1]; - - if curr == k as i64 && next == k as i64 { - res[*pos] = 'M'; - } else if curr > threshold as i64 && (next > 0 && next < threshold as i64) { - res[*pos] = 'R'; - res[*pos + 1] = 'R'; - } else if next == 1 && curr == curr_ms as i64 { - res[*pos] = 'M'; - } else if curr > threshold as i64 { - res[*pos] = 'M'; - } else if curr == next - 1 && curr > 0 { - res[*pos] = 'M'; - } else if curr == 0 && next == 1 && prev > 0 { - res[*pos] = 'X'; - res[*pos - 1] = 'M'; - } else if curr == 0 && next == 1 && prev == -1 { - let mut next_gap: usize = pos.clone(); - let mut gap_len: usize = 0; - while runs[next_gap - 1] < 0 && next_gap > 1 { - gap_len += 1; - next_gap -= 1; - } - // TODO Determine what counts as an insertion or gap in run_to_aln - while *pos < *pos + gap_len && *pos < runs.len() { - res[*pos] = if gap_len > 29 { '-' } else { 'I' }; - *pos += 1; - } - } else { - res[*pos] = ' '; - }; -} - pub fn derandomize_ms( ms: &Vec, params_in: &Option @@ -191,15 +148,50 @@ pub fn translate_runs( params_in: &Option, ) -> Vec { let params = params_in.clone().unwrap(); + let threshold = params.threshold; + let k = params.k; let len = runs.len(); - let mut aln = vec![' '; len]; + let mut res = vec![' '; len]; // Traverse the runs - for mut i in 3..(len - 1) { - run_to_aln(&runs, ms[i], params.threshold, params.k, &mut aln, &mut i); + for mut pos in 3..(len - 1) { + let prev: i64 = runs[pos - 1]; + let curr: i64 = runs[pos]; + let next: i64 = runs[pos + 1]; + let curr_ms = ms[pos]; + + if curr == k as i64 && next == k as i64 { + res[pos] = 'M'; + } else if curr > threshold as i64 && (next > 0 && next < threshold as i64) { + res[pos] = 'R'; + res[pos + 1] = 'R'; + } else if next == 1 && curr == curr_ms as i64 { + res[pos] = 'M'; + } else if curr > threshold as i64 { + res[pos] = 'M'; + } else if curr == next - 1 && curr > 0 { + res[pos] = 'M'; + } else if curr == 0 && next == 1 && prev > 0 { + res[pos] = 'X'; + res[pos - 1] = 'M'; + } else if curr == 0 && next == 1 && prev == -1 { + let mut next_gap: usize = pos.clone(); + let mut gap_len: usize = 0; + while runs[next_gap - 1] < 0 && next_gap > 1 { + gap_len += 1; + next_gap -= 1; + } + // TODO Determine what counts as an insertion or gap in run_to_aln + while pos < pos + gap_len && pos < runs.len() { + res[pos] = if gap_len > 29 { '-' } else { 'I' }; + pos += 1; + } + } else { + res[pos] = ' '; + }; } - return aln; + return res; } pub fn run_lengths( From 280e4a351b9fdaed0538ef8222e38c3935da6af1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Sun, 22 Sep 2024 20:01:44 +0300 Subject: [PATCH 010/103] Update test values. --- tests/map_clbs.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/map_clbs.rs b/tests/map_clbs.rs index fb3543f..adfc42a 100644 --- a/tests/map_clbs.rs +++ b/tests/map_clbs.rs @@ -2,7 +2,7 @@ fn map_nissle_against_clbs() { let (sbwt, lcs) = sablast::index::build_sbwt(&"tests/data/clbS.fna.gz".to_string(), &None); - let expected = vec![(455, 967, 512, 1, '+'),(996, 1001, 4, 2, '+'),(998, 1000, 3, 0, '-')]; + let expected = vec![(455, 967, 512, 1, '+'),(997, 1001, 5, 0, '+'),(998, 1001, 4, 0, '-')]; let got = sablast::map(&"tests/data/NZ_CP058217.1_clbS.fna.gz".to_string(), &sbwt::SbwtIndexVariant::SubsetMatrix(sbwt), &lcs.unwrap()); assert_eq!(got, expected); From 9b66150621167c97384b284cf48341a0941802b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Sun, 22 Sep 2024 20:02:31 +0300 Subject: [PATCH 011/103] Rewrite translate_runs and re-enable its test. --- src/map.rs | 87 +++++++++++++++++++++++++++++------------------------- 1 file changed, 47 insertions(+), 40 deletions(-) diff --git a/src/map.rs b/src/map.rs index 6d0b7e8..1ad039f 100644 --- a/src/map.rs +++ b/src/map.rs @@ -141,7 +141,7 @@ pub fn derandomize_ms( return runs; } - +/// Converts a derandomized k-bounded matching statistics vec to an alignment vec. pub fn translate_runs( ms: &Vec, runs: &Vec, @@ -150,45 +150,53 @@ pub fn translate_runs( let params = params_in.clone().unwrap(); let threshold = params.threshold; let k = params.k; + + assert!(k > 0); + assert!(threshold > 1); + assert!(ms.len() == runs.len()); + assert!(runs.len() > 2); + let len = runs.len(); let mut res = vec![' '; len]; // Traverse the runs - for mut pos in 3..(len - 1) { - let prev: i64 = runs[pos - 1]; + for mut pos in 0..len { + let prev: i64 = if pos > 1 { runs[pos - 1] } else { 31 }; let curr: i64 = runs[pos]; - let next: i64 = runs[pos + 1]; + let next: i64 = if pos < len - 1 { runs[pos + 1] } else { runs[pos] }; let curr_ms = ms[pos]; - if curr == k as i64 && next == k as i64 { - res[pos] = 'M'; - } else if curr > threshold as i64 && (next > 0 && next < threshold as i64) { - res[pos] = 'R'; - res[pos + 1] = 'R'; - } else if next == 1 && curr == curr_ms as i64 { - res[pos] = 'M'; - } else if curr > threshold as i64 { - res[pos] = 'M'; - } else if curr == next - 1 && curr > 0 { - res[pos] = 'M'; - } else if curr == 0 && next == 1 && prev > 0 { - res[pos] = 'X'; - res[pos - 1] = 'M'; - } else if curr == 0 && next == 1 && prev == -1 { - let mut next_gap: usize = pos.clone(); - let mut gap_len: usize = 0; - while runs[next_gap - 1] < 0 && next_gap > 1 { - gap_len += 1; - next_gap -= 1; - } - // TODO Determine what counts as an insertion or gap in run_to_aln - while pos < pos + gap_len && pos < runs.len() { - res[pos] = if gap_len > 29 { '-' } else { 'I' }; - pos += 1; + let mut aln_curr = res[pos]; + let mut aln_next = if pos + 1 < len - 1 { res[pos + 1] } else { 'M' }; + + if curr > threshold as i64 && next > 0 && next < threshold as i64 { + // Current position is first character in a jump to another k-mer, + // or there is deletion of unknown length in the query wrt. the reference. + // + // Use two consecutive 'R's to denote breakpoint between two k-mers + aln_curr = 'R'; + aln_next = 'R'; + } else if curr <= 0 { + // Start of a mismatch region + if next == 1 && prev > 0 { + // Mismatched character or insertion of 1 character in the query. + // + // Use 'X' for mismatch or 1 character insert + aln_curr = 'X'; + } else { + // Insertion of more than 1 characters in the query + // + // Use '-' to denote inserts of more than 1 characters + aln_curr = '-'; } } else { - res[pos] = ' '; - }; + // Other values are always a match, use 'M' for these + aln_curr = 'M'; + } + res[pos] = aln_curr; + if pos + 1 < len - 1 { + res[pos + 1] = aln_next; + } } return res; @@ -259,15 +267,14 @@ mod tests { assert_eq!(got, expected); } - // TODO Fix test for translate_runs - // #[test] - // fn translate_runs() { - // let expected = vec!['-','-','-','-','-','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M']; - // let input_ms: Vec = vec![1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,11,11,12,11,10,11,11,12,11,12,10,11,12,12,10,11,11,11,11,11,11,10,11,11,12,13,11,12,13,14,15,16,13,14,15,16,12,12,13,14,15,16,17,18,19,20,21,22,12,10,10,11,12,11,10,11,12,11,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,13,14,15,12,12,10,11,11,11,12,13,13,14,15,11,11,11,11,11,11,12,13,14,11,11,11,11,12,13,12,12,12,12,13,12,13,14,12,13,11,12,12,11,12,11,12,13,14,14,13,14,15,15,16,17,18,19,19,19,20,21,22,12,13,11,11,12,12,13,14,15,16,17,18,19,20,21,22,10,11,9,10,10,11,11,12,11,11,12,13,13,14,12,11,11,12,13,12,13,12,12,12,12,13,11,12,12,10,11,11,10,11,11,12,10,9,10,10,10,11,12,10,9,10,10,10,11,10,11,12,10,8,9,10,9,9,10,9,10,10,10,11,12,13,14,15,16,17,13,11,11,11,12,11,11,12,12,11,11,12,12,13,14,15,11,12,10,11,9,10,11,11,11,11,11,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,11,12,12,13,11,11,12,13,14,13,11,11,12,13,14,15,16,17,18,19,20,21,11,12,11,11,12,11,12,12,12,12,11,10,11,12,11,11,12,13,12,12,11,12,13,13,13,11,11,12,11,12,13,12,13,14,15,16,17,18,19,20,21,11,12,13,9,10,11,10,10,10,11,12,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27]; - // let input_runs: Vec = vec![0,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,-47,-46,-45,-44,-43,-42,-41,-40,-39,-38,-37,-36,-35,-34,-33,-32,-31,-30,-29,-28,-27,-26,-25,-24,-23,-22,-21,-20,-19,-18,-17,-16,-15,-14,-13,-12,-11,-10,-9,-8,-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,-164,-163,-162,-161,-160,-159,-158,-157,-156,-155,-154,-153,-152,-151,-150,-149,-148,-147,-146,-145,-144,-143,-142,-141,-140,-139,-138,-137,-136,-135,-134,-133,-132,-131,-130,-129,-128,-127,-126,-125,-124,-123,-122,-121,-120,-119,-118,-117,-116,-115,-114,-113,-112,-111,-110,-109,-108,-107,-106,-105,-104,-103,-102,-101,-100,-99,-98,-97,-96,-95,-94,-93,-92,-91,-90,-89,-88,-87,-86,-85,-84,-83,-82,-81,-80,-79,-78,-77,-76,-75,-74,-73,-72,-71,-70,-69,-68,-67,-66,-65,-64,-63,-62,-61,-60,-59,-58,-57,-56,-55,-54,-53,-52,-51,-50,-49,-48,-47,-46,-45,-44,-43,-42,-41,-40,-39,-38,-37,-36,-35,-34,-33,-32,-31,-30,-29,-28,-27,-26,-25,-24,-23,-22,-21,-20,-19,-18,-17,-16,-15,-14,-13,-12,-11,-10,-9,-8,-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,-63,-62,-61,-60,-59,-58,-57,-56,-55,-54,-53,-52,-51,-50,-49,-48,-47,-46,-45,-44,-43,-42,-41,-40,-39,-38,-37,-36,-35,-34,-33,-32,-31,-30,-29,-28,-27,-26,-25,-24,-23,-22,-21,-20,-19,-18,-17,-16,-15,-14,-13,-12,-11,-10,-9,-8,-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27]; - // let got = super::translate_runs(&input_ms, &input_runs, &Some(super::TranslateParams{ k: 31, threshold: 22 })); - // assert_eq!(got, expected); - // } + #[test] + fn translate_runs() { + let expected = vec!['M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M']; + let input_ms: Vec = vec![1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,11,11,12,11,10,11,11,12,11,12,10,11,12,12,10,11,11,11,11,11,11,10,11,11,12,13,11,12,13,14,15,16,13,14,15,16,12,12,13,14,15,16,17,18,19,20,21,22,12,10,10,11,12,11,10,11,12,11,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,13,14,15,12,12,10,11,11,11,12,13,13,14,15,11,11,11,11,11,11,12,13,14,11,11,11,11,12,13,12,12,12,12,13,12,13,14,12,13,11,12,12,11,12,11,12,13,14,14,13,14,15,15,16,17,18,19,19,19,20,21,22,12,13,11,11,12,12,13,14,15,16,17,18,19,20,21,22,10,11,9,10,10,11,11,12,11,11,12,13,13,14,12,11,11,12,13,12,13,12,12,12,12,13,11,12,12,10,11,11,10,11,11,12,10,9,10,10,10,11,12,10,9,10,10,10,11,10,11,12,10,8,9,10,9,9,10,9,10,10,10,11,12,13,14,15,16,17,13,11,11,11,12,11,11,12,12,11,11,12,12,13,14,15,11,12,10,11,9,10,11,11,11,11,11,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,11,12,12,13,11,11,12,13,14,13,11,11,12,13,14,15,16,17,18,19,20,21,11,12,11,11,12,11,12,12,12,12,11,10,11,12,11,11,12,13,12,12,11,12,13,13,13,11,11,12,11,12,13,12,13,14,15,16,17,18,19,20,21,11,12,13,9,10,11,10,10,10,11,12,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27]; + let input_runs: Vec = vec![1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,-47,-46,-45,-44,-43,-42,-41,-40,-39,-38,-37,-36,-35,-34,-33,-32,-31,-30,-29,-28,-27,-26,-25,-24,-23,-22,-21,-20,-19,-18,-17,-16,-15,-14,-13,-12,-11,-10,-9,-8,-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,-164,-163,-162,-161,-160,-159,-158,-157,-156,-155,-154,-153,-152,-151,-150,-149,-148,-147,-146,-145,-144,-143,-142,-141,-140,-139,-138,-137,-136,-135,-134,-133,-132,-131,-130,-129,-128,-127,-126,-125,-124,-123,-122,-121,-120,-119,-118,-117,-116,-115,-114,-113,-112,-111,-110,-109,-108,-107,-106,-105,-104,-103,-102,-101,-100,-99,-98,-97,-96,-95,-94,-93,-92,-91,-90,-89,-88,-87,-86,-85,-84,-83,-82,-81,-80,-79,-78,-77,-76,-75,-74,-73,-72,-71,-70,-69,-68,-67,-66,-65,-64,-63,-62,-61,-60,-59,-58,-57,-56,-55,-54,-53,-52,-51,-50,-49,-48,-47,-46,-45,-44,-43,-42,-41,-40,-39,-38,-37,-36,-35,-34,-33,-32,-31,-30,-29,-28,-27,-26,-25,-24,-23,-22,-21,-20,-19,-18,-17,-16,-15,-14,-13,-12,-11,-10,-9,-8,-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,-63,-62,-61,-60,-59,-58,-57,-56,-55,-54,-53,-52,-51,-50,-49,-48,-47,-46,-45,-44,-43,-42,-41,-40,-39,-38,-37,-36,-35,-34,-33,-32,-31,-30,-29,-28,-27,-26,-25,-24,-23,-22,-21,-20,-19,-18,-17,-16,-15,-14,-13,-12,-11,-10,-9,-8,-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27]; + let got = super::translate_runs(&input_ms, &input_runs, &Some(super::TranslateParams{ k: 31, threshold: 22 })); + assert_eq!(got, expected); + } #[test] fn run_lengths() { From 9ddf0d6832abb39c825182172595d97da998bb7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Sun, 22 Sep 2024 20:23:03 +0300 Subject: [PATCH 012/103] Use slice arguments in translate_runs --- src/map.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/map.rs b/src/map.rs index 1ad039f..86f737f 100644 --- a/src/map.rs +++ b/src/map.rs @@ -143,8 +143,8 @@ pub fn derandomize_ms( /// Converts a derandomized k-bounded matching statistics vec to an alignment vec. pub fn translate_runs( - ms: &Vec, - runs: &Vec, + ms: &[usize], + runs: &[i64], params_in: &Option, ) -> Vec { let params = params_in.clone().unwrap(); From eb45d4f8c66a3ce48657a23b5b8d4fa4dc9565f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Sun, 22 Sep 2024 20:23:31 +0300 Subject: [PATCH 013/103] More documentation. --- src/map.rs | 41 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 37 insertions(+), 4 deletions(-) diff --git a/src/map.rs b/src/map.rs index 86f737f..b4f040c 100644 --- a/src/map.rs +++ b/src/map.rs @@ -79,7 +79,7 @@ pub fn random_match_threshold( return k; } -/// Derandomizes a noisy _k_-bounded matching statistic. +/// Derandomizes a single noisy _k_-bounded matching statistic. /// /// Derandomizes the `current_ms` matching statistic (MS) based on the /// `next_run` value obtained from the output of this function for the @@ -102,6 +102,7 @@ pub fn ms_to_run( threshold: usize, k: usize, ) -> i64 { + assert!(k > 0); assert!(threshold > 1); assert!(current_ms <= k); assert!(next_run <= k as i64); @@ -123,16 +124,30 @@ pub fn ms_to_run( return run; } +/// Derandomizes a sequence of noisy _k_-bounded matching statistics. +/// +/// Iterates over a sequence of noisy _k_bounded matching statistics +/// `ms` in reverse to identify values that are the result of random +/// matching between _k_-mers of size `k` and an index that the lower +/// bound `threshold` was calculated for. +/// +/// # Examples +/// TODO Add examples to derandomize_ms documentation +/// pub fn derandomize_ms( - ms: &Vec, + ms: &[usize], params_in: &Option ) -> Vec { let params = params_in.clone().unwrap(); let len = ms.len(); + assert!(params.k > 0); + assert!(params.threshold > 1); + assert!(ms.len() > 2); + let mut runs: Vec = vec![0; len]; - // Traverse the matching statistics in reverse + // Traverse the matching statistics in reverse. runs[len - 1] = ms[len - 1] as i64; for i in 2..len { runs[len - i] = ms_to_run(ms[len - i], runs[len - i + 1], params.threshold, params.k); @@ -141,7 +156,25 @@ pub fn derandomize_ms( return runs; } -/// Converts a derandomized k-bounded matching statistics vec to an alignment vec. +/// Converts a derandomized _k_-bounded matching statistics vec to an alignment vec. +/// +/// Iterates over a derandomized sequence of _k_bounded matching +/// statistics `runs` and creates a sequence containing a character +/// representation of the underlying alignment that generated `runs`. +/// +/// The alignment is encoded using the following characters: +/// - **M** : Match between query and reference. +/// - **-** : Characters in the query that are not found in the reference. +/// - **X** : Single character mismatch or insertion into the query. +/// - **R** : Two consecutive 'R's signify a discontinuity in the alignment. +/// The right 'R' is at the start of a _k_-mer that is not adjacent +/// to the last character in the _k_-mer corresponding to the left +/// 'R'. This implies either a deletion of unknown length in the query, +/// or insertion of _k_-mers from elsewhere in the reference into the query. +/// +/// # Examples +/// TODO Add examples to translate_runs documentation. +/// pub fn translate_runs( ms: &[usize], runs: &[i64], From 542b723dae320558e31dcbc28bcf2154ddcbd702 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Sun, 22 Sep 2024 20:33:51 +0300 Subject: [PATCH 014/103] Create new module format for formatting results and move run_lengths. --- src/format.rs | 52 +++++++++++++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 8 +++----- src/main.rs | 6 +++++- src/map.rs | 34 --------------------------------- 4 files changed, 60 insertions(+), 40 deletions(-) create mode 100644 src/format.rs diff --git a/src/format.rs b/src/format.rs new file mode 100644 index 0000000..1bb2b61 --- /dev/null +++ b/src/format.rs @@ -0,0 +1,52 @@ +// sablast: Spectral Burrows-Wheeler transform accelerated local alignment search +// +// Copyright 2024 Tommi Mäklin [tommi@maklin.fi]. + +// Copyrights in this project are retained by contributors. No copyright assignment +// is required to contribute to this project. + +// Except as otherwise noted (below and/or in individual files), this +// project is licensed under the Apache License, Version 2.0 +// or or +// the MIT license, or , +// at your option. +// +pub fn run_lengths( + aln: &Vec, +) -> Vec<(usize, usize, usize, usize)> { + // Store run lengths as Vec<(start, end, matches, mismatches)> + let mut encodings: Vec<(usize, usize, usize, usize)> = Vec::new(); + + let mut i = 0; + let mut match_start: bool = false; + while i < aln.len() { + match_start = (aln[i] != '-' && aln[i] != ' ') && !match_start; + if match_start { + let start = i; + let mut matches: usize = 0; + while i < aln.len() && (aln[i] != '-' && aln[i] != ' ') { + matches += (aln[i] == 'M' || aln[i] == 'R') as usize; + i += 1; + } + encodings.push((start + 1, i, matches, i - start - matches)); + match_start = false; + } else { + i += 1; + } + } + return encodings; +} + +//////////////////////////////////////////////////////////////////////////////// +// Tests +// +#[cfg(test)] +mod tests { + #[test] + fn run_lengths() { + let expected: Vec<(usize, usize, usize, usize)> = vec![(6,33,28,0),(82,207,126,0),(373,423,51,0),(488,512,25,0)]; + let input = vec!['-','-','-','-','-','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M']; + let got = super::run_lengths(&input); + assert_eq!(got, expected); + } +} diff --git a/src/lib.rs b/src/lib.rs index ac0ba15..de795ef 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -16,12 +16,13 @@ use sbwt::SbwtIndexVariant; pub mod index; pub mod map; +pub mod format; pub fn map( query_file: &String, sbwt: &sbwt::SbwtIndexVariant, lcs: &sbwt::LcsArray, -) -> Vec<(usize, usize, usize, usize, char)> { +) -> (Vec, Vec) { let (k, threshold) = match sbwt { SbwtIndexVariant::SubsetMatrix(ref sbwt) => { (sbwt.k(), map::random_match_threshold(sbwt.k(), sbwt.n_kmers(), 4 as usize, 0.0000001 as f64)) @@ -39,9 +40,6 @@ pub fn map( map::derandomize_ms(&ms_rev, &Some(translate_params.clone()))); let aln = (map::translate_runs(&ms_fw, &runs.0, &Some(translate_params.clone())), map::translate_runs(&ms_rev, &runs.1, &Some(translate_params))); - let mut run_lengths: Vec<(usize, usize, usize, usize, char)> = map::run_lengths(&aln.0).iter().map(|x| (x.0, x.1, x.2, x.3, '+')).collect(); - let mut run_lengths_rev: Vec<(usize, usize, usize, usize, char)> = map::run_lengths(&aln.1).iter().map(|x| (x.0, x.1, x.2, x.3, '-')).collect(); - run_lengths.append(&mut run_lengths_rev); - return run_lengths; + return aln; } diff --git a/src/main.rs b/src/main.rs index 56b21ca..bfc0883 100644 --- a/src/main.rs +++ b/src/main.rs @@ -81,7 +81,11 @@ fn main() { // TODO Query multiple inputs in sablast map info!("Querying SBWT index..."); - let mut run_lengths = sablast::map(&seq_files[0], &sbwt, &lcs); + let aln = sablast::map(&seq_files[0], &sbwt, &lcs); + let mut run_lengths: Vec<(usize, usize, usize, usize, char)> = sablast::format::run_lengths(&aln.0).iter().map(|x| (x.0, x.1, x.2, x.3, '+')).collect(); + let mut run_lengths_rev: Vec<(usize, usize, usize, usize, char)> = sablast::format::run_lengths(&aln.1).iter().map(|x| (x.0, x.1, x.2, x.3, '-')).collect(); + run_lengths.append(&mut run_lengths_rev); + run_lengths.sort_by_key(|x| x.0); println!("query\tref\tq.start\tq.end\tstrand\tlength\tmismatches"); diff --git a/src/map.rs b/src/map.rs index b4f040c..712d24f 100644 --- a/src/map.rs +++ b/src/map.rs @@ -235,32 +235,6 @@ pub fn translate_runs( return res; } -pub fn run_lengths( - aln: &Vec, -) -> Vec<(usize, usize, usize, usize)> { - // Store run lengths as Vec<(start, end, matches, mismatches)> - let mut encodings: Vec<(usize, usize, usize, usize)> = Vec::new(); - - let mut i = 0; - let mut match_start: bool = false; - while i < aln.len() { - match_start = (aln[i] != '-' && aln[i] != ' ') && !match_start; - if match_start { - let start = i; - let mut matches: usize = 0; - while i < aln.len() && (aln[i] != '-' && aln[i] != ' ') { - matches += (aln[i] == 'M' || aln[i] == 'R') as usize; - i += 1; - } - encodings.push((start + 1, i, matches, i - start - matches)); - match_start = false; - } else { - i += 1; - } - } - return encodings; -} - //////////////////////////////////////////////////////////////////////////////// // Tests // @@ -308,12 +282,4 @@ mod tests { let got = super::translate_runs(&input_ms, &input_runs, &Some(super::TranslateParams{ k: 31, threshold: 22 })); assert_eq!(got, expected); } - - #[test] - fn run_lengths() { - let expected: Vec<(usize, usize, usize, usize)> = vec![(6,33,28,0),(82,207,126,0),(373,423,51,0),(488,512,25,0)]; - let input = vec!['-','-','-','-','-','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M']; - let got = super::run_lengths(&input); - assert_eq!(got, expected); - } } From cee4b2ac5b942cbd05d4244f977fa5953615f21f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Sun, 22 Sep 2024 20:34:37 +0300 Subject: [PATCH 015/103] Update integration test map_clbs.rs --- tests/map_clbs.rs | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/tests/map_clbs.rs b/tests/map_clbs.rs index adfc42a..f6e4255 100644 --- a/tests/map_clbs.rs +++ b/tests/map_clbs.rs @@ -1,9 +1,26 @@ +// sablast: Spectral Burrows-Wheeler transform accelerated local alignment search +// +// Copyright 2024 Tommi Mäklin [tommi@maklin.fi]. + +// Copyrights in this project are retained by contributors. No copyright assignment +// is required to contribute to this project. + +// Except as otherwise noted (below and/or in individual files), this +// project is licensed under the Apache License, Version 2.0 +// or or +// the MIT license, or , +// at your option. +// #[test] fn map_nissle_against_clbs() { let (sbwt, lcs) = sablast::index::build_sbwt(&"tests/data/clbS.fna.gz".to_string(), &None); let expected = vec![(455, 967, 512, 1, '+'),(997, 1001, 5, 0, '+'),(998, 1001, 4, 0, '-')]; - let got = sablast::map(&"tests/data/NZ_CP058217.1_clbS.fna.gz".to_string(), &sbwt::SbwtIndexVariant::SubsetMatrix(sbwt), &lcs.unwrap()); + let aln = sablast::map(&"tests/data/NZ_CP058217.1_clbS.fna.gz".to_string(), &sbwt::SbwtIndexVariant::SubsetMatrix(sbwt), &lcs.unwrap()); + + let mut got: Vec<(usize, usize, usize, usize, char)> = sablast::format::run_lengths(&aln.0).iter().map(|x| (x.0, x.1, x.2, x.3, '+')).collect(); + let mut run_lengths_rev: Vec<(usize, usize, usize, usize, char)> = sablast::format::run_lengths(&aln.1).iter().map(|x| (x.0, x.1, x.2, x.3, '-')).collect(); + got.append(&mut run_lengths_rev); assert_eq!(got, expected); } From 2c4b016f1d2a01ad1107b11b030ebb771ff078e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Sun, 22 Sep 2024 20:39:01 +0300 Subject: [PATCH 016/103] Remove use of TranslateParams, replace with access to k, threshold. --- src/lib.rs | 10 ++++------ src/map.rs | 32 +++++++++++--------------------- 2 files changed, 15 insertions(+), 27 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index de795ef..d74be81 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -28,18 +28,16 @@ pub fn map( (sbwt.k(), map::random_match_threshold(sbwt.k(), sbwt.n_kmers(), 4 as usize, 0.0000001 as f64)) }, }; - let translate_params = map::TranslateParams { k: k, threshold: threshold }; - // TODO handle multiple files and `input_list` let ms = index::query_sbwt(&query_file, &sbwt, &lcs); info!("Translating result..."); let ms_fw = ms.iter().map(|x| x.0).collect::>(); let ms_rev = ms.iter().map(|x| x.1).collect::>(); - let runs = (map::derandomize_ms(&ms_fw, &Some(translate_params.clone())), - map::derandomize_ms(&ms_rev, &Some(translate_params.clone()))); - let aln = (map::translate_runs(&ms_fw, &runs.0, &Some(translate_params.clone())), - map::translate_runs(&ms_rev, &runs.1, &Some(translate_params))); + let runs = (map::derandomize_ms(&ms_fw, k, threshold), + map::derandomize_ms(&ms_rev, k, threshold)); + let aln = (map::translate_runs(&ms_fw, &runs.0, k, threshold), + map::translate_runs(&ms_rev, &runs.1, k, threshold)); return aln; } diff --git a/src/map.rs b/src/map.rs index 712d24f..e05aab9 100644 --- a/src/map.rs +++ b/src/map.rs @@ -11,13 +11,6 @@ // the MIT license, or , // at your option. // -// Parameters for SBWT construction -#[derive(Clone)] -pub struct TranslateParams { - pub k: usize, - pub threshold: usize, -} - /// Evaluates the CDF of _k_-bounded matching statistics random match distribution. /// /// Computes the log-probability that a matching statistic with value @@ -136,21 +129,21 @@ pub fn ms_to_run( /// pub fn derandomize_ms( ms: &[usize], - params_in: &Option + k: usize, + threshold: usize, ) -> Vec { - let params = params_in.clone().unwrap(); - let len = ms.len(); - - assert!(params.k > 0); - assert!(params.threshold > 1); + assert!(k > 0); + assert!(threshold > 1); assert!(ms.len() > 2); + let len = ms.len(); + let mut runs: Vec = vec![0; len]; // Traverse the matching statistics in reverse. runs[len - 1] = ms[len - 1] as i64; for i in 2..len { - runs[len - i] = ms_to_run(ms[len - i], runs[len - i + 1], params.threshold, params.k); + runs[len - i] = ms_to_run(ms[len - i], runs[len - i + 1], threshold, k); } return runs; @@ -178,12 +171,9 @@ pub fn derandomize_ms( pub fn translate_runs( ms: &[usize], runs: &[i64], - params_in: &Option, + k: usize, + threshold: usize, ) -> Vec { - let params = params_in.clone().unwrap(); - let threshold = params.threshold; - let k = params.k; - assert!(k > 0); assert!(threshold > 1); assert!(ms.len() == runs.len()); @@ -270,7 +260,7 @@ mod tests { let input = vec![1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,11,11,12,11,10,11,11,12,11,12,10,11,12,12,10,11,11,11,11,11,11,10,11,11,12,13,11,12,13,14,15,16,13,14,15,16,12,12,13,14,15,16,17,18,19,20,21,22,12,10,10,11,12,11,10,11,12,11,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,13,14,15,12,12,10,11,11,11,12,13,13,14,15,11,11,11,11,11,11,12,13,14,11,11,11,11,12,13,12,12,12,12,13,12,13,14,12,13,11,12,12,11,12,11,12,13,14,14,13,14,15,15,16,17,18,19,19,19,20,21,22,12,13,11,11,12,12,13,14,15,16,17,18,19,20,21,22,10,11,9,10,10,11,11,12,11,11,12,13,13,14,12,11,11,12,13,12,13,12,12,12,12,13,11,12,12,10,11,11,10,11,11,12,10,9,10,10,10,11,12,10,9,10,10,10,11,10,11,12,10,8,9,10,9,9,10,9,10,10,10,11,12,13,14,15,16,17,13,11,11,11,12,11,11,12,12,11,11,12,12,13,14,15,11,12,10,11,9,10,11,11,11,11,11,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,11,12,12,13,11,11,12,13,14,13,11,11,12,13,14,15,16,17,18,19,20,21,11,12,11,11,12,11,12,12,12,12,11,10,11,12,11,11,12,13,12,12,11,12,13,13,13,11,11,12,11,12,13,12,13,14,15,16,17,18,19,20,21,11,12,13,9,10,11,10,10,10,11,12,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27]; let expected = vec![0,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,-47,-46,-45,-44,-43,-42,-41,-40,-39,-38,-37,-36,-35,-34,-33,-32,-31,-30,-29,-28,-27,-26,-25,-24,-23,-22,-21,-20,-19,-18,-17,-16,-15,-14,-13,-12,-11,-10,-9,-8,-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,-164,-163,-162,-161,-160,-159,-158,-157,-156,-155,-154,-153,-152,-151,-150,-149,-148,-147,-146,-145,-144,-143,-142,-141,-140,-139,-138,-137,-136,-135,-134,-133,-132,-131,-130,-129,-128,-127,-126,-125,-124,-123,-122,-121,-120,-119,-118,-117,-116,-115,-114,-113,-112,-111,-110,-109,-108,-107,-106,-105,-104,-103,-102,-101,-100,-99,-98,-97,-96,-95,-94,-93,-92,-91,-90,-89,-88,-87,-86,-85,-84,-83,-82,-81,-80,-79,-78,-77,-76,-75,-74,-73,-72,-71,-70,-69,-68,-67,-66,-65,-64,-63,-62,-61,-60,-59,-58,-57,-56,-55,-54,-53,-52,-51,-50,-49,-48,-47,-46,-45,-44,-43,-42,-41,-40,-39,-38,-37,-36,-35,-34,-33,-32,-31,-30,-29,-28,-27,-26,-25,-24,-23,-22,-21,-20,-19,-18,-17,-16,-15,-14,-13,-12,-11,-10,-9,-8,-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,-63,-62,-61,-60,-59,-58,-57,-56,-55,-54,-53,-52,-51,-50,-49,-48,-47,-46,-45,-44,-43,-42,-41,-40,-39,-38,-37,-36,-35,-34,-33,-32,-31,-30,-29,-28,-27,-26,-25,-24,-23,-22,-21,-20,-19,-18,-17,-16,-15,-14,-13,-12,-11,-10,-9,-8,-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27]; - let got = super::derandomize_ms(&input, &Some(super::TranslateParams{ k: 31, threshold: 22 })); + let got = super::derandomize_ms(&input, 31, 22); assert_eq!(got, expected); } @@ -279,7 +269,7 @@ mod tests { let expected = vec!['M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M']; let input_ms: Vec = vec![1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,11,11,12,11,10,11,11,12,11,12,10,11,12,12,10,11,11,11,11,11,11,10,11,11,12,13,11,12,13,14,15,16,13,14,15,16,12,12,13,14,15,16,17,18,19,20,21,22,12,10,10,11,12,11,10,11,12,11,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,13,14,15,12,12,10,11,11,11,12,13,13,14,15,11,11,11,11,11,11,12,13,14,11,11,11,11,12,13,12,12,12,12,13,12,13,14,12,13,11,12,12,11,12,11,12,13,14,14,13,14,15,15,16,17,18,19,19,19,20,21,22,12,13,11,11,12,12,13,14,15,16,17,18,19,20,21,22,10,11,9,10,10,11,11,12,11,11,12,13,13,14,12,11,11,12,13,12,13,12,12,12,12,13,11,12,12,10,11,11,10,11,11,12,10,9,10,10,10,11,12,10,9,10,10,10,11,10,11,12,10,8,9,10,9,9,10,9,10,10,10,11,12,13,14,15,16,17,13,11,11,11,12,11,11,12,12,11,11,12,12,13,14,15,11,12,10,11,9,10,11,11,11,11,11,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,11,12,12,13,11,11,12,13,14,13,11,11,12,13,14,15,16,17,18,19,20,21,11,12,11,11,12,11,12,12,12,12,11,10,11,12,11,11,12,13,12,12,11,12,13,13,13,11,11,12,11,12,13,12,13,14,15,16,17,18,19,20,21,11,12,13,9,10,11,10,10,10,11,12,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27]; let input_runs: Vec = vec![1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,-47,-46,-45,-44,-43,-42,-41,-40,-39,-38,-37,-36,-35,-34,-33,-32,-31,-30,-29,-28,-27,-26,-25,-24,-23,-22,-21,-20,-19,-18,-17,-16,-15,-14,-13,-12,-11,-10,-9,-8,-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,-164,-163,-162,-161,-160,-159,-158,-157,-156,-155,-154,-153,-152,-151,-150,-149,-148,-147,-146,-145,-144,-143,-142,-141,-140,-139,-138,-137,-136,-135,-134,-133,-132,-131,-130,-129,-128,-127,-126,-125,-124,-123,-122,-121,-120,-119,-118,-117,-116,-115,-114,-113,-112,-111,-110,-109,-108,-107,-106,-105,-104,-103,-102,-101,-100,-99,-98,-97,-96,-95,-94,-93,-92,-91,-90,-89,-88,-87,-86,-85,-84,-83,-82,-81,-80,-79,-78,-77,-76,-75,-74,-73,-72,-71,-70,-69,-68,-67,-66,-65,-64,-63,-62,-61,-60,-59,-58,-57,-56,-55,-54,-53,-52,-51,-50,-49,-48,-47,-46,-45,-44,-43,-42,-41,-40,-39,-38,-37,-36,-35,-34,-33,-32,-31,-30,-29,-28,-27,-26,-25,-24,-23,-22,-21,-20,-19,-18,-17,-16,-15,-14,-13,-12,-11,-10,-9,-8,-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,-63,-62,-61,-60,-59,-58,-57,-56,-55,-54,-53,-52,-51,-50,-49,-48,-47,-46,-45,-44,-43,-42,-41,-40,-39,-38,-37,-36,-35,-34,-33,-32,-31,-30,-29,-28,-27,-26,-25,-24,-23,-22,-21,-20,-19,-18,-17,-16,-15,-14,-13,-12,-11,-10,-9,-8,-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27]; - let got = super::translate_runs(&input_ms, &input_runs, &Some(super::TranslateParams{ k: 31, threshold: 22 })); + let got = super::translate_runs(&input_ms, &input_runs, 31, 22); assert_eq!(got, expected); } } From 797f04c4221d8bcfa155a4c3fab8e307544952e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Sun, 22 Sep 2024 20:58:40 +0300 Subject: [PATCH 017/103] Rename map.rs -> translate.rs --- src/lib.rs | 14 +++++----- src/{map.rs => translate.rs} | 53 +++++++++++++++++++++++------------- 2 files changed, 41 insertions(+), 26 deletions(-) rename src/{map.rs => translate.rs} (92%) diff --git a/src/lib.rs b/src/lib.rs index d74be81..6c758bd 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -14,9 +14,9 @@ use log::info; use sbwt::SbwtIndexVariant; -pub mod index; -pub mod map; pub mod format; +pub mod index; +pub mod translate; pub fn map( query_file: &String, @@ -25,7 +25,7 @@ pub fn map( ) -> (Vec, Vec) { let (k, threshold) = match sbwt { SbwtIndexVariant::SubsetMatrix(ref sbwt) => { - (sbwt.k(), map::random_match_threshold(sbwt.k(), sbwt.n_kmers(), 4 as usize, 0.0000001 as f64)) + (sbwt.k(), translate::random_match_threshold(sbwt.k(), sbwt.n_kmers(), 4 as usize, 0.0000001 as f64)) }, }; // TODO handle multiple files and `input_list` @@ -34,10 +34,10 @@ pub fn map( info!("Translating result..."); let ms_fw = ms.iter().map(|x| x.0).collect::>(); let ms_rev = ms.iter().map(|x| x.1).collect::>(); - let runs = (map::derandomize_ms(&ms_fw, k, threshold), - map::derandomize_ms(&ms_rev, k, threshold)); - let aln = (map::translate_runs(&ms_fw, &runs.0, k, threshold), - map::translate_runs(&ms_rev, &runs.1, k, threshold)); + let runs = (translate::derandomize_ms_vec(&ms_fw, k, threshold), + translate::derandomize_ms_vec(&ms_rev, k, threshold)); + let aln = (translate::translate_ms_vec(&ms_fw, &runs.0, k, threshold), + translate::translate_ms_vec(&ms_rev, &runs.1, k, threshold)); return aln; } diff --git a/src/map.rs b/src/translate.rs similarity index 92% rename from src/map.rs rename to src/translate.rs index e05aab9..e61c32f 100644 --- a/src/map.rs +++ b/src/translate.rs @@ -11,6 +11,21 @@ // the MIT license, or , // at your option. // +//! Translating deterministic _k_-bounded matching statistics into an +//! alignment representation and derandomizing noisy _k_-bounded +//! matching statistics. +//! +//! This module provides direct access to the functions sablast uses +//! to convert SBWT query results into alignments. Use the functions +//! in the main crate to run the whole pipeline. +//! +//! ## Derandomizing algorithm for noisy inputs +//! TODO Write details about how the derandomizing works. +//! +//! ## Translation algorithm for _k_-bounded matching statistics +//! TODO Describe how the different MS vectors translate into alignments. + + /// Evaluates the CDF of _k_-bounded matching statistics random match distribution. /// /// Computes the log-probability that a matching statistic with value @@ -89,29 +104,29 @@ pub fn random_match_threshold( /// /// TODO Add examples to ms_to_run documentation /// -pub fn ms_to_run( - current_ms: usize, - next_run: i64, +pub fn derandomize_ms_val( + curr_noisy_ms: usize, + next_derand_ms: i64, threshold: usize, k: usize, ) -> i64 { assert!(k > 0); assert!(threshold > 1); - assert!(current_ms <= k); - assert!(next_run <= k as i64); + assert!(curr_noisy_ms <= k); + assert!(next_derand_ms <= k as i64); // Default is to decrease MS by 1. - let mut run: i64 = next_run - 1; + let mut run: i64 = next_derand_ms - 1; - if current_ms == k { + if curr_noisy_ms == k { // Beginning of a full k-mer match run = k as i64; } - if current_ms > threshold && next_run < current_ms as i64 { + if curr_noisy_ms > threshold && next_derand_ms < curr_noisy_ms as i64 { // Beginning of a partial k-mer match // Only works if threshold > 1 - run = current_ms as i64; + run = curr_noisy_ms as i64; } return run; @@ -127,7 +142,7 @@ pub fn ms_to_run( /// # Examples /// TODO Add examples to derandomize_ms documentation /// -pub fn derandomize_ms( +pub fn derandomize_ms_vec( ms: &[usize], k: usize, threshold: usize, @@ -143,13 +158,13 @@ pub fn derandomize_ms( // Traverse the matching statistics in reverse. runs[len - 1] = ms[len - 1] as i64; for i in 2..len { - runs[len - i] = ms_to_run(ms[len - i], runs[len - i + 1], threshold, k); + runs[len - i] = derandomize_ms_val(ms[len - i], runs[len - i + 1], threshold, k); } return runs; } -/// Converts a derandomized _k_-bounded matching statistics vec to an alignment vec. +/// Translates a derandomized _k_-bounded matching statistics vec into an alignment. /// /// Iterates over a derandomized sequence of _k_bounded matching /// statistics `runs` and creates a sequence containing a character @@ -166,9 +181,9 @@ pub fn derandomize_ms( /// or insertion of _k_-mers from elsewhere in the reference into the query. /// /// # Examples -/// TODO Add examples to translate_runs documentation. +/// TODO Add examples to translate_ms_vec documentation. /// -pub fn translate_runs( +pub fn translate_ms_vec( ms: &[usize], runs: &[i64], k: usize, @@ -251,25 +266,25 @@ mod tests { factor.for_each(|i| assert_eq!(super::random_match_threshold(k, n_kmers, alphabet_size, (0.01_f64).powf(i as f64)), expected[i - 1])); } - // TODO Test cases for ms_to_run + // TODO Test cases for derandomize_ms_val // TODO Test cases for run_to_aln #[test] - fn derandomize_ms() { + fn derandomize_ms_vec() { let input = vec![1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,11,11,12,11,10,11,11,12,11,12,10,11,12,12,10,11,11,11,11,11,11,10,11,11,12,13,11,12,13,14,15,16,13,14,15,16,12,12,13,14,15,16,17,18,19,20,21,22,12,10,10,11,12,11,10,11,12,11,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,13,14,15,12,12,10,11,11,11,12,13,13,14,15,11,11,11,11,11,11,12,13,14,11,11,11,11,12,13,12,12,12,12,13,12,13,14,12,13,11,12,12,11,12,11,12,13,14,14,13,14,15,15,16,17,18,19,19,19,20,21,22,12,13,11,11,12,12,13,14,15,16,17,18,19,20,21,22,10,11,9,10,10,11,11,12,11,11,12,13,13,14,12,11,11,12,13,12,13,12,12,12,12,13,11,12,12,10,11,11,10,11,11,12,10,9,10,10,10,11,12,10,9,10,10,10,11,10,11,12,10,8,9,10,9,9,10,9,10,10,10,11,12,13,14,15,16,17,13,11,11,11,12,11,11,12,12,11,11,12,12,13,14,15,11,12,10,11,9,10,11,11,11,11,11,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,11,12,12,13,11,11,12,13,14,13,11,11,12,13,14,15,16,17,18,19,20,21,11,12,11,11,12,11,12,12,12,12,11,10,11,12,11,11,12,13,12,12,11,12,13,13,13,11,11,12,11,12,13,12,13,14,15,16,17,18,19,20,21,11,12,13,9,10,11,10,10,10,11,12,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27]; let expected = vec![0,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,-47,-46,-45,-44,-43,-42,-41,-40,-39,-38,-37,-36,-35,-34,-33,-32,-31,-30,-29,-28,-27,-26,-25,-24,-23,-22,-21,-20,-19,-18,-17,-16,-15,-14,-13,-12,-11,-10,-9,-8,-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,-164,-163,-162,-161,-160,-159,-158,-157,-156,-155,-154,-153,-152,-151,-150,-149,-148,-147,-146,-145,-144,-143,-142,-141,-140,-139,-138,-137,-136,-135,-134,-133,-132,-131,-130,-129,-128,-127,-126,-125,-124,-123,-122,-121,-120,-119,-118,-117,-116,-115,-114,-113,-112,-111,-110,-109,-108,-107,-106,-105,-104,-103,-102,-101,-100,-99,-98,-97,-96,-95,-94,-93,-92,-91,-90,-89,-88,-87,-86,-85,-84,-83,-82,-81,-80,-79,-78,-77,-76,-75,-74,-73,-72,-71,-70,-69,-68,-67,-66,-65,-64,-63,-62,-61,-60,-59,-58,-57,-56,-55,-54,-53,-52,-51,-50,-49,-48,-47,-46,-45,-44,-43,-42,-41,-40,-39,-38,-37,-36,-35,-34,-33,-32,-31,-30,-29,-28,-27,-26,-25,-24,-23,-22,-21,-20,-19,-18,-17,-16,-15,-14,-13,-12,-11,-10,-9,-8,-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,-63,-62,-61,-60,-59,-58,-57,-56,-55,-54,-53,-52,-51,-50,-49,-48,-47,-46,-45,-44,-43,-42,-41,-40,-39,-38,-37,-36,-35,-34,-33,-32,-31,-30,-29,-28,-27,-26,-25,-24,-23,-22,-21,-20,-19,-18,-17,-16,-15,-14,-13,-12,-11,-10,-9,-8,-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27]; - let got = super::derandomize_ms(&input, 31, 22); + let got = super::derandomize_ms_vec(&input, 31, 22); assert_eq!(got, expected); } #[test] - fn translate_runs() { + fn translate_ms_vec() { let expected = vec!['M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M']; let input_ms: Vec = vec![1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,11,11,12,11,10,11,11,12,11,12,10,11,12,12,10,11,11,11,11,11,11,10,11,11,12,13,11,12,13,14,15,16,13,14,15,16,12,12,13,14,15,16,17,18,19,20,21,22,12,10,10,11,12,11,10,11,12,11,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,13,14,15,12,12,10,11,11,11,12,13,13,14,15,11,11,11,11,11,11,12,13,14,11,11,11,11,12,13,12,12,12,12,13,12,13,14,12,13,11,12,12,11,12,11,12,13,14,14,13,14,15,15,16,17,18,19,19,19,20,21,22,12,13,11,11,12,12,13,14,15,16,17,18,19,20,21,22,10,11,9,10,10,11,11,12,11,11,12,13,13,14,12,11,11,12,13,12,13,12,12,12,12,13,11,12,12,10,11,11,10,11,11,12,10,9,10,10,10,11,12,10,9,10,10,10,11,10,11,12,10,8,9,10,9,9,10,9,10,10,10,11,12,13,14,15,16,17,13,11,11,11,12,11,11,12,12,11,11,12,12,13,14,15,11,12,10,11,9,10,11,11,11,11,11,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,11,12,12,13,11,11,12,13,14,13,11,11,12,13,14,15,16,17,18,19,20,21,11,12,11,11,12,11,12,12,12,12,11,10,11,12,11,11,12,13,12,12,11,12,13,13,13,11,11,12,11,12,13,12,13,14,15,16,17,18,19,20,21,11,12,13,9,10,11,10,10,10,11,12,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27]; let input_runs: Vec = vec![1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,-47,-46,-45,-44,-43,-42,-41,-40,-39,-38,-37,-36,-35,-34,-33,-32,-31,-30,-29,-28,-27,-26,-25,-24,-23,-22,-21,-20,-19,-18,-17,-16,-15,-14,-13,-12,-11,-10,-9,-8,-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,-164,-163,-162,-161,-160,-159,-158,-157,-156,-155,-154,-153,-152,-151,-150,-149,-148,-147,-146,-145,-144,-143,-142,-141,-140,-139,-138,-137,-136,-135,-134,-133,-132,-131,-130,-129,-128,-127,-126,-125,-124,-123,-122,-121,-120,-119,-118,-117,-116,-115,-114,-113,-112,-111,-110,-109,-108,-107,-106,-105,-104,-103,-102,-101,-100,-99,-98,-97,-96,-95,-94,-93,-92,-91,-90,-89,-88,-87,-86,-85,-84,-83,-82,-81,-80,-79,-78,-77,-76,-75,-74,-73,-72,-71,-70,-69,-68,-67,-66,-65,-64,-63,-62,-61,-60,-59,-58,-57,-56,-55,-54,-53,-52,-51,-50,-49,-48,-47,-46,-45,-44,-43,-42,-41,-40,-39,-38,-37,-36,-35,-34,-33,-32,-31,-30,-29,-28,-27,-26,-25,-24,-23,-22,-21,-20,-19,-18,-17,-16,-15,-14,-13,-12,-11,-10,-9,-8,-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,-63,-62,-61,-60,-59,-58,-57,-56,-55,-54,-53,-52,-51,-50,-49,-48,-47,-46,-45,-44,-43,-42,-41,-40,-39,-38,-37,-36,-35,-34,-33,-32,-31,-30,-29,-28,-27,-26,-25,-24,-23,-22,-21,-20,-19,-18,-17,-16,-15,-14,-13,-12,-11,-10,-9,-8,-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27]; - let got = super::translate_runs(&input_ms, &input_runs, 31, 22); + let got = super::translate_ms_vec(&input_ms, &input_runs, 31, 22); assert_eq!(got, expected); } } From 1264c755db96a61b2ccbd987f02fb9a08cc9e37f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Sun, 22 Sep 2024 21:21:19 +0300 Subject: [PATCH 018/103] Split translate.rs into translate.rs and derandomize.rs --- src/derandomize.rs | 195 +++++++++++++++++++++++++++++++++ src/lib.rs | 7 +- src/translate.rs | 262 ++++++++++----------------------------------- 3 files changed, 253 insertions(+), 211 deletions(-) create mode 100644 src/derandomize.rs diff --git a/src/derandomize.rs b/src/derandomize.rs new file mode 100644 index 0000000..cb3cd42 --- /dev/null +++ b/src/derandomize.rs @@ -0,0 +1,195 @@ +// sablast: Spectral Burrows-Wheeler transform accelerated local alignment search +// +// Copyright 2024 Tommi Mäklin [tommi@maklin.fi]. + +// Copyrights in this project are retained by contributors. No copyright assignment +// is required to contribute to this project. + +// Except as otherwise noted (below and/or in individual files), this +// project is licensed under the Apache License, Version 2.0 +// or or +// the MIT license, or , +// at your option. +// +//! Derandomizing noisy _k_-bounded matching statistics. +//! +//! ## Derandomizing algorithm for noisy inputs +//! TODO Write details about how the derandomizing works. + +/// Evaluates the CDF of _k_-bounded matching statistics random match distribution. +/// +/// Computes the log-probability that a matching statistic with value +/// `t` or less that is the result of mapping a _k_-mer with +/// `alphabet_size` possible characters against an index containing +/// `n_kmers` _k_-mers was generated by chance. +/// +/// # Examples +/// TODO Add examples to log_rm_max_cdf +/// +/// # Distribution of random matches in _k_-bounded matching statistics +/// TODO Add the maths +/// +/// Credit to Jarno N. Alanko for deriving the random match distribution. +/// +pub fn log_rm_max_cdf( + t: usize, + alphabet_size: usize, + n_kmers: usize, +) -> f64 { + assert!(n_kmers > 0); + assert!(alphabet_size > 0); + + n_kmers as f64 * (- ((1.0_f64.ln() - (alphabet_size as f64).ln()).exp()).powi(t as i32 + 1)).ln_1p() +} + +/// Determines a lower bound for non-random _k_-bounded matching statistic values. +/// +/// Computes the probabilities that the possible values for the +/// _k_-bounded matching statistics (MS) of a _k_-mer with size `k` +/// mapped against an index with `n_kmers` total _k_-mers and +/// `alphabet_size` possible values at each character are random +/// matches. Computation terminates when the MS value that produces a +/// random match probability below `max_error_prob` is found and +/// returned. +/// +/// If no MS value passes the check, the function returns `k` instead. +/// +/// # Examples +/// TODO Add examples to random_match_threshold documentation +/// +pub fn random_match_threshold( + k: usize, + n_kmers: usize, + alphabet_size: usize, + max_error_prob: f64, +) -> usize { + assert!(k > 0); + assert!(n_kmers > 0); + assert!(alphabet_size > 0); + assert!(max_error_prob <= 1 as f64); + assert!(max_error_prob > 0 as f64); + + for i in 1..k { + if log_rm_max_cdf(i, alphabet_size, n_kmers) > (-max_error_prob).ln_1p() { + return i; + } + } + return k; +} + +/// Derandomizes a single noisy _k_-bounded matching statistic. +/// +/// Derandomizes the `current_ms` matching statistic (MS) based on the +/// `next_run` value obtained from the output of this function for the +/// next noisy MS when read left-to-right, the _k_-mer size `k`, and +/// the `threshold` which specifies a lower bound to consider the MS a +/// non-random match. +/// +/// Positive values of the output i64 value mean that i64 characters +/// from the beginning of the k-mer match the reference, ie. same as +/// the MS, while negative values denote distance from the last +/// character in the last _k_-mer that produced a match. +/// +/// # Examples +/// +/// TODO Add examples to ms_to_run documentation +/// +pub fn derandomize_ms_val( + curr_noisy_ms: usize, + next_derand_ms: i64, + threshold: usize, + k: usize, +) -> i64 { + assert!(k > 0); + assert!(threshold > 1); + assert!(curr_noisy_ms <= k); + assert!(next_derand_ms <= k as i64); + + // Default is to decrease MS by 1. + let mut run: i64 = next_derand_ms - 1; + + if curr_noisy_ms == k { + // Beginning of a full k-mer match + run = k as i64; + } + + if curr_noisy_ms > threshold && next_derand_ms < curr_noisy_ms as i64 { + // Beginning of a partial k-mer match + // Only works if threshold > 1 + run = curr_noisy_ms as i64; + } + + return run; +} + +/// Derandomizes a sequence of noisy _k_-bounded matching statistics. +/// +/// Iterates over a sequence of noisy _k_bounded matching statistics +/// `ms` in reverse to identify values that are the result of random +/// matching between _k_-mers of size `k` and an index that the lower +/// bound `threshold` was calculated for. +/// +/// # Examples +/// TODO Add examples to derandomize_ms documentation +/// +pub fn derandomize_ms_vec( + ms: &[usize], + k: usize, + threshold: usize, +) -> Vec { + assert!(k > 0); + assert!(threshold > 1); + assert!(ms.len() > 2); + + let len = ms.len(); + + let mut runs: Vec = vec![0; len]; + + // Traverse the matching statistics in reverse. + runs[len - 1] = ms[len - 1] as i64; + for i in 2..len { + runs[len - i] = derandomize_ms_val(ms[len - i], runs[len - i + 1], threshold, k); + } + + return runs; +} + +//////////////////////////////////////////////////////////////////////////////// +// Tests +// +#[cfg(test)] +mod tests { + use assert_approx_eq::assert_approx_eq; + + #[test] + fn log_rm_max_cdf() { + let expected = vec![-1306319.1078024083,-318761.2492719044,-79220.9269610741,-19776.1823255263,-4942.2344281681,-1235.4454790664,-308.8543003470,-77.2131332649,-19.3032557026,-4.8258121998,-1.2064529421,-0.3016132288,-0.0754033068,-0.0188508267,-0.0047127067,-0.0011781767,-0.0002945442,-0.0000736360,-0.0000184090,-0.0000046023,-0.0000011506,-0.0000002876,-0.0000000719,-0.0000000180,-0.0000000045,0.0000000000,0.0000000000,0.0000000000,0.0000000000,0.0000000000,0.0000000000]; + let alphabet_size = 4; + let n_kmers = 20240921; + let k = 1..32; + k.for_each(|t| assert_approx_eq!(super::log_rm_max_cdf(t, alphabet_size, n_kmers), expected[t - 1], 1e-8f64)); + } + + #[test] + fn random_match_threshold() { + let expected = vec![15,18,22,25,28]; + let alphabet_size = 4; + let n_kmers = 20240921; + let k = 31; + let factor = 1..6; + factor.for_each(|i| assert_eq!(super::random_match_threshold(k, n_kmers, alphabet_size, (0.01_f64).powf(i as f64)), expected[i - 1])); + } + + // TODO Test cases for derandomize_ms_val + + // TODO Test cases for run_to_aln + + #[test] + fn derandomize_ms_vec() { + let input = vec![1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,11,11,12,11,10,11,11,12,11,12,10,11,12,12,10,11,11,11,11,11,11,10,11,11,12,13,11,12,13,14,15,16,13,14,15,16,12,12,13,14,15,16,17,18,19,20,21,22,12,10,10,11,12,11,10,11,12,11,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,13,14,15,12,12,10,11,11,11,12,13,13,14,15,11,11,11,11,11,11,12,13,14,11,11,11,11,12,13,12,12,12,12,13,12,13,14,12,13,11,12,12,11,12,11,12,13,14,14,13,14,15,15,16,17,18,19,19,19,20,21,22,12,13,11,11,12,12,13,14,15,16,17,18,19,20,21,22,10,11,9,10,10,11,11,12,11,11,12,13,13,14,12,11,11,12,13,12,13,12,12,12,12,13,11,12,12,10,11,11,10,11,11,12,10,9,10,10,10,11,12,10,9,10,10,10,11,10,11,12,10,8,9,10,9,9,10,9,10,10,10,11,12,13,14,15,16,17,13,11,11,11,12,11,11,12,12,11,11,12,12,13,14,15,11,12,10,11,9,10,11,11,11,11,11,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,11,12,12,13,11,11,12,13,14,13,11,11,12,13,14,15,16,17,18,19,20,21,11,12,11,11,12,11,12,12,12,12,11,10,11,12,11,11,12,13,12,12,11,12,13,13,13,11,11,12,11,12,13,12,13,14,15,16,17,18,19,20,21,11,12,13,9,10,11,10,10,10,11,12,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27]; + let expected = vec![0,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,-47,-46,-45,-44,-43,-42,-41,-40,-39,-38,-37,-36,-35,-34,-33,-32,-31,-30,-29,-28,-27,-26,-25,-24,-23,-22,-21,-20,-19,-18,-17,-16,-15,-14,-13,-12,-11,-10,-9,-8,-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,-164,-163,-162,-161,-160,-159,-158,-157,-156,-155,-154,-153,-152,-151,-150,-149,-148,-147,-146,-145,-144,-143,-142,-141,-140,-139,-138,-137,-136,-135,-134,-133,-132,-131,-130,-129,-128,-127,-126,-125,-124,-123,-122,-121,-120,-119,-118,-117,-116,-115,-114,-113,-112,-111,-110,-109,-108,-107,-106,-105,-104,-103,-102,-101,-100,-99,-98,-97,-96,-95,-94,-93,-92,-91,-90,-89,-88,-87,-86,-85,-84,-83,-82,-81,-80,-79,-78,-77,-76,-75,-74,-73,-72,-71,-70,-69,-68,-67,-66,-65,-64,-63,-62,-61,-60,-59,-58,-57,-56,-55,-54,-53,-52,-51,-50,-49,-48,-47,-46,-45,-44,-43,-42,-41,-40,-39,-38,-37,-36,-35,-34,-33,-32,-31,-30,-29,-28,-27,-26,-25,-24,-23,-22,-21,-20,-19,-18,-17,-16,-15,-14,-13,-12,-11,-10,-9,-8,-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,-63,-62,-61,-60,-59,-58,-57,-56,-55,-54,-53,-52,-51,-50,-49,-48,-47,-46,-45,-44,-43,-42,-41,-40,-39,-38,-37,-36,-35,-34,-33,-32,-31,-30,-29,-28,-27,-26,-25,-24,-23,-22,-21,-20,-19,-18,-17,-16,-15,-14,-13,-12,-11,-10,-9,-8,-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27]; + + let got = super::derandomize_ms_vec(&input, 31, 22); + assert_eq!(got, expected); + } +} diff --git a/src/lib.rs b/src/lib.rs index 6c758bd..85c547b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -14,6 +14,7 @@ use log::info; use sbwt::SbwtIndexVariant; +pub mod derandomize; pub mod format; pub mod index; pub mod translate; @@ -25,7 +26,7 @@ pub fn map( ) -> (Vec, Vec) { let (k, threshold) = match sbwt { SbwtIndexVariant::SubsetMatrix(ref sbwt) => { - (sbwt.k(), translate::random_match_threshold(sbwt.k(), sbwt.n_kmers(), 4 as usize, 0.0000001 as f64)) + (sbwt.k(), derandomize::random_match_threshold(sbwt.k(), sbwt.n_kmers(), 4 as usize, 0.0000001 as f64)) }, }; // TODO handle multiple files and `input_list` @@ -34,8 +35,8 @@ pub fn map( info!("Translating result..."); let ms_fw = ms.iter().map(|x| x.0).collect::>(); let ms_rev = ms.iter().map(|x| x.1).collect::>(); - let runs = (translate::derandomize_ms_vec(&ms_fw, k, threshold), - translate::derandomize_ms_vec(&ms_rev, k, threshold)); + let runs = (derandomize::derandomize_ms_vec(&ms_fw, k, threshold), + derandomize::derandomize_ms_vec(&ms_rev, k, threshold)); let aln = (translate::translate_ms_vec(&ms_fw, &runs.0, k, threshold), translate::translate_ms_vec(&ms_rev, &runs.1, k, threshold)); diff --git a/src/translate.rs b/src/translate.rs index e61c32f..f003ec1 100644 --- a/src/translate.rs +++ b/src/translate.rs @@ -12,174 +12,76 @@ // at your option. // //! Translating deterministic _k_-bounded matching statistics into an -//! alignment representation and derandomizing noisy _k_-bounded -//! matching statistics. +//! alignment representation. //! -//! This module provides direct access to the functions sablast uses -//! to convert SBWT query results into alignments. Use the functions -//! in the main crate to run the whole pipeline. -//! -//! ## Derandomizing algorithm for noisy inputs -//! TODO Write details about how the derandomizing works. +//! The translated alignment is encoded using the following characters: +//! - **M** : Match between query and reference. +//! - **-** : Characters in the query that are not found in the reference. +//! - **X** : Single character mismatch or insertion into the query. +//! - **R** : Two consecutive 'R's signify a discontinuity in the alignment. +//! The right 'R' is at the start of a _k_-mer that is not adjacent +//! to the last character in the _k_-mer corresponding to the left +//! 'R'. This implies either a deletion of unknown length in the query, +//! or insertion of _k_-mers from elsewhere in the reference into the query. //! //! ## Translation algorithm for _k_-bounded matching statistics //! TODO Describe how the different MS vectors translate into alignments. - -/// Evaluates the CDF of _k_-bounded matching statistics random match distribution. -/// -/// Computes the log-probability that a matching statistic with value -/// `t` or less that is the result of mapping a _k_-mer with -/// `alphabet_size` possible characters against an index containing -/// `n_kmers` _k_-mers was generated by chance. -/// -/// # Examples -/// TODO Add examples to log_rm_max_cdf -/// -/// # Distribution of random matches in _k_-bounded matching statistics -/// TODO Add the maths -/// -/// Credit to Jarno N. Alanko for deriving the random match distribution. -/// -pub fn log_rm_max_cdf( - t: usize, - alphabet_size: usize, - n_kmers: usize, -) -> f64 { - assert!(n_kmers > 0); - assert!(alphabet_size > 0); - - n_kmers as f64 * (- ((1.0_f64.ln() - (alphabet_size as f64).ln()).exp()).powi(t as i32 + 1)).ln_1p() -} - -/// Determines a lower bound for non-random _k_-bounded matching statistic values. -/// -/// Computes the probabilities that the possible values for the -/// _k_-bounded matching statistics (MS) of a _k_-mer with size `k` -/// mapped against an index with `n_kmers` total _k_-mers and -/// `alphabet_size` possible values at each character are random -/// matches. Computation terminates when the MS value that produces a -/// random match probability below `max_error_prob` is found and -/// returned. -/// -/// If no MS value passes the check, the function returns `k` instead. -/// -/// # Examples -/// TODO Add examples to random_match_threshold documentation -/// -pub fn random_match_threshold( - k: usize, - n_kmers: usize, - alphabet_size: usize, - max_error_prob: f64, -) -> usize { - assert!(k > 0); - assert!(n_kmers > 0); - assert!(alphabet_size > 0); - assert!(max_error_prob <= 1 as f64); - assert!(max_error_prob > 0 as f64); - - for i in 1..k { - if log_rm_max_cdf(i, alphabet_size, n_kmers) > (-max_error_prob).ln_1p() { - return i; - } - } - return k; -} - -/// Derandomizes a single noisy _k_-bounded matching statistic. +/// Translates a single derandomized _k_-bounded matching statistics into an alignment. /// -/// Derandomizes the `current_ms` matching statistic (MS) based on the -/// `next_run` value obtained from the output of this function for the -/// next noisy MS when read left-to-right, the _k_-mer size `k`, and -/// the `threshold` which specifies a lower bound to consider the MS a -/// non-random match. +/// Translates the current derandomized matching statistic (MS) based +/// on the values of its left and right neighbors and the lower bound +/// for random matches. /// -/// Positive values of the output i64 value mean that i64 characters -/// from the beginning of the k-mer match the reference, ie. same as -/// the MS, while negative values denote distance from the last -/// character in the last _k_-mer that produced a match. +/// Returns a tuple containing the translation of the current MS and +/// translation of the right neighbor match. The right neighbor is an +/// empty string literal ' ' if translation of the current MS does not +/// affect its value. /// /// # Examples -/// -/// TODO Add examples to ms_to_run documentation -/// -pub fn derandomize_ms_val( - curr_noisy_ms: usize, - next_derand_ms: i64, +/// TODO add examples to translate_ms_val +pub fn translate_ms_val( + curr: i64, + next: i64, + prev: i64, threshold: usize, - k: usize, -) -> i64 { - assert!(k > 0); - assert!(threshold > 1); - assert!(curr_noisy_ms <= k); - assert!(next_derand_ms <= k as i64); - - // Default is to decrease MS by 1. - let mut run: i64 = next_derand_ms - 1; - - if curr_noisy_ms == k { - // Beginning of a full k-mer match - run = k as i64; - } - - if curr_noisy_ms > threshold && next_derand_ms < curr_noisy_ms as i64 { - // Beginning of a partial k-mer match - // Only works if threshold > 1 - run = curr_noisy_ms as i64; - } - - return run; -} - -/// Derandomizes a sequence of noisy _k_-bounded matching statistics. -/// -/// Iterates over a sequence of noisy _k_bounded matching statistics -/// `ms` in reverse to identify values that are the result of random -/// matching between _k_-mers of size `k` and an index that the lower -/// bound `threshold` was calculated for. -/// -/// # Examples -/// TODO Add examples to derandomize_ms documentation -/// -pub fn derandomize_ms_vec( - ms: &[usize], - k: usize, - threshold: usize, -) -> Vec { - assert!(k > 0); - assert!(threshold > 1); - assert!(ms.len() > 2); - - let len = ms.len(); - - let mut runs: Vec = vec![0; len]; - - // Traverse the matching statistics in reverse. - runs[len - 1] = ms[len - 1] as i64; - for i in 2..len { - runs[len - i] = derandomize_ms_val(ms[len - i], runs[len - i + 1], threshold, k); +) -> (char, char) { + let mut aln_curr = ' '; + let mut aln_next = ' '; + if curr > threshold as i64 && next > 0 && next < threshold as i64 { + // Current position is first character in a jump to another k-mer, + // or there is deletion of unknown length in the query wrt. the reference. + // + // Use two consecutive 'R's to denote breakpoint between two k-mers + aln_curr = 'R'; + aln_next = 'R'; + } else if curr <= 0 { + // Start of a mismatch region + if next == 1 && prev > 0 { + // Mismatched character or insertion of 1 character in the query. + // + // Use 'X' for mismatch or 1 character insert + aln_curr = 'X'; + } else { + // Insertion of more than 1 characters in the query + // + // Use '-' to denote inserts of more than 1 characters + aln_curr = '-'; + } + } else { + // Other values are always a match, use 'M' for these + aln_curr = 'M'; } - return runs; + (aln_curr, aln_next) } -/// Translates a derandomized _k_-bounded matching statistics vec into an alignment. +/// Translates a sequence of derandomized _k_-bounded matching statistics into an alignment. /// /// Iterates over a derandomized sequence of _k_bounded matching /// statistics `runs` and creates a sequence containing a character /// representation of the underlying alignment that generated `runs`. /// -/// The alignment is encoded using the following characters: -/// - **M** : Match between query and reference. -/// - **-** : Characters in the query that are not found in the reference. -/// - **X** : Single character mismatch or insertion into the query. -/// - **R** : Two consecutive 'R's signify a discontinuity in the alignment. -/// The right 'R' is at the start of a _k_-mer that is not adjacent -/// to the last character in the _k_-mer corresponding to the left -/// 'R'. This implies either a deletion of unknown length in the query, -/// or insertion of _k_-mers from elsewhere in the reference into the query. -/// /// # Examples /// TODO Add examples to translate_ms_vec documentation. /// @@ -207,32 +109,10 @@ pub fn translate_ms_vec( let mut aln_curr = res[pos]; let mut aln_next = if pos + 1 < len - 1 { res[pos + 1] } else { 'M' }; - if curr > threshold as i64 && next > 0 && next < threshold as i64 { - // Current position is first character in a jump to another k-mer, - // or there is deletion of unknown length in the query wrt. the reference. - // - // Use two consecutive 'R's to denote breakpoint between two k-mers - aln_curr = 'R'; - aln_next = 'R'; - } else if curr <= 0 { - // Start of a mismatch region - if next == 1 && prev > 0 { - // Mismatched character or insertion of 1 character in the query. - // - // Use 'X' for mismatch or 1 character insert - aln_curr = 'X'; - } else { - // Insertion of more than 1 characters in the query - // - // Use '-' to denote inserts of more than 1 characters - aln_curr = '-'; - } - } else { - // Other values are always a match, use 'M' for these - aln_curr = 'M'; - } + let (aln_curr, aln_next) = translate_ms_val(curr, next, prev, threshold); + res[pos] = aln_curr; - if pos + 1 < len - 1 { + if pos + 1 < len - 1 && aln_next != ' ' { res[pos + 1] = aln_next; } } @@ -245,40 +125,6 @@ pub fn translate_ms_vec( // #[cfg(test)] mod tests { - use assert_approx_eq::assert_approx_eq; - - #[test] - fn log_rm_max_cdf() { - let expected = vec![-1306319.1078024083,-318761.2492719044,-79220.9269610741,-19776.1823255263,-4942.2344281681,-1235.4454790664,-308.8543003470,-77.2131332649,-19.3032557026,-4.8258121998,-1.2064529421,-0.3016132288,-0.0754033068,-0.0188508267,-0.0047127067,-0.0011781767,-0.0002945442,-0.0000736360,-0.0000184090,-0.0000046023,-0.0000011506,-0.0000002876,-0.0000000719,-0.0000000180,-0.0000000045,0.0000000000,0.0000000000,0.0000000000,0.0000000000,0.0000000000,0.0000000000]; - let alphabet_size = 4; - let n_kmers = 20240921; - let k = 1..32; - k.for_each(|t| assert_approx_eq!(super::log_rm_max_cdf(t, alphabet_size, n_kmers), expected[t - 1], 1e-8f64)); - } - - #[test] - fn random_match_threshold() { - let expected = vec![15,18,22,25,28]; - let alphabet_size = 4; - let n_kmers = 20240921; - let k = 31; - let factor = 1..6; - factor.for_each(|i| assert_eq!(super::random_match_threshold(k, n_kmers, alphabet_size, (0.01_f64).powf(i as f64)), expected[i - 1])); - } - - // TODO Test cases for derandomize_ms_val - - // TODO Test cases for run_to_aln - - #[test] - fn derandomize_ms_vec() { - let input = vec![1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,11,11,12,11,10,11,11,12,11,12,10,11,12,12,10,11,11,11,11,11,11,10,11,11,12,13,11,12,13,14,15,16,13,14,15,16,12,12,13,14,15,16,17,18,19,20,21,22,12,10,10,11,12,11,10,11,12,11,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,13,14,15,12,12,10,11,11,11,12,13,13,14,15,11,11,11,11,11,11,12,13,14,11,11,11,11,12,13,12,12,12,12,13,12,13,14,12,13,11,12,12,11,12,11,12,13,14,14,13,14,15,15,16,17,18,19,19,19,20,21,22,12,13,11,11,12,12,13,14,15,16,17,18,19,20,21,22,10,11,9,10,10,11,11,12,11,11,12,13,13,14,12,11,11,12,13,12,13,12,12,12,12,13,11,12,12,10,11,11,10,11,11,12,10,9,10,10,10,11,12,10,9,10,10,10,11,10,11,12,10,8,9,10,9,9,10,9,10,10,10,11,12,13,14,15,16,17,13,11,11,11,12,11,11,12,12,11,11,12,12,13,14,15,11,12,10,11,9,10,11,11,11,11,11,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,11,12,12,13,11,11,12,13,14,13,11,11,12,13,14,15,16,17,18,19,20,21,11,12,11,11,12,11,12,12,12,12,11,10,11,12,11,11,12,13,12,12,11,12,13,13,13,11,11,12,11,12,13,12,13,14,15,16,17,18,19,20,21,11,12,13,9,10,11,10,10,10,11,12,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27]; - let expected = vec![0,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,-47,-46,-45,-44,-43,-42,-41,-40,-39,-38,-37,-36,-35,-34,-33,-32,-31,-30,-29,-28,-27,-26,-25,-24,-23,-22,-21,-20,-19,-18,-17,-16,-15,-14,-13,-12,-11,-10,-9,-8,-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,-164,-163,-162,-161,-160,-159,-158,-157,-156,-155,-154,-153,-152,-151,-150,-149,-148,-147,-146,-145,-144,-143,-142,-141,-140,-139,-138,-137,-136,-135,-134,-133,-132,-131,-130,-129,-128,-127,-126,-125,-124,-123,-122,-121,-120,-119,-118,-117,-116,-115,-114,-113,-112,-111,-110,-109,-108,-107,-106,-105,-104,-103,-102,-101,-100,-99,-98,-97,-96,-95,-94,-93,-92,-91,-90,-89,-88,-87,-86,-85,-84,-83,-82,-81,-80,-79,-78,-77,-76,-75,-74,-73,-72,-71,-70,-69,-68,-67,-66,-65,-64,-63,-62,-61,-60,-59,-58,-57,-56,-55,-54,-53,-52,-51,-50,-49,-48,-47,-46,-45,-44,-43,-42,-41,-40,-39,-38,-37,-36,-35,-34,-33,-32,-31,-30,-29,-28,-27,-26,-25,-24,-23,-22,-21,-20,-19,-18,-17,-16,-15,-14,-13,-12,-11,-10,-9,-8,-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,-63,-62,-61,-60,-59,-58,-57,-56,-55,-54,-53,-52,-51,-50,-49,-48,-47,-46,-45,-44,-43,-42,-41,-40,-39,-38,-37,-36,-35,-34,-33,-32,-31,-30,-29,-28,-27,-26,-25,-24,-23,-22,-21,-20,-19,-18,-17,-16,-15,-14,-13,-12,-11,-10,-9,-8,-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27]; - - let got = super::derandomize_ms_vec(&input, 31, 22); - assert_eq!(got, expected); - } - #[test] fn translate_ms_vec() { let expected = vec!['M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M']; From de6ce4338c37ad4c28e89a3e9d50f1a65940bbb6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Mon, 23 Sep 2024 09:28:13 +0300 Subject: [PATCH 019/103] Docs formatting. --- src/translate.rs | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/translate.rs b/src/translate.rs index f003ec1..4755a3c 100644 --- a/src/translate.rs +++ b/src/translate.rs @@ -11,23 +11,22 @@ // the MIT license, or , // at your option. // -//! Translating deterministic _k_-bounded matching statistics into an -//! alignment representation. +//! Translating deterministic _k_-bounded matching statistics into alignments. //! //! The translated alignment is encoded using the following characters: //! - **M** : Match between query and reference. //! - **-** : Characters in the query that are not found in the reference. //! - **X** : Single character mismatch or insertion into the query. //! - **R** : Two consecutive 'R's signify a discontinuity in the alignment. -//! The right 'R' is at the start of a _k_-mer that is not adjacent -//! to the last character in the _k_-mer corresponding to the left -//! 'R'. This implies either a deletion of unknown length in the query, -//! or insertion of _k_-mers from elsewhere in the reference into the query. +//! The right 'R' is at the start of a _k_-mer that is not adjacent +//! to the last character in the _k_-mer corresponding to the left +//! 'R'. This implies either a deletion of unknown length in the query, +//! or insertion of _k_-mers from elsewhere in the reference into the query. //! //! ## Translation algorithm for _k_-bounded matching statistics //! TODO Describe how the different MS vectors translate into alignments. -/// Translates a single derandomized _k_-bounded matching statistics into an alignment. +/// Translates a single derandomized _k_-bounded matching statistic. /// /// Translates the current derandomized matching statistic (MS) based /// on the values of its left and right neighbors and the lower bound @@ -76,7 +75,7 @@ pub fn translate_ms_val( (aln_curr, aln_next) } -/// Translates a sequence of derandomized _k_-bounded matching statistics into an alignment. +/// Translates a sequence of derandomized _k_-bounded matching statistics. /// /// Iterates over a derandomized sequence of _k_bounded matching /// statistics `runs` and creates a sequence containing a character From 397e739feeef96c5b76a24650bf37109771b5361 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Mon, 23 Sep 2024 09:32:13 +0300 Subject: [PATCH 020/103] Better variable names. --- src/translate.rs | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/translate.rs b/src/translate.rs index 4755a3c..133b4cb 100644 --- a/src/translate.rs +++ b/src/translate.rs @@ -40,23 +40,23 @@ /// # Examples /// TODO add examples to translate_ms_val pub fn translate_ms_val( - curr: i64, - next: i64, - prev: i64, + ms_curr: i64, + ms_next: i64, + ms_prev: i64, threshold: usize, ) -> (char, char) { let mut aln_curr = ' '; let mut aln_next = ' '; - if curr > threshold as i64 && next > 0 && next < threshold as i64 { + if ms_curr > threshold as i64 && ms_next > 0 && ms_next < threshold as i64 { // Current position is first character in a jump to another k-mer, // or there is deletion of unknown length in the query wrt. the reference. // // Use two consecutive 'R's to denote breakpoint between two k-mers aln_curr = 'R'; aln_next = 'R'; - } else if curr <= 0 { + } else if ms_curr <= 0 { // Start of a mismatch region - if next == 1 && prev > 0 { + if ms_next == 1 && ms_prev > 0 { // Mismatched character or insertion of 1 character in the query. // // Use 'X' for mismatch or 1 character insert @@ -86,24 +86,24 @@ pub fn translate_ms_val( /// pub fn translate_ms_vec( ms: &[usize], - runs: &[i64], + derand_ms: &[i64], k: usize, threshold: usize, ) -> Vec { assert!(k > 0); assert!(threshold > 1); assert!(ms.len() == runs.len()); - assert!(runs.len() > 2); + assert!(derand_ms.len() > 2); - let len = runs.len(); + let len = derand_ms.len(); let mut res = vec![' '; len]; - // Traverse the runs + // Traverse the derandomized matchibng statistics for mut pos in 0..len { - let prev: i64 = if pos > 1 { runs[pos - 1] } else { 31 }; - let curr: i64 = runs[pos]; - let next: i64 = if pos < len - 1 { runs[pos + 1] } else { runs[pos] }; let curr_ms = ms[pos]; + let prev: i64 = if pos > 1 { derand_ms[pos - 1] } else { 31 }; + let curr: i64 = derand_ms[pos]; + let next: i64 = if pos < len - 1 { derand_ms[pos + 1] } else { derand_ms[pos] }; let mut aln_curr = res[pos]; let mut aln_next = if pos + 1 < len - 1 { res[pos + 1] } else { 'M' }; From 0249eab9ab811bec493f6a01a62504c55fff8a85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Mon, 23 Sep 2024 09:32:26 +0300 Subject: [PATCH 021/103] Remove passing the noisy MS vec to translate_ms_vec (not needed). --- src/lib.rs | 4 ++-- src/translate.rs | 7 ++----- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 85c547b..0f88c3c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -37,8 +37,8 @@ pub fn map( let ms_rev = ms.iter().map(|x| x.1).collect::>(); let runs = (derandomize::derandomize_ms_vec(&ms_fw, k, threshold), derandomize::derandomize_ms_vec(&ms_rev, k, threshold)); - let aln = (translate::translate_ms_vec(&ms_fw, &runs.0, k, threshold), - translate::translate_ms_vec(&ms_rev, &runs.1, k, threshold)); + let aln = (translate::translate_ms_vec(&runs.0, k, threshold), + translate::translate_ms_vec(&runs.1, k, threshold)); return aln; } diff --git a/src/translate.rs b/src/translate.rs index 133b4cb..1ee4ca7 100644 --- a/src/translate.rs +++ b/src/translate.rs @@ -85,14 +85,12 @@ pub fn translate_ms_val( /// TODO Add examples to translate_ms_vec documentation. /// pub fn translate_ms_vec( - ms: &[usize], derand_ms: &[i64], k: usize, threshold: usize, ) -> Vec { assert!(k > 0); assert!(threshold > 1); - assert!(ms.len() == runs.len()); assert!(derand_ms.len() > 2); let len = derand_ms.len(); @@ -100,7 +98,6 @@ pub fn translate_ms_vec( // Traverse the derandomized matchibng statistics for mut pos in 0..len { - let curr_ms = ms[pos]; let prev: i64 = if pos > 1 { derand_ms[pos - 1] } else { 31 }; let curr: i64 = derand_ms[pos]; let next: i64 = if pos < len - 1 { derand_ms[pos + 1] } else { derand_ms[pos] }; @@ -127,9 +124,9 @@ mod tests { #[test] fn translate_ms_vec() { let expected = vec!['M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M']; - let input_ms: Vec = vec![1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,11,11,12,11,10,11,11,12,11,12,10,11,12,12,10,11,11,11,11,11,11,10,11,11,12,13,11,12,13,14,15,16,13,14,15,16,12,12,13,14,15,16,17,18,19,20,21,22,12,10,10,11,12,11,10,11,12,11,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,13,14,15,12,12,10,11,11,11,12,13,13,14,15,11,11,11,11,11,11,12,13,14,11,11,11,11,12,13,12,12,12,12,13,12,13,14,12,13,11,12,12,11,12,11,12,13,14,14,13,14,15,15,16,17,18,19,19,19,20,21,22,12,13,11,11,12,12,13,14,15,16,17,18,19,20,21,22,10,11,9,10,10,11,11,12,11,11,12,13,13,14,12,11,11,12,13,12,13,12,12,12,12,13,11,12,12,10,11,11,10,11,11,12,10,9,10,10,10,11,12,10,9,10,10,10,11,10,11,12,10,8,9,10,9,9,10,9,10,10,10,11,12,13,14,15,16,17,13,11,11,11,12,11,11,12,12,11,11,12,12,13,14,15,11,12,10,11,9,10,11,11,11,11,11,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,11,12,12,13,11,11,12,13,14,13,11,11,12,13,14,15,16,17,18,19,20,21,11,12,11,11,12,11,12,12,12,12,11,10,11,12,11,11,12,13,12,12,11,12,13,13,13,11,11,12,11,12,13,12,13,14,15,16,17,18,19,20,21,11,12,13,9,10,11,10,10,10,11,12,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27]; + // let input_ms: Vec = vec![1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,11,11,12,11,10,11,11,12,11,12,10,11,12,12,10,11,11,11,11,11,11,10,11,11,12,13,11,12,13,14,15,16,13,14,15,16,12,12,13,14,15,16,17,18,19,20,21,22,12,10,10,11,12,11,10,11,12,11,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,13,14,15,12,12,10,11,11,11,12,13,13,14,15,11,11,11,11,11,11,12,13,14,11,11,11,11,12,13,12,12,12,12,13,12,13,14,12,13,11,12,12,11,12,11,12,13,14,14,13,14,15,15,16,17,18,19,19,19,20,21,22,12,13,11,11,12,12,13,14,15,16,17,18,19,20,21,22,10,11,9,10,10,11,11,12,11,11,12,13,13,14,12,11,11,12,13,12,13,12,12,12,12,13,11,12,12,10,11,11,10,11,11,12,10,9,10,10,10,11,12,10,9,10,10,10,11,10,11,12,10,8,9,10,9,9,10,9,10,10,10,11,12,13,14,15,16,17,13,11,11,11,12,11,11,12,12,11,11,12,12,13,14,15,11,12,10,11,9,10,11,11,11,11,11,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,11,12,12,13,11,11,12,13,14,13,11,11,12,13,14,15,16,17,18,19,20,21,11,12,11,11,12,11,12,12,12,12,11,10,11,12,11,11,12,13,12,12,11,12,13,13,13,11,11,12,11,12,13,12,13,14,15,16,17,18,19,20,21,11,12,13,9,10,11,10,10,10,11,12,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27]; let input_runs: Vec = vec![1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,-47,-46,-45,-44,-43,-42,-41,-40,-39,-38,-37,-36,-35,-34,-33,-32,-31,-30,-29,-28,-27,-26,-25,-24,-23,-22,-21,-20,-19,-18,-17,-16,-15,-14,-13,-12,-11,-10,-9,-8,-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,-164,-163,-162,-161,-160,-159,-158,-157,-156,-155,-154,-153,-152,-151,-150,-149,-148,-147,-146,-145,-144,-143,-142,-141,-140,-139,-138,-137,-136,-135,-134,-133,-132,-131,-130,-129,-128,-127,-126,-125,-124,-123,-122,-121,-120,-119,-118,-117,-116,-115,-114,-113,-112,-111,-110,-109,-108,-107,-106,-105,-104,-103,-102,-101,-100,-99,-98,-97,-96,-95,-94,-93,-92,-91,-90,-89,-88,-87,-86,-85,-84,-83,-82,-81,-80,-79,-78,-77,-76,-75,-74,-73,-72,-71,-70,-69,-68,-67,-66,-65,-64,-63,-62,-61,-60,-59,-58,-57,-56,-55,-54,-53,-52,-51,-50,-49,-48,-47,-46,-45,-44,-43,-42,-41,-40,-39,-38,-37,-36,-35,-34,-33,-32,-31,-30,-29,-28,-27,-26,-25,-24,-23,-22,-21,-20,-19,-18,-17,-16,-15,-14,-13,-12,-11,-10,-9,-8,-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,-63,-62,-61,-60,-59,-58,-57,-56,-55,-54,-53,-52,-51,-50,-49,-48,-47,-46,-45,-44,-43,-42,-41,-40,-39,-38,-37,-36,-35,-34,-33,-32,-31,-30,-29,-28,-27,-26,-25,-24,-23,-22,-21,-20,-19,-18,-17,-16,-15,-14,-13,-12,-11,-10,-9,-8,-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27]; - let got = super::translate_ms_vec(&input_ms, &input_runs, 31, 22); + let got = super::translate_ms_vec(&input_runs, 31, 22); assert_eq!(got, expected); } } From dd6c4621571815bfaa91a98452b85bf19cbc18d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Mon, 23 Sep 2024 09:37:07 +0300 Subject: [PATCH 022/103] Add asserts ti translate_ms_val --- src/translate.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/translate.rs b/src/translate.rs index 1ee4ca7..647f02d 100644 --- a/src/translate.rs +++ b/src/translate.rs @@ -45,6 +45,8 @@ pub fn translate_ms_val( ms_prev: i64, threshold: usize, ) -> (char, char) { + assert!(threshold > 1); + let mut aln_curr = ' '; let mut aln_next = ' '; if ms_curr > threshold as i64 && ms_next > 0 && ms_next < threshold as i64 { From 0f841f06ff27babde094737ad0344d3eb97fa619 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Mon, 23 Sep 2024 09:37:19 +0300 Subject: [PATCH 023/103] Update parameter names in documentation. --- src/translate.rs | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/translate.rs b/src/translate.rs index 647f02d..5242aee 100644 --- a/src/translate.rs +++ b/src/translate.rs @@ -28,9 +28,10 @@ /// Translates a single derandomized _k_-bounded matching statistic. /// -/// Translates the current derandomized matching statistic (MS) based -/// on the values of its left and right neighbors and the lower bound -/// for random matches. +/// Translates the current derandomized matching statistic (MS) +/// `ms_curr` based on the values of its left `ms_prev` and right +/// `ms_next` neighbors and the lower bound `threshold` for random +/// matches. /// /// Returns a tuple containing the translation of the current MS and /// translation of the right neighbor match. The right neighbor is an @@ -80,8 +81,11 @@ pub fn translate_ms_val( /// Translates a sequence of derandomized _k_-bounded matching statistics. /// /// Iterates over a derandomized sequence of _k_bounded matching -/// statistics `runs` and creates a sequence containing a character -/// representation of the underlying alignment that generated `runs`. +/// statistics `derand_ms` for _k_-mers with size `k` derandomized +/// with the threshold `threshold`. +/// +/// Returns a sequence containing a character representation of the +/// underlying alignment. /// /// # Examples /// TODO Add examples to translate_ms_vec documentation. From e2504c6345d92035c86e1775543fd2fbabdf863e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Mon, 23 Sep 2024 09:41:59 +0300 Subject: [PATCH 024/103] Fix compiler warnings. --- src/translate.rs | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/translate.rs b/src/translate.rs index 5242aee..b3d4c54 100644 --- a/src/translate.rs +++ b/src/translate.rs @@ -48,8 +48,8 @@ pub fn translate_ms_val( ) -> (char, char) { assert!(threshold > 1); - let mut aln_curr = ' '; - let mut aln_next = ' '; + let aln_curr: char; + let mut aln_next: char = ' '; if ms_curr > threshold as i64 && ms_next > 0 && ms_next < threshold as i64 { // Current position is first character in a jump to another k-mer, // or there is deletion of unknown length in the query wrt. the reference. @@ -103,14 +103,11 @@ pub fn translate_ms_vec( let mut res = vec![' '; len]; // Traverse the derandomized matchibng statistics - for mut pos in 0..len { + for pos in 0..len { let prev: i64 = if pos > 1 { derand_ms[pos - 1] } else { 31 }; let curr: i64 = derand_ms[pos]; let next: i64 = if pos < len - 1 { derand_ms[pos + 1] } else { derand_ms[pos] }; - let mut aln_curr = res[pos]; - let mut aln_next = if pos + 1 < len - 1 { res[pos + 1] } else { 'M' }; - let (aln_curr, aln_next) = translate_ms_val(curr, next, prev, threshold); res[pos] = aln_curr; @@ -119,7 +116,7 @@ pub fn translate_ms_vec( } } - return res; + res } //////////////////////////////////////////////////////////////////////////////// From 81e6aee8c2ebaf9c6da0bdb55c0b6ec921acc931 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Mon, 23 Sep 2024 09:51:58 +0300 Subject: [PATCH 025/103] Fix initialising derandomized ms vec when last MS is noise. --- src/derandomize.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/derandomize.rs b/src/derandomize.rs index cb3cd42..97eb70c 100644 --- a/src/derandomize.rs +++ b/src/derandomize.rs @@ -146,7 +146,7 @@ pub fn derandomize_ms_vec( let mut runs: Vec = vec![0; len]; // Traverse the matching statistics in reverse. - runs[len - 1] = ms[len - 1] as i64; + runs[len - 1] = if ms[len - 1] > threshold { ms[len - 1]} else { 0 } as i64; for i in 2..len { runs[len - i] = derandomize_ms_val(ms[len - i], runs[len - i + 1], threshold, k); } From 1842d0aa74f6ae98d18424fa959491aa075bcd28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Mon, 23 Sep 2024 09:52:40 +0300 Subject: [PATCH 026/103] Update test --- tests/map_clbs.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/map_clbs.rs b/tests/map_clbs.rs index f6e4255..567497f 100644 --- a/tests/map_clbs.rs +++ b/tests/map_clbs.rs @@ -15,7 +15,7 @@ fn map_nissle_against_clbs() { let (sbwt, lcs) = sablast::index::build_sbwt(&"tests/data/clbS.fna.gz".to_string(), &None); - let expected = vec![(455, 967, 512, 1, '+'),(997, 1001, 5, 0, '+'),(998, 1001, 4, 0, '-')]; + let expected = vec![(455, 967, 512, 1, '+')]; let aln = sablast::map(&"tests/data/NZ_CP058217.1_clbS.fna.gz".to_string(), &sbwt::SbwtIndexVariant::SubsetMatrix(sbwt), &lcs.unwrap()); let mut got: Vec<(usize, usize, usize, usize, char)> = sablast::format::run_lengths(&aln.0).iter().map(|x| (x.0, x.1, x.2, x.3, '+')).collect(); From ffc15e064e0596424796bef5130b50e2c89cda88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Mon, 23 Sep 2024 11:38:42 +0300 Subject: [PATCH 027/103] Add tests for translate_ms_val. --- src/translate.rs | 90 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) diff --git a/src/translate.rs b/src/translate.rs index b3d4c54..c984b87 100644 --- a/src/translate.rs +++ b/src/translate.rs @@ -124,6 +124,96 @@ pub fn translate_ms_vec( // #[cfg(test)] mod tests { + // Test cases for translate_ms_val + // Comments use '-' for characters that are not in a ref or query sequence + #[test] + fn translate_ms_val_with_discontinuity() { + // Parameters : k = 3, threshold = 2 + // + // Ref sequence : A,C,G,T,T,T,C,A,G + // Query sequence : A,C,G,-,-,-,C,A,G + // + // Result MS vector : 1,2,3,1,2,3 + // Testing this pos : | + // Expected output : M,M,R,R,M,M + + let expected = ('R','R'); + let got = super::translate_ms_val(3, 1, 2, 2); + + assert_eq!(got, expected); + } + + #[test] + fn translate_ms_val_with_mismatch() { + // Parameters : k = 3, threshold = 2 + // + // Ref sequence : A,C,G,T,C,A,G + // Query sequence : A,C,G,C,C,A,G + // + // Result MS vector : 1,2,3,0,1,2,3 + // Testing this pos : | + // Expected output : M,M,M,X,M,M,M + + let expected = ('X',' '); + let got = super::translate_ms_val(0, 1, 3, 2); + + assert_eq!(got, expected); + } + + #[test] + fn translate_ms_val_with_single_insertion() { + // Parameters : k = 3, threshold = 2 + // + // Ref sequence : A,C,G,-,C,A,G + // Query sequence : A,C,G,C,C,A,G + // + // Result MS vector : 1,2,3,0,1,2,3 + // Testing this pos : | + // Expected output : M,M,M,X,M,M,M + + // Note this is identical to translate_ms_with_mismatch, these + // are indistinguishible in outputs but have different inputs. + // Kept here as a demonstration. + let expected = ('X', ' '); + let got = super::translate_ms_val(0, 1, 3, 2); + + assert_eq!(got, expected); + } + + #[test] + fn translate_ms_val_with_many_insertions() { + // Parameters : k = 3, threshold = 2 + // + // Ref sequence : A,C,G, -,-,C,A,G + // Query sequence : A,C,G, T,T,C,C,A,G + // + // Result MS vector : 1,2,3,-1,0,1,2,3 + // Testing this pos : | + // Expected output : M,M,M, -,-,M,M,M + + let expected = ('-', ' '); + let got = super::translate_ms_val(-1, 0, 3, 2); + + assert_eq!(got, expected); + } + + #[test] + fn translate_ms_val_with_only_matches() { + // Parameters : k = 3, threshold = 2 + // + // Ref sequence : A,C,G,C,A,G + // Query sequence : A,C,G,C,A,G + // + // Result MS vector : 1,2,3,3,3,3 + // Testing this pos : | + // Expected output : M,M,M,M,M,M + + let expected = ('M', ' '); + let got = super::translate_ms_val(1, 2, 3, 2); + + assert_eq!(got, expected); + } + #[test] fn translate_ms_vec() { let expected = vec!['M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M']; From 3ba1264e2cb764f1522889f510e111d2de7f8a43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Mon, 23 Sep 2024 11:59:22 +0300 Subject: [PATCH 028/103] Add a more comprehensives test for translate_ms_vec that fails. --- src/translate.rs | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/translate.rs b/src/translate.rs index c984b87..28acb77 100644 --- a/src/translate.rs +++ b/src/translate.rs @@ -216,10 +216,18 @@ mod tests { #[test] fn translate_ms_vec() { - let expected = vec!['M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','-','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M','M']; - // let input_ms: Vec = vec![1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,11,11,12,11,10,11,11,12,11,12,10,11,12,12,10,11,11,11,11,11,11,10,11,11,12,13,11,12,13,14,15,16,13,14,15,16,12,12,13,14,15,16,17,18,19,20,21,22,12,10,10,11,12,11,10,11,12,11,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,13,14,15,12,12,10,11,11,11,12,13,13,14,15,11,11,11,11,11,11,12,13,14,11,11,11,11,12,13,12,12,12,12,13,12,13,14,12,13,11,12,12,11,12,11,12,13,14,14,13,14,15,15,16,17,18,19,19,19,20,21,22,12,13,11,11,12,12,13,14,15,16,17,18,19,20,21,22,10,11,9,10,10,11,11,12,11,11,12,13,13,14,12,11,11,12,13,12,13,12,12,12,12,13,11,12,12,10,11,11,10,11,11,12,10,9,10,10,10,11,12,10,9,10,10,10,11,10,11,12,10,8,9,10,9,9,10,9,10,10,10,11,12,13,14,15,16,17,13,11,11,11,12,11,11,12,12,11,11,12,12,13,14,15,11,12,10,11,9,10,11,11,11,11,11,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,11,12,12,13,11,11,12,13,14,13,11,11,12,13,14,15,16,17,18,19,20,21,11,12,11,11,12,11,12,12,12,12,11,10,11,12,11,11,12,13,12,12,11,12,13,13,13,11,11,12,11,12,13,12,13,14,15,16,17,18,19,20,21,11,12,13,9,10,11,10,10,10,11,12,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27]; - let input_runs: Vec = vec![1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,-47,-46,-45,-44,-43,-42,-41,-40,-39,-38,-37,-36,-35,-34,-33,-32,-31,-30,-29,-28,-27,-26,-25,-24,-23,-22,-21,-20,-19,-18,-17,-16,-15,-14,-13,-12,-11,-10,-9,-8,-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,-164,-163,-162,-161,-160,-159,-158,-157,-156,-155,-154,-153,-152,-151,-150,-149,-148,-147,-146,-145,-144,-143,-142,-141,-140,-139,-138,-137,-136,-135,-134,-133,-132,-131,-130,-129,-128,-127,-126,-125,-124,-123,-122,-121,-120,-119,-118,-117,-116,-115,-114,-113,-112,-111,-110,-109,-108,-107,-106,-105,-104,-103,-102,-101,-100,-99,-98,-97,-96,-95,-94,-93,-92,-91,-90,-89,-88,-87,-86,-85,-84,-83,-82,-81,-80,-79,-78,-77,-76,-75,-74,-73,-72,-71,-70,-69,-68,-67,-66,-65,-64,-63,-62,-61,-60,-59,-58,-57,-56,-55,-54,-53,-52,-51,-50,-49,-48,-47,-46,-45,-44,-43,-42,-41,-40,-39,-38,-37,-36,-35,-34,-33,-32,-31,-30,-29,-28,-27,-26,-25,-24,-23,-22,-21,-20,-19,-18,-17,-16,-15,-14,-13,-12,-11,-10,-9,-8,-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,-63,-62,-61,-60,-59,-58,-57,-56,-55,-54,-53,-52,-51,-50,-49,-48,-47,-46,-45,-44,-43,-42,-41,-40,-39,-38,-37,-36,-35,-34,-33,-32,-31,-30,-29,-28,-27,-26,-25,-24,-23,-22,-21,-20,-19,-18,-17,-16,-15,-14,-13,-12,-11,-10,-9,-8,-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27]; - let got = super::translate_ms_vec(&input_runs, 31, 22); + // Parameters : k = 3, threshold = 2 + // TODO check the k-mers + // Ref sequence : A,A,A,G,A,A,C,C,A,-,T,C,A, -,-,G,G,G,C,G + // Query sequence : C,A,A,G,-,-,C,C,A,C,T,C,A, T,T,G,G,G,T,C + // + // Result MS vector : 0,1,2,3, 1,2,3,0,1,2,3,-1,0,1,2,3,0,1 + // Expected output : X,M,M,R, R,M,M,X,M,M,M, -,-,M,M,M,-,- + + let input: Vec = vec![0,1,2,3,1,2,3,0,1,2,3,-1,0,1,2,3,0,1]; + let expected: Vec = vec!['X','M','M','R','R','M','M','X','M','M','M','-','-','M','M','M','-','-']; + let got = super::translate_ms_vec(&input, 3, 2); + assert_eq!(got, expected); } } From 13c4b9b876f14ebb2e07c20a86021c8db0bc2cb9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Mon, 23 Sep 2024 12:52:14 +0300 Subject: [PATCH 029/103] Take &[u8] in query_sbwt instead of the file name. --- src/index.rs | 18 +++++------------- src/lib.rs | 16 ++++++++++++---- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/src/index.rs b/src/index.rs index 17c0a2c..f217c93 100644 --- a/src/index.rs +++ b/src/index.rs @@ -126,23 +126,15 @@ pub fn load_sbwt( } pub fn query_sbwt( - query_file: &String, + query: &[u8], index: &sbwt::SbwtIndexVariant, lcs: &sbwt::LcsArray, -) -> Vec<(usize, usize)> { - match index { +) -> Vec { + let ms = match index { SbwtIndexVariant::SubsetMatrix(sbwt) => { - let mut reader = needletail::parse_fastx_file(&query_file).expect("valid path/file"); let streaming_index = sbwt::StreamingIndex::new(sbwt, lcs); - - // TODO handle input with multiple sequences - // implement as querying 1 record at a time - let Some(rec) = reader.next() else { panic!("Invalid query {}", query_file); }; - let seqrec = rec.expect("invalid_record"); - let seq = seqrec.normalize(true); - let ms_fw = streaming_index.matching_statistics(seq.sequence()); - let ms_rc = streaming_index.matching_statistics(seq.reverse_complement().sequence()); - return ms_fw.iter().zip(ms_rc.iter()).map(|x| (x.0.0, x.1.0)).collect(); + streaming_index.matching_statistics(query) }, }; + ms.iter().map(|x| x.0).collect() } diff --git a/src/lib.rs b/src/lib.rs index 0f88c3c..4f1ed48 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -12,6 +12,7 @@ // at your option. // use log::info; +use needletail::Sequence; use sbwt::SbwtIndexVariant; pub mod derandomize; @@ -30,12 +31,19 @@ pub fn map( }, }; // TODO handle multiple files and `input_list` - let ms = index::query_sbwt(&query_file, &sbwt, &lcs); + + let mut reader = needletail::parse_fastx_file(query_file).expect("valid path/file"); + let Some(rec) = reader.next() else { panic!("Invalid query {}", query_file); }; + let seqrec = rec.expect("invalid_record"); + + let seq_fwd = seqrec.normalize(true); + let ms_fwd = index::query_sbwt(seq_fwd.sequence(), &sbwt, &lcs); + + let seq_rev = seq_fwd.reverse_complement(); + let ms_rev = index::query_sbwt(seq_rev.sequence(), &sbwt, &lcs); info!("Translating result..."); - let ms_fw = ms.iter().map(|x| x.0).collect::>(); - let ms_rev = ms.iter().map(|x| x.1).collect::>(); - let runs = (derandomize::derandomize_ms_vec(&ms_fw, k, threshold), + let runs = (derandomize::derandomize_ms_vec(&ms_fwd, k, threshold), derandomize::derandomize_ms_vec(&ms_rev, k, threshold)); let aln = (translate::translate_ms_vec(&runs.0, k, threshold), translate::translate_ms_vec(&runs.1, k, threshold)); From dcb3816dbd809b8a59d4c42aaf238a40a230c73c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Mon, 23 Sep 2024 13:28:13 +0300 Subject: [PATCH 030/103] Rewrite and document load_sbwt --- src/index.rs | 42 +++++++++++++++++++++++++----------------- src/main.rs | 2 +- 2 files changed, 26 insertions(+), 18 deletions(-) diff --git a/src/index.rs b/src/index.rs index f217c93..1f3ad09 100644 --- a/src/index.rs +++ b/src/index.rs @@ -100,29 +100,37 @@ pub fn serialize_sbwt( } } +/// Loads a prebuilt SBWT index and its LCS array from disk. +/// +/// Reads the SBWT index stored at `index_prefix` + ".sbwt" and the +/// LCS array at `index_prefix` + ".lcs". +/// +/// Returns a tuple containing the SBWT index variant and the LCS +/// array. +/// +/// Panics if the SBWT or the LCS file are not readable with +/// std::fs::File::open. +/// +/// # Examples +/// TODO Add examples to load_sbwt documentation. +/// pub fn load_sbwt( - index_prefix: String, + index_prefix: &str, ) -> (sbwt::SbwtIndexVariant, sbwt::LcsArray) { - let mut indexfile = index_prefix.clone(); - let mut lcsfile = indexfile.clone(); - indexfile.push_str(".sbwt"); - lcsfile.push_str(".lcs"); + let indexfile = format!("{}.sbwt", index_prefix); + let lcsfile = format!("{}.lcs", index_prefix); - // Read sbwt - let mut index_reader = std::io::BufReader::new(std::fs::File::open(indexfile).unwrap()); + // Load sbwt + let sbwt_conn = std::fs::File::open(&indexfile).unwrap_or_else(|_| panic!("Expected SBWT at {}", indexfile)); + let mut index_reader = std::io::BufReader::new(sbwt_conn); let sbwt = sbwt::load_sbwt_index_variant(&mut index_reader).unwrap(); // Load the lcs array - let lcs = match std::fs::File::open(&lcsfile) { - Ok(f) => { - let mut lcs_reader = std::io::BufReader::new(f); - sbwt::LcsArray::load(&mut lcs_reader).unwrap() - } - Err(_) => { - panic!("No LCS array found at {}", lcsfile); - } - }; - return (sbwt, lcs); + let lcs_conn = std::fs::File::open(&lcsfile).unwrap_or_else(|_| panic!("Expected LCS array at {}", lcsfile)); + let mut lcs_reader = std::io::BufReader::new(lcs_conn); + let lcs = sbwt::LcsArray::load(&mut lcs_reader).unwrap(); + + (sbwt, lcs) } pub fn query_sbwt( diff --git a/src/main.rs b/src/main.rs index bfc0883..bbf0b37 100644 --- a/src/main.rs +++ b/src/main.rs @@ -74,7 +74,7 @@ fn main() { init_log(if *verbose { 2 } else { 1 }); info!("Loading SBWT index..."); - let (sbwt, lcs) = sablast::index::load_sbwt(index_prefix.clone().unwrap()); + let (sbwt, lcs) = sablast::index::load_sbwt(index_prefix.as_ref().unwrap()); // TODO Handle `--input-list in sablast map From d21abfa8a8de9c6e7b555585d9929deade76b5e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Mon, 23 Sep 2024 13:28:27 +0300 Subject: [PATCH 031/103] Document query_sbwt --- src/index.rs | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/index.rs b/src/index.rs index 1f3ad09..ca88679 100644 --- a/src/index.rs +++ b/src/index.rs @@ -133,6 +133,17 @@ pub fn load_sbwt( (sbwt, lcs) } +/// Queries an SBWT index for the _k_-bounded matching statistics. +/// +/// Matches the _k_-mers in `query` against the SBWT index `index` and +/// its longest common suffix array `lcs`. +/// +/// Returns a vector containing the _k_-bounded matching statistic at +/// the position of each element in the query. +/// +/// # Examples +/// TODO Add examples to query_sbwt documentation +/// pub fn query_sbwt( query: &[u8], index: &sbwt::SbwtIndexVariant, From 744ecdcf21025ab4ca254448c51700961418d1d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Mon, 23 Sep 2024 13:42:54 +0300 Subject: [PATCH 032/103] Rewrote serialize_sbwt --- src/index.rs | 36 ++++++++++++++++++------------------ src/main.rs | 2 +- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/src/index.rs b/src/index.rs index ca88679..06505c4 100644 --- a/src/index.rs +++ b/src/index.rs @@ -11,6 +11,7 @@ // the MIT license, or , // at your option. // +use std::io::Write; use std::ops::Deref; use std::path::PathBuf; @@ -79,25 +80,24 @@ pub fn build_sbwt( } pub fn serialize_sbwt( - sbwt: sbwt::SbwtIndex, - lcs: &Option, - params_in: &Option, + outfile_prefix: &str, + sbwt: &sbwt::SbwtIndex, + lcs: &sbwt::LcsArray, ) { - let params = params_in.clone().unwrap_or(SBWTParams::default()); - - let mut sbwt_outfile = params.index_prefix.clone().unwrap_or("sbwt".to_string()); - sbwt_outfile.push_str(".sbwt"); - let mut sbwt_out = std::io::BufWriter::new(std::fs::File::create(sbwt_outfile).unwrap()); - - sbwt.n_kmers(); - sbwt::write_sbwt_index_variant(&SbwtIndexVariant::SubsetMatrix(sbwt), &mut sbwt_out).unwrap(); - - if let Some(lcs) = lcs{ - let mut lcs_outfile = params.index_prefix.clone().unwrap_or("sbwt".to_string()); - lcs_outfile.push_str(".lcs"); - let mut lcs_out = std::io::BufWriter::new(std::fs::File::create(&lcs_outfile).unwrap()); - lcs.serialize(&mut lcs_out).unwrap(); - } + let sbwt_outfile = format!("{}.sbwt", outfile_prefix); + let lcs_outfile = format!("{}.lcs", outfile_prefix); + + // Write sbwt + let sbwt_conn = std::fs::File::create(&sbwt_outfile).unwrap_or_else(|_| panic!("Expected write access to {}", sbwt_outfile)); + let mut sbwt_out = std::io::BufWriter::new(sbwt_conn); + sbwt_out.write_all(&(b"SubsetMatrix".len() as u64).to_le_bytes()).expect("Serialized SBWT header part 1."); + sbwt_out.write_all(b"SubsetMatrix").expect("Serialized SBWT header part 2."); + sbwt.serialize(&mut sbwt_out).expect("Serialized SBWT index."); + + // Write lcs array + let lcs_conn = std::fs::File::create(&lcs_outfile).unwrap_or_else(|_| panic!("Expected write access to {}", lcs_outfile)); + let mut lcs_out = std::io::BufWriter::new(lcs_conn); + lcs.serialize(&mut lcs_out).expect("Serialized LCS array."); } /// Loads a prebuilt SBWT index and its LCS array from disk. diff --git a/src/main.rs b/src/main.rs index bbf0b37..c948186 100644 --- a/src/main.rs +++ b/src/main.rs @@ -62,7 +62,7 @@ fn main() { let (sbwt, lcs) = sablast::index::build_sbwt(&seq_files[0], &Some(sbwt_params.clone())); info!("Serializing SBWT index..."); - sablast::index::serialize_sbwt(sbwt, &lcs, &Some(sbwt_params)); + sablast::index::serialize_sbwt(&output_prefix.as_ref().unwrap(), &sbwt, &lcs.as_ref().unwrap()); }, Some(cli::Commands::Map { From bd6f65e8bd13cbb095b4b599b5495e09c86111af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Mon, 23 Sep 2024 16:47:46 +0300 Subject: [PATCH 033/103] Rewrite build_index --- src/index.rs | 82 ++++++++++++++++++++++++++++++++++------------------ src/main.rs | 9 ++---- 2 files changed, 57 insertions(+), 34 deletions(-) diff --git a/src/index.rs b/src/index.rs index 06505c4..056ca6f 100644 --- a/src/index.rs +++ b/src/index.rs @@ -11,8 +11,8 @@ // the MIT license, or , // at your option. // +use std::ffi::OsString; use std::io::Write; -use std::ops::Deref; use std::path::PathBuf; use needletail::Sequence; @@ -22,61 +22,87 @@ use sbwt::SbwtIndexVariant; // Parameters for SBWT construction #[derive(Clone)] -pub struct SBWTParams { +pub struct BuildOpts { pub k: usize, pub add_revcomp: bool, pub num_threads: usize, pub mem_gb: usize, + pub temp_dir: Option, pub prefix_precalc: usize, - pub temp_dir: Option, - pub index_prefix: Option, } // Defaults -impl Default for SBWTParams { - fn default() -> SBWTParams { - SBWTParams { +impl Default for BuildOpts { + fn default() -> BuildOpts { + BuildOpts { k: 31, add_revcomp: false, num_threads: 1, mem_gb: 4, prefix_precalc: 8, temp_dir: None, - index_prefix: None, } } } +struct FastxStreamer { + inner: Box, + record: Vec +} + +impl sbwt::SeqStream for FastxStreamer { + fn stream_next(&mut self) -> Option<&[u8]> { + let rec = self.inner.next(); + match rec { + Some(Ok(seqrec)) => { + // Remove newlines and non IUPAC characters + let normalized = seqrec.normalize(true); + self.record = normalized.as_ref().to_vec(); + Some(&self.record) + }, + _ => None, + } + } +} + +/// Builds an SBWT index and its LCS array from a fasta or fastq file. +/// +/// Streams all valid DNA sequences from `infile` to the SBWT API +/// calls to build the SBWT index and LCS array. Use the [BuildOpts] +/// argument `build_options` to control the options and resources +/// passed to the index builder. +/// +/// Returns a tuple containing the SBWT index and the LCS array. +/// +/// Requires write access to some temporary directory. Path can be set +/// using temp_dir in BuildOpts; defaults to $TMPDIR on Unix if not set. +/// +/// # Examples +/// TODO Add examples to build_sbwt documentation. +/// pub fn build_sbwt( - infile: &String, - params_in: &Option, + infile: &str, + build_options: &Option, ) -> (sbwt::SbwtIndex, Option) { - let params = params_in.clone().unwrap_or(SBWTParams::default()); + // Get temp dir path from build_options, otherwise use whatever std::env::temp_dir() returns + let temp_dir = build_options.as_ref().unwrap().temp_dir.clone().unwrap_or(std::env::temp_dir().to_str().unwrap().to_string()); - let temp_dir = params.temp_dir.unwrap_or(std::env::temp_dir()); let algorithm = BitPackedKmerSorting::new() - .mem_gb(params.mem_gb) + .mem_gb(build_options.as_ref().unwrap().mem_gb) .dedup_batches(false) - .temp_dir(temp_dir.deref()); + .temp_dir(PathBuf::from(OsString::from(temp_dir)).as_path()); - let mut reader = needletail::parse_fastx_file(&infile.clone()).expect("valid path/file"); - - let mut seqs = vec!(); - while let Some(rec) = reader.next() { - let seqrec = rec.expect("invalid_record"); - let seq = seqrec.normalize(true); - seqs.push(seq.deref().to_owned()); - } + let reader = FastxStreamer{inner: needletail::parse_fastx_file(infile).expect("valid path/file"), record: Vec::new()}; let (sbwt, lcs) = SbwtIndexBuilder::new() - .k(params.k) - .n_threads(params.num_threads) - .add_rev_comp(params.add_revcomp) + .k(build_options.as_ref().unwrap().k) + .n_threads(build_options.as_ref().unwrap().num_threads) + .add_rev_comp(build_options.as_ref().unwrap().add_revcomp) .algorithm(algorithm) .build_lcs(true) - .precalc_length(params.prefix_precalc) - .run_from_vecs(&seqs); + .precalc_length(build_options.as_ref().unwrap().prefix_precalc) + .run(reader); - return (sbwt, lcs); + (sbwt, lcs) } pub fn serialize_sbwt( diff --git a/src/main.rs b/src/main.rs index c948186..db849b9 100644 --- a/src/main.rs +++ b/src/main.rs @@ -11,8 +11,6 @@ // the MIT license, or , // at your option. // -use std::ffi::OsString; - use clap::Parser; use log::info; @@ -47,11 +45,10 @@ fn main() { }) => { init_log(if *verbose { 2 } else { 1 }); - let sbwt_params = sablast::index::SBWTParams { + let sbwt_build_options = sablast::index::BuildOpts { num_threads: *num_threads, mem_gb: *mem_gb, - temp_dir: Some(std::path::PathBuf::from(OsString::from(temp_dir.clone().unwrap()))), - index_prefix: output_prefix.clone(), + temp_dir: temp_dir.clone(), ..Default::default() }; // TODO Handle --input-list in sablast build @@ -59,7 +56,7 @@ fn main() { // TODO Handle multiple inputs in sablast build info!("Building SBWT index..."); - let (sbwt, lcs) = sablast::index::build_sbwt(&seq_files[0], &Some(sbwt_params.clone())); + let (sbwt, lcs) = sablast::index::build_sbwt(&seq_files[0], &Some(sbwt_build_options)); info!("Serializing SBWT index..."); sablast::index::serialize_sbwt(&output_prefix.as_ref().unwrap(), &sbwt, &lcs.as_ref().unwrap()); From 9e9343a99465f077f99b6940a517381ac4385caa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Mon, 23 Sep 2024 17:23:17 +0300 Subject: [PATCH 034/103] Rename build_sbwt to build_sbwt_from_file --- src/index.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/index.rs b/src/index.rs index 056ca6f..4d03db9 100644 --- a/src/index.rs +++ b/src/index.rs @@ -79,7 +79,7 @@ impl sbwt::SeqStream for FastxStreamer { /// # Examples /// TODO Add examples to build_sbwt documentation. /// -pub fn build_sbwt( +pub fn build_sbwt_from_file( infile: &str, build_options: &Option, ) -> (sbwt::SbwtIndex, Option) { From aa49d3488aea0ce83e173580b3d0b726cc344ae7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Mon, 23 Sep 2024 17:23:39 +0300 Subject: [PATCH 035/103] Add build_sbwt_from_vecs to build from sequences in memory. --- src/index.rs | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/src/index.rs b/src/index.rs index 4d03db9..6100253 100644 --- a/src/index.rs +++ b/src/index.rs @@ -105,6 +105,48 @@ pub fn build_sbwt_from_file( (sbwt, lcs) } +/// Builds an SBWT index and its LCS array from sequences in memory. +/// +/// Passes all character sequences in `slices` to the SBWT API calls +/// to build the SBWT index and LCS array. Use the [BuildOpts] +/// argument `build_options` to control the options and resources +/// passed to the index builder. +/// +/// Note this function considers all data in `slices` as belonging to +/// the same sequence, meaning that only one index will be built. +/// +/// Returns a tuple containing the SBWT index and the LCS array. +/// +/// Requires write access to some temporary directory. Path can be set +/// using temp_dir in BuildOpts; defaults to $TMPDIR on Unix if not set. +/// +/// # Examples +/// TODO Add examples to build_sbwt documentation. +/// +pub fn build_sbwt_from_vecs( + slices: &[Vec], + build_options: &Option, +) -> (sbwt::SbwtIndexVariant, Option) { + // Get temp dir path from build_options, otherwise use whatever std::env::temp_dir() returns + let temp_dir = build_options.as_ref().unwrap().temp_dir.clone().unwrap_or(std::env::temp_dir().to_str().unwrap().to_string()); + + let algorithm = BitPackedKmerSorting::new() + .mem_gb(build_options.as_ref().unwrap().mem_gb) + .dedup_batches(false) + .temp_dir(PathBuf::from(OsString::from(temp_dir)).as_path()); + + let (sbwt, lcs) = SbwtIndexBuilder::new() + .k(build_options.as_ref().unwrap().k) + .n_threads(build_options.as_ref().unwrap().num_threads) + .add_rev_comp(build_options.as_ref().unwrap().add_revcomp) + .algorithm(algorithm) + .build_lcs(true) + .precalc_length(build_options.as_ref().unwrap().prefix_precalc) + .run_from_vecs(slices); + + (SbwtIndexVariant::SubsetMatrix(sbwt), lcs) +} + pub fn serialize_sbwt( outfile_prefix: &str, sbwt: &sbwt::SbwtIndex, From 90d0c37c6826e266e8a2f8ba612b07b14936fa5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Mon, 23 Sep 2024 17:24:35 +0300 Subject: [PATCH 036/103] Always use sbwt::IndexVariant outside of index.rs. --- src/index.rs | 20 ++++++++++++-------- src/main.rs | 2 +- tests/map_clbs.rs | 4 ++-- 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/src/index.rs b/src/index.rs index 6100253..83cf235 100644 --- a/src/index.rs +++ b/src/index.rs @@ -82,7 +82,7 @@ impl sbwt::SeqStream for FastxStreamer { pub fn build_sbwt_from_file( infile: &str, build_options: &Option, -) -> (sbwt::SbwtIndex, Option) { +) -> (sbwt::SbwtIndexVariant, Option) { // Get temp dir path from build_options, otherwise use whatever std::env::temp_dir() returns let temp_dir = build_options.as_ref().unwrap().temp_dir.clone().unwrap_or(std::env::temp_dir().to_str().unwrap().to_string()); @@ -102,7 +102,7 @@ pub fn build_sbwt_from_file( .precalc_length(build_options.as_ref().unwrap().prefix_precalc) .run(reader); - (sbwt, lcs) + (SbwtIndexVariant::SubsetMatrix(sbwt), lcs) } /// Builds an SBWT index and its LCS array from sequences in memory. @@ -149,7 +149,7 @@ pub fn build_sbwt_from_vecs( pub fn serialize_sbwt( outfile_prefix: &str, - sbwt: &sbwt::SbwtIndex, + sbwt: &sbwt::SbwtIndexVariant, lcs: &sbwt::LcsArray, ) { let sbwt_outfile = format!("{}.sbwt", outfile_prefix); @@ -160,7 +160,11 @@ pub fn serialize_sbwt( let mut sbwt_out = std::io::BufWriter::new(sbwt_conn); sbwt_out.write_all(&(b"SubsetMatrix".len() as u64).to_le_bytes()).expect("Serialized SBWT header part 1."); sbwt_out.write_all(b"SubsetMatrix").expect("Serialized SBWT header part 2."); - sbwt.serialize(&mut sbwt_out).expect("Serialized SBWT index."); + match sbwt { + SbwtIndexVariant::SubsetMatrix(index) => { + index.serialize(&mut sbwt_out).expect("Serialized SBWT index."); + }, + }; // Write lcs array let lcs_conn = std::fs::File::create(&lcs_outfile).unwrap_or_else(|_| panic!("Expected write access to {}", lcs_outfile)); @@ -214,12 +218,12 @@ pub fn load_sbwt( /// pub fn query_sbwt( query: &[u8], - index: &sbwt::SbwtIndexVariant, + sbwt: &sbwt::SbwtIndexVariant, lcs: &sbwt::LcsArray, ) -> Vec { - let ms = match index { - SbwtIndexVariant::SubsetMatrix(sbwt) => { - let streaming_index = sbwt::StreamingIndex::new(sbwt, lcs); + let ms = match sbwt { + SbwtIndexVariant::SubsetMatrix(index) => { + let streaming_index = sbwt::StreamingIndex::new(index, lcs); streaming_index.matching_statistics(query) }, }; diff --git a/src/main.rs b/src/main.rs index db849b9..10b864d 100644 --- a/src/main.rs +++ b/src/main.rs @@ -56,7 +56,7 @@ fn main() { // TODO Handle multiple inputs in sablast build info!("Building SBWT index..."); - let (sbwt, lcs) = sablast::index::build_sbwt(&seq_files[0], &Some(sbwt_build_options)); + let (sbwt, lcs) = sablast::index::build_sbwt_from_file(&seq_files[0], &Some(sbwt_build_options)); info!("Serializing SBWT index..."); sablast::index::serialize_sbwt(&output_prefix.as_ref().unwrap(), &sbwt, &lcs.as_ref().unwrap()); diff --git a/tests/map_clbs.rs b/tests/map_clbs.rs index 567497f..127296e 100644 --- a/tests/map_clbs.rs +++ b/tests/map_clbs.rs @@ -13,10 +13,10 @@ // #[test] fn map_nissle_against_clbs() { - let (sbwt, lcs) = sablast::index::build_sbwt(&"tests/data/clbS.fna.gz".to_string(), &None); + let (sbwt, lcs) = sablast::index::build_sbwt_from_file(&"tests/data/clbS.fna.gz".to_string(), &None); let expected = vec![(455, 967, 512, 1, '+')]; - let aln = sablast::map(&"tests/data/NZ_CP058217.1_clbS.fna.gz".to_string(), &sbwt::SbwtIndexVariant::SubsetMatrix(sbwt), &lcs.unwrap()); + let aln = sablast::map(&"tests/data/NZ_CP058217.1_clbS.fna.gz".to_string(), &sbwt, &lcs.unwrap()); let mut got: Vec<(usize, usize, usize, usize, char)> = sablast::format::run_lengths(&aln.0).iter().map(|x| (x.0, x.1, x.2, x.3, '+')).collect(); let mut run_lengths_rev: Vec<(usize, usize, usize, usize, char)> = sablast::format::run_lengths(&aln.1).iter().map(|x| (x.0, x.1, x.2, x.3, '-')).collect(); From 04e5ad55c2b382f4f13364d8deeb95d6fb0f3b20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Mon, 23 Sep 2024 17:29:00 +0300 Subject: [PATCH 037/103] Unwrap the lcs array when returning. --- src/index.rs | 8 ++++---- src/main.rs | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/index.rs b/src/index.rs index 83cf235..7e9eebb 100644 --- a/src/index.rs +++ b/src/index.rs @@ -82,7 +82,7 @@ impl sbwt::SeqStream for FastxStreamer { pub fn build_sbwt_from_file( infile: &str, build_options: &Option, -) -> (sbwt::SbwtIndexVariant, Option) { +) -> (sbwt::SbwtIndexVariant, sbwt::LcsArray) { // Get temp dir path from build_options, otherwise use whatever std::env::temp_dir() returns let temp_dir = build_options.as_ref().unwrap().temp_dir.clone().unwrap_or(std::env::temp_dir().to_str().unwrap().to_string()); @@ -102,7 +102,7 @@ pub fn build_sbwt_from_file( .precalc_length(build_options.as_ref().unwrap().prefix_precalc) .run(reader); - (SbwtIndexVariant::SubsetMatrix(sbwt), lcs) + (SbwtIndexVariant::SubsetMatrix(sbwt), lcs.unwrap()) } /// Builds an SBWT index and its LCS array from sequences in memory. @@ -126,7 +126,7 @@ pub fn build_sbwt_from_file( pub fn build_sbwt_from_vecs( slices: &[Vec], build_options: &Option, -) -> (sbwt::SbwtIndexVariant, Option) { +) -> (sbwt::SbwtIndexVariant, sbwt::LcsArray) { // Get temp dir path from build_options, otherwise use whatever std::env::temp_dir() returns let temp_dir = build_options.as_ref().unwrap().temp_dir.clone().unwrap_or(std::env::temp_dir().to_str().unwrap().to_string()); @@ -144,7 +144,7 @@ pub fn build_sbwt_from_vecs( .precalc_length(build_options.as_ref().unwrap().prefix_precalc) .run_from_vecs(slices); - (SbwtIndexVariant::SubsetMatrix(sbwt), lcs) + (SbwtIndexVariant::SubsetMatrix(sbwt), lcs.unwrap()) } pub fn serialize_sbwt( diff --git a/src/main.rs b/src/main.rs index 10b864d..164ff01 100644 --- a/src/main.rs +++ b/src/main.rs @@ -59,7 +59,7 @@ fn main() { let (sbwt, lcs) = sablast::index::build_sbwt_from_file(&seq_files[0], &Some(sbwt_build_options)); info!("Serializing SBWT index..."); - sablast::index::serialize_sbwt(&output_prefix.as_ref().unwrap(), &sbwt, &lcs.as_ref().unwrap()); + sablast::index::serialize_sbwt(&output_prefix.as_ref().unwrap(), &sbwt, &lcs); }, Some(cli::Commands::Map { From 5ac3fc47204276b451064cb5c2adc6c31d5652f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Mon, 23 Sep 2024 17:34:26 +0300 Subject: [PATCH 038/103] Update calls to index:: --- tests/map_clbs.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/map_clbs.rs b/tests/map_clbs.rs index 127296e..25a392c 100644 --- a/tests/map_clbs.rs +++ b/tests/map_clbs.rs @@ -16,7 +16,7 @@ fn map_nissle_against_clbs() { let (sbwt, lcs) = sablast::index::build_sbwt_from_file(&"tests/data/clbS.fna.gz".to_string(), &None); let expected = vec![(455, 967, 512, 1, '+')]; - let aln = sablast::map(&"tests/data/NZ_CP058217.1_clbS.fna.gz".to_string(), &sbwt, &lcs.unwrap()); + let aln = sablast::map(&"tests/data/NZ_CP058217.1_clbS.fna.gz".to_string(), &sbwt, &lcs); let mut got: Vec<(usize, usize, usize, usize, char)> = sablast::format::run_lengths(&aln.0).iter().map(|x| (x.0, x.1, x.2, x.3, '+')).collect(); let mut run_lengths_rev: Vec<(usize, usize, usize, usize, char)> = sablast::format::run_lengths(&aln.1).iter().map(|x| (x.0, x.1, x.2, x.3, '-')).collect(); From 80ead9b139d45414b0864fb6568d98a304c1dc81 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Mon, 23 Sep 2024 17:38:44 +0300 Subject: [PATCH 039/103] Document index.rs with examples. --- src/index.rs | 99 +++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 94 insertions(+), 5 deletions(-) diff --git a/src/index.rs b/src/index.rs index 7e9eebb..453a46c 100644 --- a/src/index.rs +++ b/src/index.rs @@ -11,6 +11,7 @@ // the MIT license, or , // at your option. // +//! Wrapper for using the [sbwt](https://docs.rs/sbwt) API to build and query SBWT indexes. use std::ffi::OsString; use std::io::Write; use std::path::PathBuf; @@ -20,7 +21,28 @@ use sbwt::BitPackedKmerSorting; use sbwt::SbwtIndexBuilder; use sbwt::SbwtIndexVariant; -// Parameters for SBWT construction +/// Controls the parameters and resources available to the SBWT construction algorithm. +/// +/// Used to specify values for: +/// - _k_-mer size `k`. +/// - Whether to reverse complement inputs or not `add_revcomp`. +/// - Number of threads `num_threads` to use. +/// - Amount of RAM available (in GB) before resorting to temporary disk space `mem_gb`. +/// - Temporary directory path `temp_dir`. +/// - Size of the precalculated lookup table stored in the index `prefix_precalc`. +/// +/// Implements [BuildOpts::default] with these values: +/// ```rust +/// sablast::index::BuildOpts { +/// k: 31, +/// add_revcomp: false, +/// num_threads: 1, +/// mem_gb: 4, +/// prefix_precalc: 8, +/// temp_dir: None, +/// }; +/// ``` +/// #[derive(Clone)] pub struct BuildOpts { pub k: usize, @@ -77,7 +99,15 @@ impl sbwt::SeqStream for FastxStreamer { /// using temp_dir in BuildOpts; defaults to $TMPDIR on Unix if not set. /// /// # Examples -/// TODO Add examples to build_sbwt documentation. +/// ```rust +/// use sablast::index::*; +/// +/// // Inputs +/// let reference_file = "tests/data/clbS.fna.gz"; +/// +/// // Build the SBWT +/// let (sbwt, lcs) = build_sbwt_from_file(&reference_file, &Some(BuildOpts::default())); +/// ``` /// pub fn build_sbwt_from_file( infile: &str, @@ -121,7 +151,15 @@ pub fn build_sbwt_from_file( /// using temp_dir in BuildOpts; defaults to $TMPDIR on Unix if not set. /// /// # Examples -/// TODO Add examples to build_sbwt documentation. +/// ```rust +/// use sablast::index::*; +/// +/// // Inputs +/// let reference: Vec> = vec![vec![b'A',b'A',b'A',b'G',b'A',b'A',b'C',b'C',b'A',b'-',b'T',b'C',b'A',b'G',b'G',b'G',b'C',b'G']]; +/// +/// // Build the SBWT +/// let (sbwt, lcs) = build_sbwt_from_vecs(&reference, &Some(BuildOpts{ k: 3, ..Default::default() })); +/// ``` /// pub fn build_sbwt_from_vecs( slices: &[Vec], @@ -147,6 +185,30 @@ pub fn build_sbwt_from_vecs( (SbwtIndexVariant::SubsetMatrix(sbwt), lcs.unwrap()) } +/// Writes an SBWT index and its LCS array to disk. +/// +/// Creates the files `outfile_prefix` + ".sbwt" and `outfile_prefix` + +/// ".lcs" to store the SBWT index `sbwt` and the LCS array `lcs`. +/// +/// Panics if the output files cannot be created with +/// std::fs::File::create or are not writable by +/// std::io::BufWriter::new. +/// +/// # Examples +/// ```rust +/// use sablast::index::*; +/// +/// // Inputs +/// let reference: Vec> = vec![vec![b'A',b'A',b'A',b'G',b'A',b'A',b'C',b'C',b'A',b'-',b'T',b'C',b'A',b'G',b'G',b'G',b'C',b'G']]; +/// +/// // Build the SBWT +/// let (sbwt, lcs) = build_sbwt_from_vecs(&reference, &Some(BuildOpts{ k: 3, ..Default::default() })); +/// +/// // Serialize the sbwt to $TMPDIR/serialized_index +/// let index_prefix = std::env::temp_dir().to_str().unwrap().to_owned() + "/serialized_index"; +/// serialize_sbwt(&index_prefix, &sbwt, &lcs); +/// ``` +/// pub fn serialize_sbwt( outfile_prefix: &str, sbwt: &sbwt::SbwtIndexVariant, @@ -184,7 +246,22 @@ pub fn serialize_sbwt( /// std::fs::File::open. /// /// # Examples -/// TODO Add examples to load_sbwt documentation. +/// ```rust +/// use sablast::index::*; +/// +/// // Inputs +/// let reference: Vec> = vec![vec![b'A',b'A',b'A',b'G',b'A',b'A',b'C',b'C',b'A',b'-',b'T',b'C',b'A',b'G',b'G',b'G',b'C',b'G']]; +/// +/// // Build the SBWT +/// let (sbwt, lcs) = build_sbwt_from_vecs(&reference, &Some(BuildOpts{ k: 3, ..Default::default() })); +/// +/// // Serialize the sbwt to $TMPDIR/serialized_index +/// let index_prefix = std::env::temp_dir().to_str().unwrap().to_owned() + "/serialized_index"; +/// serialize_sbwt(&index_prefix, &sbwt, &lcs); +/// +/// // Load index +/// let (sbwt_loaded, lcs_loaded) = load_sbwt(&index_prefix); +/// ``` /// pub fn load_sbwt( index_prefix: &str, @@ -214,7 +291,19 @@ pub fn load_sbwt( /// the position of each element in the query. /// /// # Examples -/// TODO Add examples to query_sbwt documentation +/// ```rust +/// use sablast::index::*; +/// +/// // Inputs +/// let reference: Vec> = vec![vec![b'A',b'A',b'A',b'G',b'A',b'A',b'C',b'C',b'A',b'-',b'T',b'C',b'A',b'G',b'G',b'G',b'C',b'G']]; +/// let query: Vec = vec![b'C',b'A',b'A',b'G',b'C',b'C',b'A',b'C',b'T',b'C',b'A',b'T',b'T',b'G',b'G',b'G',b'T',b'C']; +/// +/// // Build the SBWT +/// let (sbwt, lcs) = build_sbwt_from_vecs(&reference, &Some(BuildOpts{ k: 3, ..Default::default() })); +/// +/// // Run query +/// let ms = query_sbwt(&query, &sbwt, &lcs); +/// ``` /// pub fn query_sbwt( query: &[u8], From 1f2eb2929e83c3b7e929d213f3ab4a4f910b10ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Mon, 23 Sep 2024 17:53:11 +0300 Subject: [PATCH 040/103] Add tests. --- src/index.rs | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/src/index.rs b/src/index.rs index 453a46c..8dc43de 100644 --- a/src/index.rs +++ b/src/index.rs @@ -318,3 +318,44 @@ pub fn query_sbwt( }; ms.iter().map(|x| x.0).collect() } + +//////////////////////////////////////////////////////////////////////////////// +// Tests +// +#[cfg(test)] +mod tests { + #[test] + fn build_and_query_sbwt() { + let reference: Vec> = vec![vec![b'A',b'A',b'A',b'G',b'A',b'A',b'C',b'C',b'A',b'-',b'T',b'C',b'A',b'G',b'G',b'G',b'C',b'G']]; + let query: Vec = vec![b'C',b'A',b'A',b'G',b'C',b'C',b'A',b'C',b'T',b'C',b'A',b'T',b'T',b'G',b'G',b'G',b'T',b'C']; + + let (sbwt, lcs) = super::build_sbwt_from_vecs(&reference, &Some(super::BuildOpts{ k: 3, ..Default::default() })); + + let expected = vec![1,2,2,3,2,2,3,2,1,2,3,1,1,1,2,3,1,2]; + let got = super::query_sbwt(&query, &sbwt, &lcs); + + assert_eq!(got, expected); + } + + #[test] + fn build_serialize_load_sbwt() { + let reference: Vec> = vec![vec![b'A',b'A',b'A',b'G',b'A',b'A',b'C',b'C',b'A',b'-',b'T',b'C',b'A',b'G',b'G',b'G',b'C',b'G']]; + let (sbwt, lcs) = super::build_sbwt_from_vecs(&reference, &Some(super::BuildOpts{ k: 3, ..Default::default() })); + + let index_prefix = std::env::temp_dir().to_str().unwrap().to_owned() + "/serialized_index"; + super::serialize_sbwt(&index_prefix, &sbwt, &lcs); + + let (sbwt_loaded, lcs_loaded) = super::load_sbwt(&index_prefix); + + assert_eq!(lcs, lcs_loaded); + match sbwt { + sbwt::SbwtIndexVariant::SubsetMatrix(ref index) => { + match sbwt { + sbwt::SbwtIndexVariant::SubsetMatrix(ref index_loaded) => { + assert_eq!(index, index_loaded); + }, + }; + }, + }; + } +} From cbef3f546542beba4ed4a10ac0b4d333e9c65c01 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Mon, 23 Sep 2024 18:10:00 +0300 Subject: [PATCH 041/103] Fix calling fns that take Some(BuildOpts) with None. --- src/index.rs | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/src/index.rs b/src/index.rs index 8dc43de..db34de7 100644 --- a/src/index.rs +++ b/src/index.rs @@ -114,22 +114,23 @@ pub fn build_sbwt_from_file( build_options: &Option, ) -> (sbwt::SbwtIndexVariant, sbwt::LcsArray) { // Get temp dir path from build_options, otherwise use whatever std::env::temp_dir() returns - let temp_dir = build_options.as_ref().unwrap().temp_dir.clone().unwrap_or(std::env::temp_dir().to_str().unwrap().to_string()); + let build_opts = if build_options.is_some() { build_options.clone().unwrap() } else { BuildOpts::default() }; + let temp_dir = if build_opts.temp_dir.is_some() { build_opts.temp_dir.unwrap() } else { std::env::temp_dir().to_str().unwrap().to_string() }; let algorithm = BitPackedKmerSorting::new() - .mem_gb(build_options.as_ref().unwrap().mem_gb) + .mem_gb(build_opts.mem_gb) .dedup_batches(false) .temp_dir(PathBuf::from(OsString::from(temp_dir)).as_path()); let reader = FastxStreamer{inner: needletail::parse_fastx_file(infile).expect("valid path/file"), record: Vec::new()}; let (sbwt, lcs) = SbwtIndexBuilder::new() - .k(build_options.as_ref().unwrap().k) - .n_threads(build_options.as_ref().unwrap().num_threads) - .add_rev_comp(build_options.as_ref().unwrap().add_revcomp) + .k(build_opts.k) + .n_threads(build_opts.num_threads) + .add_rev_comp(build_opts.add_revcomp) .algorithm(algorithm) .build_lcs(true) - .precalc_length(build_options.as_ref().unwrap().prefix_precalc) + .precalc_length(build_opts.prefix_precalc) .run(reader); (SbwtIndexVariant::SubsetMatrix(sbwt), lcs.unwrap()) @@ -165,21 +166,21 @@ pub fn build_sbwt_from_vecs( slices: &[Vec], build_options: &Option, ) -> (sbwt::SbwtIndexVariant, sbwt::LcsArray) { - // Get temp dir path from build_options, otherwise use whatever std::env::temp_dir() returns - let temp_dir = build_options.as_ref().unwrap().temp_dir.clone().unwrap_or(std::env::temp_dir().to_str().unwrap().to_string()); + let build_opts = if build_options.is_some() { build_options.clone().unwrap() } else { BuildOpts::default() }; + let temp_dir = if build_opts.temp_dir.is_some() { build_opts.temp_dir.unwrap() } else { std::env::temp_dir().to_str().unwrap().to_string() }; let algorithm = BitPackedKmerSorting::new() - .mem_gb(build_options.as_ref().unwrap().mem_gb) + .mem_gb(build_opts.mem_gb) .dedup_batches(false) .temp_dir(PathBuf::from(OsString::from(temp_dir)).as_path()); let (sbwt, lcs) = SbwtIndexBuilder::new() - .k(build_options.as_ref().unwrap().k) - .n_threads(build_options.as_ref().unwrap().num_threads) - .add_rev_comp(build_options.as_ref().unwrap().add_revcomp) + .k(build_opts.k) + .n_threads(build_opts.num_threads) + .add_rev_comp(build_opts.add_revcomp) .algorithm(algorithm) .build_lcs(true) - .precalc_length(build_options.as_ref().unwrap().prefix_precalc) + .precalc_length(build_opts.prefix_precalc) .run_from_vecs(slices); (SbwtIndexVariant::SubsetMatrix(sbwt), lcs.unwrap()) From a33a0fba8931aa9160f6c356939a469a7c88ea3a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Mon, 23 Sep 2024 18:10:41 +0300 Subject: [PATCH 042/103] Fix failing test for translate_ms_vec. --- src/translate.rs | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/src/translate.rs b/src/translate.rs index 28acb77..4a2a162 100644 --- a/src/translate.rs +++ b/src/translate.rs @@ -108,11 +108,14 @@ pub fn translate_ms_vec( let curr: i64 = derand_ms[pos]; let next: i64 = if pos < len - 1 { derand_ms[pos + 1] } else { derand_ms[pos] }; - let (aln_curr, aln_next) = translate_ms_val(curr, next, prev, threshold); - - res[pos] = aln_curr; - if pos + 1 < len - 1 && aln_next != ' ' { - res[pos + 1] = aln_next; + // Two consecutive 'R's mean this pos was already set by the previous iteration + if !(pos > 1 && res[pos - 1] == 'R' && res[pos] == 'R') { + let (aln_curr, aln_next) = translate_ms_val(curr, next, prev, threshold); + + res[pos] = aln_curr; + if pos + 1 < len - 1 && aln_next != ' ' { + res[pos + 1] = aln_next; + } } } @@ -218,13 +221,13 @@ mod tests { fn translate_ms_vec() { // Parameters : k = 3, threshold = 2 // TODO check the k-mers - // Ref sequence : A,A,A,G,A,A,C,C,A,-,T,C,A, -,-,G,G,G,C,G - // Query sequence : C,A,A,G,-,-,C,C,A,C,T,C,A, T,T,G,G,G,T,C + // Ref sequence : A,A,A,G,A,A,C,C,A,-,T,C,A, -,-,G,G,G, C,G + // Query sequence : C,A,A,G,-,-,C,C,A,C,T,C,A, T,T,G,G,G, T,C // - // Result MS vector : 0,1,2,3, 1,2,3,0,1,2,3,-1,0,1,2,3,0,1 - // Expected output : X,M,M,R, R,M,M,X,M,M,M, -,-,M,M,M,-,- + // Result MS vector : 0,1,2,3, 1,2,3,0,1,2,3,-1,0,1,2,3,-1,0 + // Expected output : X,M,M,R, R,M,M,X,M,M,M, -,-,M,M,M, -,- - let input: Vec = vec![0,1,2,3,1,2,3,0,1,2,3,-1,0,1,2,3,0,1]; + let input: Vec = vec![0,1,2,3,1,2,3,0,1,2,3,-1,0,1,2,3,-1,0]; let expected: Vec = vec!['X','M','M','R','R','M','M','X','M','M','M','-','-','M','M','M','-','-']; let got = super::translate_ms_vec(&input, 3, 2); From dc0ef3d239f68185fcd2e94590be7a1ab064e9c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Mon, 23 Sep 2024 18:11:18 +0300 Subject: [PATCH 043/103] Clear todo in translate_ms_vec test. --- src/translate.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/translate.rs b/src/translate.rs index 4a2a162..a6d7932 100644 --- a/src/translate.rs +++ b/src/translate.rs @@ -220,7 +220,6 @@ mod tests { #[test] fn translate_ms_vec() { // Parameters : k = 3, threshold = 2 - // TODO check the k-mers // Ref sequence : A,A,A,G,A,A,C,C,A,-,T,C,A, -,-,G,G,G, C,G // Query sequence : C,A,A,G,-,-,C,C,A,C,T,C,A, T,T,G,G,G, T,C // From 96207762652d06f0e33b65be896121bc3431002b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Mon, 23 Sep 2024 18:12:58 +0300 Subject: [PATCH 044/103] Add documentation examples to tests. --- .github/workflows/build_and_test.yml | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index bbadcaf..4af30f4 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -21,10 +21,14 @@ jobs: steps: - uses: actions/checkout@v4 - - name: Setup + - name: Setup toolchain run: rustup update ${{ matrix.toolchain }} && rustup default ${{ matrix.toolchain }} - - name: Build + + - name: Build binary run: cargo build --verbose - - name: Run tests + - name: Run unit and integration tests run: cargo test --verbose + + - name: Run documenation examples as tests + run: cargo test --doc --verbose From fef4ebe3682c6acb0de9da025622296dd3181923 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Mon, 23 Sep 2024 19:08:44 +0300 Subject: [PATCH 045/103] Add input checks to build_sbwt_from_vecs and query_sbwt. --- src/index.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/index.rs b/src/index.rs index db34de7..a93b6d5 100644 --- a/src/index.rs +++ b/src/index.rs @@ -166,6 +166,8 @@ pub fn build_sbwt_from_vecs( slices: &[Vec], build_options: &Option, ) -> (sbwt::SbwtIndexVariant, sbwt::LcsArray) { + assert!(slices.len() > 0); + let build_opts = if build_options.is_some() { build_options.clone().unwrap() } else { BuildOpts::default() }; let temp_dir = if build_opts.temp_dir.is_some() { build_opts.temp_dir.unwrap() } else { std::env::temp_dir().to_str().unwrap().to_string() }; @@ -311,6 +313,7 @@ pub fn query_sbwt( sbwt: &sbwt::SbwtIndexVariant, lcs: &sbwt::LcsArray, ) -> Vec { + assert!(query.len() > 0); let ms = match sbwt { SbwtIndexVariant::SubsetMatrix(index) => { let streaming_index = sbwt::StreamingIndex::new(index, lcs); From 347bc5023eaca215e1a63cb6f06030ae594ef7dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Mon, 23 Sep 2024 19:34:28 +0300 Subject: [PATCH 046/103] Add examples to translate.rs documentation. --- src/translate.rs | 111 +++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 107 insertions(+), 4 deletions(-) diff --git a/src/translate.rs b/src/translate.rs index a6d7932..77073c1 100644 --- a/src/translate.rs +++ b/src/translate.rs @@ -23,8 +23,6 @@ //! 'R'. This implies either a deletion of unknown length in the query, //! or insertion of _k_-mers from elsewhere in the reference into the query. //! -//! ## Translation algorithm for _k_-bounded matching statistics -//! TODO Describe how the different MS vectors translate into alignments. /// Translates a single derandomized _k_-bounded matching statistic. /// @@ -39,7 +37,101 @@ /// affect its value. /// /// # Examples -/// TODO add examples to translate_ms_val +/// ## Query with only matches +/// ```rust +/// use sablast::translate::translate_ms_val; +/// +/// // Parameters : k = 3, threshold = 2 +/// // +/// // Ref sequence : A,C,G,C,A,G +/// // Query sequence : A,C,G,C,A,G +/// // +/// // Result MS vector : 1,2,3,3,3,3 +/// // Testing this pos : | +/// // Expected output : M,M,M,M,M,M +/// +/// translate_ms_val(1, 2, 3, 2); +/// ``` +/// +/// ## Query with a single mismatch +/// ```rust +/// use sablast::translate::translate_ms_val; +/// +/// // Parameters : k = 3, threshold = 2 +/// // +/// // Ref sequence : A,C,G,T,C,A,G +/// // Query sequence : A,C,G,C,C,A,G +/// // +/// // Result MS vector : 1,2,3,0,1,2,3 +/// // Testing this pos : | +/// // Expected output : M,M,M,X,M,M,M +/// +/// translate_ms_val(0, 1, 3, 2); +/// ``` +/// +/// ## Query with a single insertion: +/// ```rust +/// use sablast::translate::translate_ms_val; +/// +/// // Ref sequence : A,C,G,-,C,A,G +/// // Query sequence : A,C,G,C,C,A,G +/// // +/// // Result MS vector : 1,2,3,0,1,2,3 +/// // Testing this pos : | +/// // Expected output : M,M,M,X,M,M,M +/// +/// translate_ms_val(0, 1, 3, 2); +/// ``` +/// +/// Note that this case is identical to the query with a single +/// mismatch. These two are indistinguishible based on the _k_-bounded +/// matching statistics alone although the input sequences are +/// different. +/// +/// ## Query with multiple insertions: +/// ```rust +/// use sablast::translate::translate_ms_val; +/// +/// // Ref sequence : A,C,G, -,-,C,A,G +/// // Query sequence : A,C,G, T,T,C,C,A,G +/// // +/// // Result MS vector : 1,2,3,-1,0,1,2,3 +/// // Testing this pos : | +/// // Expected output : M,M,M, -,-,M,M,M +/// +/// translate_ms_val(-1, 0, 3, 2); +/// ``` +/// ## Query with a deletion or recombination +/// ```rust +/// use sablast::translate::translate_ms_val; +/// +/// +/// // Parameters : k = 3, threshold = 2 +/// // +/// // Ref sequence : A,C,G,T,T,T,C,A,G +/// // Query sequence : A,C,G,-,-,-,C,A,G +/// // +/// // Result MS vector : 1,2,3,1,2,3 +/// // Testing this pos : | +/// // Expected output : M,M,R,R,M,M +/// +/// translate_ms_val(3, 1, 2, 2); +/// ``` +/// +/// Although in this case two characters have been deleted from the +/// query, if the missing region was longer the matching statistics +/// (MS) could also represent recombination of a sequence from +/// elsewhere in the query into the position preceding the three +/// consecutive T's in the reference. +/// +/// Recombinations and deletions are indistinguishable in the +/// _k_-bounded matching statistics alone but they can be solved by +/// comparing the MS vector with the reference and +/// query. Additionally, when a segment of reasonable length is +/// encapsulated by two consecutive R's on both the left and right +/// side, the region in between possibly originates from elsewhere in +/// the reference. +/// pub fn translate_ms_val( ms_curr: i64, ms_next: i64, @@ -88,7 +180,18 @@ pub fn translate_ms_val( /// underlying alignment. /// /// # Examples -/// TODO Add examples to translate_ms_vec documentation. +/// ```rust +/// use sablast::translate::translate_ms_vec; +/// +/// // Parameters : k = 3, threshold = 2 +/// // Ref sequence : A,A,A,G,A,A,C,C,A,-,T,C,A, -,-,G,G,G, C,G +/// // Query sequence : C,A,A,G,-,-,C,C,A,C,T,C,A, T,T,G,G,G, T,C +/// // Input MS : 0,1,2,3, 1,2,3,0,1,2,3,-1,0,1,2,3,-1,0 +/// // Expected output : X,M,M,R, R,M,M,X,M,M,M, -,-,M,M,M, -,- +/// +/// let input: Vec = vec![0,1,2,3,1,2,3,0,1,2,3,-1,0,1,2,3,-1,0]; +/// translate_ms_vec(&input, 3, 2); +/// ``` /// pub fn translate_ms_vec( derand_ms: &[i64], From 1b28adb23fe0066c90af3725cc1cfc7347a92d1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Mon, 23 Sep 2024 19:35:43 +0300 Subject: [PATCH 047/103] Always run all tests. --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 4af30f4..9fcc6c8 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -28,7 +28,7 @@ jobs: run: cargo build --verbose - name: Run unit and integration tests - run: cargo test --verbose + run: cargo test --no-fail-fast --verbose - name: Run documenation examples as tests run: cargo test --doc --verbose From 8fc1d73db4422b0e7aa8e1b716c2abe75177fffd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Mon, 23 Sep 2024 20:58:28 +0300 Subject: [PATCH 048/103] Add more documentation and examples. --- src/derandomize.rs | 73 +++++++++++++++++++++++++++++----------------- 1 file changed, 46 insertions(+), 27 deletions(-) diff --git a/src/derandomize.rs b/src/derandomize.rs index 97eb70c..99612b6 100644 --- a/src/derandomize.rs +++ b/src/derandomize.rs @@ -12,9 +12,6 @@ // at your option. // //! Derandomizing noisy _k_-bounded matching statistics. -//! -//! ## Derandomizing algorithm for noisy inputs -//! TODO Write details about how the derandomizing works. /// Evaluates the CDF of _k_-bounded matching statistics random match distribution. /// @@ -23,13 +20,20 @@ /// `alphabet_size` possible characters against an index containing /// `n_kmers` _k_-mers was generated by chance. /// +/// TODO Add formula to log_rm_max_cdf documentation +/// +/// Credit to Jarno N. Alanko for deriving the random match distribution. +/// /// # Examples -/// TODO Add examples to log_rm_max_cdf +/// ```rust +/// use sablast::derandomize::log_rm_max_cdf; /// -/// # Distribution of random matches in _k_-bounded matching statistics -/// TODO Add the maths +/// let alphabet_size = 4; +/// let n_kmers = 20240921; /// -/// Credit to Jarno N. Alanko for deriving the random match distribution. +/// let res = log_rm_max_cdf(10, alphabet_size, n_kmers); +/// // `res` is -4.825812199808644 +/// ``` /// pub fn log_rm_max_cdf( t: usize, @@ -55,8 +59,17 @@ pub fn log_rm_max_cdf( /// If no MS value passes the check, the function returns `k` instead. /// /// # Examples -/// TODO Add examples to random_match_threshold documentation +/// ```rust +/// use sablast::derandomize::random_match_threshold; +/// +/// let k = 31; +/// let n_kmers = 20240921; +/// let alphabet_size = 4; +/// let max_error_prob = 0.01_f64; /// +/// let threshold = random_match_threshold(k, n_kmers, alphabet_size, max_error_prob); +/// // `threshold` is 15 +/// ``` pub fn random_match_threshold( k: usize, n_kmers: usize, @@ -79,11 +92,11 @@ pub fn random_match_threshold( /// Derandomizes a single noisy _k_-bounded matching statistic. /// -/// Derandomizes the `current_ms` matching statistic (MS) based on the -/// `next_run` value obtained from the output of this function for the -/// next noisy MS when read left-to-right, the _k_-mer size `k`, and -/// the `threshold` which specifies a lower bound to consider the MS a -/// non-random match. +/// Derandomizes the `current_noisy_ms` matching statistic (MS) based +/// on the `next_derand_ms` value obtained from the output of this +/// function for the next noisy MS when read left-to-right, the +/// _k_-mer size `k`, and the `threshold` which specifies a lower +/// bound to consider the MS a non-random match. /// /// Positive values of the output i64 value mean that i64 characters /// from the beginning of the k-mer match the reference, ie. same as @@ -130,28 +143,36 @@ pub fn derandomize_ms_val( /// bound `threshold` was calculated for. /// /// # Examples -/// TODO Add examples to derandomize_ms documentation +/// ```rust +/// use sablast::derandomize::derandomize_ms_vec; +/// +/// let k = 3; +/// let threshold = 2; +/// let noisy_ms = vec![1,2,2,3,2,2,3,2,1,2,3,1,1,1,2,3,1,2]; +/// +/// let derand_ms = derandomize_ms_vec(&noisy_ms, k, threshold); +/// // `derand_ms` has [0,1,2,3,1,2,3,0,1,2,3,-1,0,1,2,3,-1,0] +/// ``` /// pub fn derandomize_ms_vec( - ms: &[usize], + noisy_ms: &[usize], k: usize, threshold: usize, ) -> Vec { assert!(k > 0); assert!(threshold > 1); - assert!(ms.len() > 2); + assert!(noisy_ms.len() > 2); - let len = ms.len(); - - let mut runs: Vec = vec![0; len]; + let len = noisy_ms.len(); + let mut derand_ms: Vec = vec![0; len]; // Traverse the matching statistics in reverse. - runs[len - 1] = if ms[len - 1] > threshold { ms[len - 1]} else { 0 } as i64; + derand_ms[len - 1] = if noisy_ms[len - 1] > threshold { noisy_ms[len - 1]} else { 0 } as i64; for i in 2..len { - runs[len - i] = derandomize_ms_val(ms[len - i], runs[len - i + 1], threshold, k); + derand_ms[len - i] = derandomize_ms_val(noisy_ms[len - i], derand_ms[len - i + 1], threshold, k); } - return runs; + derand_ms } //////////////////////////////////////////////////////////////////////////////// @@ -182,14 +203,12 @@ mod tests { // TODO Test cases for derandomize_ms_val - // TODO Test cases for run_to_aln - #[test] fn derandomize_ms_vec() { - let input = vec![1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,11,11,12,11,10,11,11,12,11,12,10,11,12,12,10,11,11,11,11,11,11,10,11,11,12,13,11,12,13,14,15,16,13,14,15,16,12,12,13,14,15,16,17,18,19,20,21,22,12,10,10,11,12,11,10,11,12,11,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,13,14,15,12,12,10,11,11,11,12,13,13,14,15,11,11,11,11,11,11,12,13,14,11,11,11,11,12,13,12,12,12,12,13,12,13,14,12,13,11,12,12,11,12,11,12,13,14,14,13,14,15,15,16,17,18,19,19,19,20,21,22,12,13,11,11,12,12,13,14,15,16,17,18,19,20,21,22,10,11,9,10,10,11,11,12,11,11,12,13,13,14,12,11,11,12,13,12,13,12,12,12,12,13,11,12,12,10,11,11,10,11,11,12,10,9,10,10,10,11,12,10,9,10,10,10,11,10,11,12,10,8,9,10,9,9,10,9,10,10,10,11,12,13,14,15,16,17,13,11,11,11,12,11,11,12,12,11,11,12,12,13,14,15,11,12,10,11,9,10,11,11,11,11,11,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,11,12,12,13,11,11,12,13,14,13,11,11,12,13,14,15,16,17,18,19,20,21,11,12,11,11,12,11,12,12,12,12,11,10,11,12,11,11,12,13,12,12,11,12,13,13,13,11,11,12,11,12,13,12,13,14,15,16,17,18,19,20,21,11,12,13,9,10,11,10,10,10,11,12,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27]; - let expected = vec![0,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,-47,-46,-45,-44,-43,-42,-41,-40,-39,-38,-37,-36,-35,-34,-33,-32,-31,-30,-29,-28,-27,-26,-25,-24,-23,-22,-21,-20,-19,-18,-17,-16,-15,-14,-13,-12,-11,-10,-9,-8,-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,-164,-163,-162,-161,-160,-159,-158,-157,-156,-155,-154,-153,-152,-151,-150,-149,-148,-147,-146,-145,-144,-143,-142,-141,-140,-139,-138,-137,-136,-135,-134,-133,-132,-131,-130,-129,-128,-127,-126,-125,-124,-123,-122,-121,-120,-119,-118,-117,-116,-115,-114,-113,-112,-111,-110,-109,-108,-107,-106,-105,-104,-103,-102,-101,-100,-99,-98,-97,-96,-95,-94,-93,-92,-91,-90,-89,-88,-87,-86,-85,-84,-83,-82,-81,-80,-79,-78,-77,-76,-75,-74,-73,-72,-71,-70,-69,-68,-67,-66,-65,-64,-63,-62,-61,-60,-59,-58,-57,-56,-55,-54,-53,-52,-51,-50,-49,-48,-47,-46,-45,-44,-43,-42,-41,-40,-39,-38,-37,-36,-35,-34,-33,-32,-31,-30,-29,-28,-27,-26,-25,-24,-23,-22,-21,-20,-19,-18,-17,-16,-15,-14,-13,-12,-11,-10,-9,-8,-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,-63,-62,-61,-60,-59,-58,-57,-56,-55,-54,-53,-52,-51,-50,-49,-48,-47,-46,-45,-44,-43,-42,-41,-40,-39,-38,-37,-36,-35,-34,-33,-32,-31,-30,-29,-28,-27,-26,-25,-24,-23,-22,-21,-20,-19,-18,-17,-16,-15,-14,-13,-12,-11,-10,-9,-8,-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27]; + let noisy_ms = vec![1,2,2,3,2,2,3,2,1,2,3,1,1,1,2,3,1,2]; + let expected = vec![0,1,2,3,1,2,3,0,1,2,3,-1,0,1,2,3,-1,0]; + let got = super::derandomize_ms_vec(&noisy_ms, 3, 2); - let got = super::derandomize_ms_vec(&input, 31, 22); assert_eq!(got, expected); } } From 4f32c0cd42a72bf6c7263f7afd64ee40d241cfc5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Mon, 23 Sep 2024 20:59:36 +0300 Subject: [PATCH 049/103] Fix race condition in documentation tests. --- src/index.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/index.rs b/src/index.rs index a93b6d5..d04449d 100644 --- a/src/index.rs +++ b/src/index.rs @@ -207,8 +207,8 @@ pub fn build_sbwt_from_vecs( /// // Build the SBWT /// let (sbwt, lcs) = build_sbwt_from_vecs(&reference, &Some(BuildOpts{ k: 3, ..Default::default() })); /// -/// // Serialize the sbwt to $TMPDIR/serialized_index -/// let index_prefix = std::env::temp_dir().to_str().unwrap().to_owned() + "/serialized_index"; +/// // Serialize the sbwt to $TMPDIR/serialized_index_1 +/// let index_prefix = std::env::temp_dir().to_str().unwrap().to_owned() + "/serialized_index_1"; /// serialize_sbwt(&index_prefix, &sbwt, &lcs); /// ``` /// @@ -258,8 +258,8 @@ pub fn serialize_sbwt( /// // Build the SBWT /// let (sbwt, lcs) = build_sbwt_from_vecs(&reference, &Some(BuildOpts{ k: 3, ..Default::default() })); /// -/// // Serialize the sbwt to $TMPDIR/serialized_index -/// let index_prefix = std::env::temp_dir().to_str().unwrap().to_owned() + "/serialized_index"; +/// // Serialize the sbwt to $TMPDIR/serialized_index_2 +/// let index_prefix = std::env::temp_dir().to_str().unwrap().to_owned() + "/serialized_index_2"; /// serialize_sbwt(&index_prefix, &sbwt, &lcs); /// /// // Load index @@ -346,7 +346,7 @@ mod tests { let reference: Vec> = vec![vec![b'A',b'A',b'A',b'G',b'A',b'A',b'C',b'C',b'A',b'-',b'T',b'C',b'A',b'G',b'G',b'G',b'C',b'G']]; let (sbwt, lcs) = super::build_sbwt_from_vecs(&reference, &Some(super::BuildOpts{ k: 3, ..Default::default() })); - let index_prefix = std::env::temp_dir().to_str().unwrap().to_owned() + "/serialized_index"; + let index_prefix = std::env::temp_dir().to_str().unwrap().to_owned() + "/serialized_index_test"; super::serialize_sbwt(&index_prefix, &sbwt, &lcs); let (sbwt_loaded, lcs_loaded) = super::load_sbwt(&index_prefix); From 55e95e123f28ecde19943f058c43cad8d655b6ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Mon, 23 Sep 2024 21:20:55 +0300 Subject: [PATCH 050/103] Add test cases for derandomize_ms_val. --- src/derandomize.rs | 56 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 55 insertions(+), 1 deletion(-) diff --git a/src/derandomize.rs b/src/derandomize.rs index 99612b6..3d54ad3 100644 --- a/src/derandomize.rs +++ b/src/derandomize.rs @@ -201,7 +201,61 @@ mod tests { factor.for_each(|i| assert_eq!(super::random_match_threshold(k, n_kmers, alphabet_size, (0.01_f64).powf(i as f64)), expected[i - 1])); } - // TODO Test cases for derandomize_ms_val + #[test] + fn derandomize_ms_val_full_match() { + // Parameters : k = 3, threshold = 2 + // + // Noisy MS : 1,2,3,3,3 + // Derandomized MS : 1,2,3,3,3 + // Testing this pos : | + + let expected = 3; + let got = super::derandomize_ms_val(3, 3, 2, 3); + + assert_eq!(got, expected); + } + + #[test] + fn derandomize_ms_val_only_noise() { + // Parameters : k = 3, threshold = 2 + // + // Noisy MS : 0, 0, 2, 1,0 + // Derandomized MS : -4,-3,-2,-1,0 + // Testing this pos : | + + let expected = -2; + let got = super::derandomize_ms_val(2, -1, 2, 3); + + assert_eq!(got, expected); + } + + #[test] + fn derandomize_ms_val_beginning_of_full_match() { + // Parameters : k = 3, threshold = 2 + // + // Noisy MS : 1,2,3, 1,2 + // Derandomized MS : 1,2,3,-1,0 + // Testing this pos : | + + let expected = 3; + let got = super::derandomize_ms_val(3, -1, 2, 3); + + assert_eq!(got, expected); + } + + #[test] + fn derandomize_ms_val_beginning_of_partial_match() { + // Parameters : k = 4, threshold = 2 + // + // Noisy MS : 1,2,3,-1,0,1,2,3,4,4 + // Derandomized MS : 1,2,3,-1,0,1,2,3,4,4 + // Testing this pos : | + + let expected = 3; + let got = super::derandomize_ms_val(3, -1, 2, 4); + + assert_eq!(got, expected); + } #[test] fn derandomize_ms_vec() { From 604233534335046e6c07afc18c37d57618373f7f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Mon, 23 Sep 2024 21:21:11 +0300 Subject: [PATCH 051/103] Documentation and examples for derandomize.rs --- src/derandomize.rs | 57 ++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 55 insertions(+), 2 deletions(-) diff --git a/src/derandomize.rs b/src/derandomize.rs index 3d54ad3..a600af7 100644 --- a/src/derandomize.rs +++ b/src/derandomize.rs @@ -104,8 +104,61 @@ pub fn random_match_threshold( /// character in the last _k_-mer that produced a match. /// /// # Examples +/// ## Noisy MS has only matches +/// ```rust +/// use sablast::derandomize::derandomize_ms_val; +/// +/// // Parameters : k = 3, threshold = 2 +/// // +/// // Noisy MS : 1,2,3,3,3 +/// // Derandomized MS : 1,2,3,3,3 +/// // Testing this pos : | +/// +/// let derand_ms = derandomize_ms_val(3, 3, 2, 3); +/// // `derand_ms` is 3 +/// ``` +/// +/// ## Noisy MS has only noise +/// ```rust +/// use sablast::derandomize::derandomize_ms_val; +/// +/// // Parameters : k = 3, threshold = 2 +/// // +/// // Noisy MS : 0, 0, 2, 1,0 +/// // Derandomized MS : -4,-3,-2,-1,0 +/// // Testing this pos : | +/// +/// let derand_ms = derandomize_ms_val(2, -1, 2, 3); +/// // `derand_ms` is -2 +/// ``` /// -/// TODO Add examples to ms_to_run documentation +/// ## Noisy MS is at beginning of a full _k_-mer match +/// ```rust +/// use sablast::derandomize::derandomize_ms_val; +/// +/// // Parameters : k = 3, threshold = 2 +/// // +/// // Noisy MS : 1,2,3, 1,2 +/// // Derandomized MS : 1,2,3,-1,0 +/// // Testing this pos : | +/// +/// let derand_ms = derandomize_ms_val(3, -1, 2, 3); +/// // `derand_ms` is 3 +/// ``` +/// +/// ## Noisy MS is at beginning of a partial _k_-mer match +/// ```rust +/// use sablast::derandomize::derandomize_ms_val; +/// +/// // Parameters : k = 4, threshold = 2 +/// // +/// // Noisy MS : 1,2,3,-1,0,1,2,3,4,4 +/// // Derandomized MS : 1,2,3,-1,0,1,2,3,4,4 +/// // Testing this pos : | +/// +/// let derand_ms = derandomize_ms_val(3, -1, 2, 4); +/// // `derand_ms` is 3 +/// ``` /// pub fn derandomize_ms_val( curr_noisy_ms: usize, @@ -128,7 +181,7 @@ pub fn derandomize_ms_val( if curr_noisy_ms > threshold && next_derand_ms < curr_noisy_ms as i64 { // Beginning of a partial k-mer match - // Only works if threshold > 1 + // Only useful if threshold > 1 and k > 3 run = curr_noisy_ms as i64; } From c458b2540ddfbd5b25376fa7fd3ca0644eab9adc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Mon, 23 Sep 2024 21:23:11 +0300 Subject: [PATCH 052/103] Fix comparison in build_serialize_load_sbwt. --- src/index.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/index.rs b/src/index.rs index d04449d..63d1258 100644 --- a/src/index.rs +++ b/src/index.rs @@ -354,7 +354,7 @@ mod tests { assert_eq!(lcs, lcs_loaded); match sbwt { sbwt::SbwtIndexVariant::SubsetMatrix(ref index) => { - match sbwt { + match sbwt_loaded { sbwt::SbwtIndexVariant::SubsetMatrix(ref index_loaded) => { assert_eq!(index, index_loaded); }, From f2fcf07e452f19b99b9215536683046e3fae5a6d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Mon, 23 Sep 2024 21:49:36 +0300 Subject: [PATCH 053/103] Patch needletail to respect compression features; disable xz,bzip2. --- Cargo.toml | 14 ++++++-- build.rs | 4 +++ patches/needletail+0.5.1.patch | 61 ++++++++++++++++++++++++++++++++++ 3 files changed, 76 insertions(+), 3 deletions(-) create mode 100644 build.rs create mode 100644 patches/needletail+0.5.1.patch diff --git a/Cargo.toml b/Cargo.toml index 86afc89..d099836 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,7 +14,7 @@ license = "MIT OR Apache-2.0" [dependencies] ## core -needletail = "0.5.1" +needletail = { version = "0.5.1", default-features = false, features = ["flate2"] } sbwt = "0.3.1" ## cli @@ -24,6 +24,14 @@ clap = { version = "4.4.18", features = ["derive"] } log = "0.4.20" stderrlog = "0.6.0" +[build-dependencies] +patch-crate = "0.1.10" + [dev-dependencies] -## tests -assert_approx_eq = "1.1.0" \ No newline at end of file +assert_approx_eq = "1.1.0" + +[package.metadata.patch] +crates = ["needletail"] + +[patch.crates-io] +needletail = { path = "./target/patch/needletail-0.5.1" } diff --git a/build.rs b/build.rs new file mode 100644 index 0000000..d105d46 --- /dev/null +++ b/build.rs @@ -0,0 +1,4 @@ +fn main() { + println!("cargo:rerun-if-changed=Cargo.toml"); + patch_crate::run().expect("Failed while patching"); +} diff --git a/patches/needletail+0.5.1.patch b/patches/needletail+0.5.1.patch new file mode 100644 index 0000000..7c2c8b4 --- /dev/null +++ b/patches/needletail+0.5.1.patch @@ -0,0 +1,61 @@ +diff --git a/src/parser/mod.rs b/src/parser/mod.rs +index ada2e22..a1e77e9 100644 +--- a/src/parser/mod.rs ++++ b/src/parser/mod.rs +@@ -3,11 +3,11 @@ use std::fs::File; + use std::io::{stdin, Cursor, Read}; + use std::path::Path; + +-#[cfg(feature = "compression")] ++#[cfg(feature = "bzip2")] + use bzip2::read::BzDecoder; +-#[cfg(feature = "compression")] ++#[cfg(feature = "flate2")] + use flate2::read::MultiGzDecoder; +-#[cfg(feature = "compression")] ++#[cfg(feature = "xz2")] + use xz2::read::XzDecoder; + + use crate::errors::ParseError; +@@ -23,11 +23,11 @@ mod fastq; + pub use crate::parser::utils::FastxReader; + + // Magic bytes for each compression format +-#[cfg(feature = "compression")] ++#[cfg(feature = "flate2")] + const GZ_MAGIC: [u8; 2] = [0x1F, 0x8B]; +-#[cfg(feature = "compression")] ++#[cfg(feature = "bzip2")] + const BZ_MAGIC: [u8; 2] = [0x42, 0x5A]; +-#[cfg(feature = "compression")] ++#[cfg(feature = "xz2")] + const XZ_MAGIC: [u8; 2] = [0xFD, 0x37]; + + fn get_fastx_reader<'a, R: 'a + io::Read + Send>( +@@ -88,7 +88,7 @@ pub fn parse_fastx_reader<'a, R: 'a + io::Read + Send>( + let new_reader = first_two_cursor.chain(reader); + + match first_two_bytes { +- #[cfg(feature = "compression")] ++ #[cfg(feature = "flate2")] + GZ_MAGIC => { + let mut gz_reader = MultiGzDecoder::new(new_reader); + let mut first = [0; 1]; +@@ -96,7 +96,7 @@ pub fn parse_fastx_reader<'a, R: 'a + io::Read + Send>( + let r = Cursor::new(first).chain(gz_reader); + get_fastx_reader(r, first[0]) + } +- #[cfg(feature = "compression")] ++ #[cfg(feature = "bzip2")] + BZ_MAGIC => { + let mut bz_reader = BzDecoder::new(new_reader); + let mut first = [0; 1]; +@@ -104,7 +104,7 @@ pub fn parse_fastx_reader<'a, R: 'a + io::Read + Send>( + let r = Cursor::new(first).chain(bz_reader); + get_fastx_reader(r, first[0]) + } +- #[cfg(feature = "compression")] ++ #[cfg(feature = "xz2")] + XZ_MAGIC => { + let mut xz_reader = XzDecoder::new(new_reader); + let mut first = [0; 1]; From af3e2316b1d8e45dac52e0bbfd6f6c2a1865c6d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Mon, 23 Sep 2024 21:53:07 +0300 Subject: [PATCH 054/103] Remove patch.crates-io (use build.rs script). --- Cargo.toml | 3 --- 1 file changed, 3 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index d099836..d063d45 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -32,6 +32,3 @@ assert_approx_eq = "1.1.0" [package.metadata.patch] crates = ["needletail"] - -[patch.crates-io] -needletail = { path = "./target/patch/needletail-0.5.1" } From 2a1f592c1fd16499d4bcea2c8355276ef4a0117a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Mon, 23 Sep 2024 21:57:44 +0300 Subject: [PATCH 055/103] Revert "Remove patch.crates-io (use build.rs script)." This reverts commit af3e2316b1d8e45dac52e0bbfd6f6c2a1865c6d6. --- Cargo.toml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Cargo.toml b/Cargo.toml index d063d45..d099836 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -32,3 +32,6 @@ assert_approx_eq = "1.1.0" [package.metadata.patch] crates = ["needletail"] + +[patch.crates-io] +needletail = { path = "./target/patch/needletail-0.5.1" } From 48f4f1dd4ddab572e6776e7427fb39487e30ebab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Mon, 23 Sep 2024 21:57:55 +0300 Subject: [PATCH 056/103] Revert "Patch needletail to respect compression features; disable xz,bzip2." This reverts commit f2fcf07e452f19b99b9215536683046e3fae5a6d. --- Cargo.toml | 14 ++------ build.rs | 4 --- patches/needletail+0.5.1.patch | 61 ---------------------------------- 3 files changed, 3 insertions(+), 76 deletions(-) delete mode 100644 build.rs delete mode 100644 patches/needletail+0.5.1.patch diff --git a/Cargo.toml b/Cargo.toml index d099836..86afc89 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,7 +14,7 @@ license = "MIT OR Apache-2.0" [dependencies] ## core -needletail = { version = "0.5.1", default-features = false, features = ["flate2"] } +needletail = "0.5.1" sbwt = "0.3.1" ## cli @@ -24,14 +24,6 @@ clap = { version = "4.4.18", features = ["derive"] } log = "0.4.20" stderrlog = "0.6.0" -[build-dependencies] -patch-crate = "0.1.10" - [dev-dependencies] -assert_approx_eq = "1.1.0" - -[package.metadata.patch] -crates = ["needletail"] - -[patch.crates-io] -needletail = { path = "./target/patch/needletail-0.5.1" } +## tests +assert_approx_eq = "1.1.0" \ No newline at end of file diff --git a/build.rs b/build.rs deleted file mode 100644 index d105d46..0000000 --- a/build.rs +++ /dev/null @@ -1,4 +0,0 @@ -fn main() { - println!("cargo:rerun-if-changed=Cargo.toml"); - patch_crate::run().expect("Failed while patching"); -} diff --git a/patches/needletail+0.5.1.patch b/patches/needletail+0.5.1.patch deleted file mode 100644 index 7c2c8b4..0000000 --- a/patches/needletail+0.5.1.patch +++ /dev/null @@ -1,61 +0,0 @@ -diff --git a/src/parser/mod.rs b/src/parser/mod.rs -index ada2e22..a1e77e9 100644 ---- a/src/parser/mod.rs -+++ b/src/parser/mod.rs -@@ -3,11 +3,11 @@ use std::fs::File; - use std::io::{stdin, Cursor, Read}; - use std::path::Path; - --#[cfg(feature = "compression")] -+#[cfg(feature = "bzip2")] - use bzip2::read::BzDecoder; --#[cfg(feature = "compression")] -+#[cfg(feature = "flate2")] - use flate2::read::MultiGzDecoder; --#[cfg(feature = "compression")] -+#[cfg(feature = "xz2")] - use xz2::read::XzDecoder; - - use crate::errors::ParseError; -@@ -23,11 +23,11 @@ mod fastq; - pub use crate::parser::utils::FastxReader; - - // Magic bytes for each compression format --#[cfg(feature = "compression")] -+#[cfg(feature = "flate2")] - const GZ_MAGIC: [u8; 2] = [0x1F, 0x8B]; --#[cfg(feature = "compression")] -+#[cfg(feature = "bzip2")] - const BZ_MAGIC: [u8; 2] = [0x42, 0x5A]; --#[cfg(feature = "compression")] -+#[cfg(feature = "xz2")] - const XZ_MAGIC: [u8; 2] = [0xFD, 0x37]; - - fn get_fastx_reader<'a, R: 'a + io::Read + Send>( -@@ -88,7 +88,7 @@ pub fn parse_fastx_reader<'a, R: 'a + io::Read + Send>( - let new_reader = first_two_cursor.chain(reader); - - match first_two_bytes { -- #[cfg(feature = "compression")] -+ #[cfg(feature = "flate2")] - GZ_MAGIC => { - let mut gz_reader = MultiGzDecoder::new(new_reader); - let mut first = [0; 1]; -@@ -96,7 +96,7 @@ pub fn parse_fastx_reader<'a, R: 'a + io::Read + Send>( - let r = Cursor::new(first).chain(gz_reader); - get_fastx_reader(r, first[0]) - } -- #[cfg(feature = "compression")] -+ #[cfg(feature = "bzip2")] - BZ_MAGIC => { - let mut bz_reader = BzDecoder::new(new_reader); - let mut first = [0; 1]; -@@ -104,7 +104,7 @@ pub fn parse_fastx_reader<'a, R: 'a + io::Read + Send>( - let r = Cursor::new(first).chain(bz_reader); - get_fastx_reader(r, first[0]) - } -- #[cfg(feature = "compression")] -+ #[cfg(feature = "xz2")] - XZ_MAGIC => { - let mut xz_reader = XzDecoder::new(new_reader); - let mut first = [0; 1]; From 71a05e506fa07245de581a3477e1241edf884466 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Mon, 23 Sep 2024 22:03:22 +0300 Subject: [PATCH 057/103] Fix clippy suggestions. --- src/derandomize.rs | 8 ++++---- src/format.rs | 5 +++-- src/index.rs | 4 ++-- src/lib.rs | 11 +++++------ 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/derandomize.rs b/src/derandomize.rs index a600af7..564145e 100644 --- a/src/derandomize.rs +++ b/src/derandomize.rs @@ -79,15 +79,15 @@ pub fn random_match_threshold( assert!(k > 0); assert!(n_kmers > 0); assert!(alphabet_size > 0); - assert!(max_error_prob <= 1 as f64); - assert!(max_error_prob > 0 as f64); + assert!(max_error_prob <= 1_f64); + assert!(max_error_prob > 0_f64); for i in 1..k { if log_rm_max_cdf(i, alphabet_size, n_kmers) > (-max_error_prob).ln_1p() { return i; } } - return k; + k } /// Derandomizes a single noisy _k_-bounded matching statistic. @@ -185,7 +185,7 @@ pub fn derandomize_ms_val( run = curr_noisy_ms as i64; } - return run; + run } /// Derandomizes a sequence of noisy _k_-bounded matching statistics. diff --git a/src/format.rs b/src/format.rs index 1bb2b61..d6e305c 100644 --- a/src/format.rs +++ b/src/format.rs @@ -11,8 +11,9 @@ // the MIT license, or , // at your option. // +//! Converting alignment representations into various output formats. pub fn run_lengths( - aln: &Vec, + aln: &[char], ) -> Vec<(usize, usize, usize, usize)> { // Store run lengths as Vec<(start, end, matches, mismatches)> let mut encodings: Vec<(usize, usize, usize, usize)> = Vec::new(); @@ -34,7 +35,7 @@ pub fn run_lengths( i += 1; } } - return encodings; + encodings } //////////////////////////////////////////////////////////////////////////////// diff --git a/src/index.rs b/src/index.rs index 63d1258..b8c72bd 100644 --- a/src/index.rs +++ b/src/index.rs @@ -166,7 +166,7 @@ pub fn build_sbwt_from_vecs( slices: &[Vec], build_options: &Option, ) -> (sbwt::SbwtIndexVariant, sbwt::LcsArray) { - assert!(slices.len() > 0); + assert!(!slices.is_empty()); let build_opts = if build_options.is_some() { build_options.clone().unwrap() } else { BuildOpts::default() }; let temp_dir = if build_opts.temp_dir.is_some() { build_opts.temp_dir.unwrap() } else { std::env::temp_dir().to_str().unwrap().to_string() }; @@ -313,7 +313,7 @@ pub fn query_sbwt( sbwt: &sbwt::SbwtIndexVariant, lcs: &sbwt::LcsArray, ) -> Vec { - assert!(query.len() > 0); + assert!(!query.is_empty()); let ms = match sbwt { SbwtIndexVariant::SubsetMatrix(index) => { let streaming_index = sbwt::StreamingIndex::new(index, lcs); diff --git a/src/lib.rs b/src/lib.rs index 4f1ed48..78732ac 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -27,7 +27,7 @@ pub fn map( ) -> (Vec, Vec) { let (k, threshold) = match sbwt { SbwtIndexVariant::SubsetMatrix(ref sbwt) => { - (sbwt.k(), derandomize::random_match_threshold(sbwt.k(), sbwt.n_kmers(), 4 as usize, 0.0000001 as f64)) + (sbwt.k(), derandomize::random_match_threshold(sbwt.k(), sbwt.n_kmers(), 4_usize, 0.0000001_f64)) }, }; // TODO handle multiple files and `input_list` @@ -37,16 +37,15 @@ pub fn map( let seqrec = rec.expect("invalid_record"); let seq_fwd = seqrec.normalize(true); - let ms_fwd = index::query_sbwt(seq_fwd.sequence(), &sbwt, &lcs); + let ms_fwd = index::query_sbwt(seq_fwd.sequence(), sbwt, lcs); let seq_rev = seq_fwd.reverse_complement(); - let ms_rev = index::query_sbwt(seq_rev.sequence(), &sbwt, &lcs); + let ms_rev = index::query_sbwt(seq_rev.sequence(), sbwt, lcs); info!("Translating result..."); let runs = (derandomize::derandomize_ms_vec(&ms_fwd, k, threshold), derandomize::derandomize_ms_vec(&ms_rev, k, threshold)); - let aln = (translate::translate_ms_vec(&runs.0, k, threshold), - translate::translate_ms_vec(&runs.1, k, threshold)); - return aln; + (translate::translate_ms_vec(&runs.0, k, threshold), + translate::translate_ms_vec(&runs.1, k, threshold)) } From ce30f614d14ef4600637f59e434d534dda25a480 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Mon, 23 Sep 2024 22:23:26 +0300 Subject: [PATCH 058/103] Add examples and tests with recombination. --- src/translate.rs | 99 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 87 insertions(+), 12 deletions(-) diff --git a/src/translate.rs b/src/translate.rs index 77073c1..3f531cf 100644 --- a/src/translate.rs +++ b/src/translate.rs @@ -100,8 +100,9 @@ /// // Expected output : M,M,M, -,-,M,M,M /// /// translate_ms_val(-1, 0, 3, 2); +/// /// ``` -/// ## Query with a deletion or recombination +/// ## Query with a deletion /// ```rust /// use sablast::translate::translate_ms_val; /// @@ -119,18 +120,39 @@ /// ``` /// /// Although in this case two characters have been deleted from the -/// query, if the missing region was longer the matching statistics -/// (MS) could also represent recombination of a sequence from -/// elsewhere in the query into the position preceding the three +/// query, if the query extends beyond what is shown the matching +/// statistics (MS) could also represent recombination of a sequence +/// from elsewhere in the query into the position preceding the three /// consecutive T's in the reference. /// -/// Recombinations and deletions are indistinguishable in the -/// _k_-bounded matching statistics alone but they can be solved by -/// comparing the MS vector with the reference and -/// query. Additionally, when a segment of reasonable length is -/// encapsulated by two consecutive R's on both the left and right -/// side, the region in between possibly originates from elsewhere in -/// the reference. +/// ## Query with recombination +/// ```rust +/// use sablast::translate::translate_ms_val; +/// +/// // Parameters : k = 3, threshold = 2 +/// // +/// // Ref sequence : A,C,G,T,T,T,C,G,G,C,C,C +/// // Query sequence : A,C,G,C,G,G,T,T,T,C,C,C +/// // +/// // Result MS vector : 1,2,3,1,2,3,3,3,3,1,2,3 +/// // Testing this pos : | +/// // Expected output : M,M,R,R,M,M,M,M,R,R,M,M +/// +/// translate_ms_val(3, 1, 3, 2); +/// ``` +/// +/// Note how the two regions with the consecutive 'R's are similar to +/// the Query with a deletion case. The first R,R pair is exactly the +/// same, while the second R,R pair is only different because the +/// match extends further to the left of it. +/// +/// When a segment of reasonable length is encapsulated by two +/// consecutive R's on both the left and right side, the region in +/// between possibly originates from elsewhere in the reference. +/// +/// In general recombinations and deletions are indistinguishable in +/// the _k_-bounded matching statistics alone but they can be solved +/// by comparing the MS vector with the reference and query. /// pub fn translate_ms_val( ms_curr: i64, @@ -180,10 +202,12 @@ pub fn translate_ms_val( /// underlying alignment. /// /// # Examples +/// ## Translate a generic MS vector /// ```rust /// use sablast::translate::translate_ms_vec; /// /// // Parameters : k = 3, threshold = 2 +/// // /// // Ref sequence : A,A,A,G,A,A,C,C,A,-,T,C,A, -,-,G,G,G, C,G /// // Query sequence : C,A,A,G,-,-,C,C,A,C,T,C,A, T,T,G,G,G, T,C /// // Input MS : 0,1,2,3, 1,2,3,0,1,2,3,-1,0,1,2,3,-1,0 @@ -193,6 +217,22 @@ pub fn translate_ms_val( /// translate_ms_vec(&input, 3, 2); /// ``` /// +/// ## Translate a MS vector with recombination +/// ```rust +/// use sablast::translate::translate_ms_vec; +/// +/// // Parameters : k = 3, threshold = 2 +/// // +/// // Ref sequence : A,C,G,T,T,T,C,G,G,C,C,C +/// // Query sequence : A,C,G,C,G,G,T,T,T,C,C,C +/// // +/// // Result MS vector : 1,2,3,1,2,3,3,3,3,1,2,3 +/// // Expected output : M,M,R,R,M,M,M,M,R,R,M,M +/// +/// let input: Vec = vec![0,1,2,3,1,2,3,0,1,2,3,-1,0,1,2,3,-1,0]; +/// translate_ms_vec(&input, 3, 2); +/// ``` +/// pub fn translate_ms_vec( derand_ms: &[i64], k: usize, @@ -233,7 +273,7 @@ mod tests { // Test cases for translate_ms_val // Comments use '-' for characters that are not in a ref or query sequence #[test] - fn translate_ms_val_with_discontinuity() { + fn translate_ms_val_with_deletion() { // Parameters : k = 3, threshold = 2 // // Ref sequence : A,C,G,T,T,T,C,A,G @@ -249,6 +289,23 @@ mod tests { assert_eq!(got, expected); } + #[test] + fn translate_ms_val_with_recombination() { + // Parameters : k = 3, threshold = 2 + // + // Ref sequence : A,C,G,T,T,T,C,G,G,C,C,C + // Query sequence : A,C,G,C,G,G,T,T,T,C,C,C + // + // Result MS vector : 1,2,3,1,2,3,3,3,3,1,2,3 + // Testing this pos : | + // Expected output : M,M,R,R,M,M,M,M,R,R,M,M + + let expected = ('R','R'); + let got = super::translate_ms_val(3, 1, 3, 2); + + assert_eq!(got, expected); + } + #[test] fn translate_ms_val_with_mismatch() { // Parameters : k = 3, threshold = 2 @@ -323,6 +380,7 @@ mod tests { #[test] fn translate_ms_vec() { // Parameters : k = 3, threshold = 2 + // // Ref sequence : A,A,A,G,A,A,C,C,A,-,T,C,A, -,-,G,G,G, C,G // Query sequence : C,A,A,G,-,-,C,C,A,C,T,C,A, T,T,G,G,G, T,C // @@ -335,4 +393,21 @@ mod tests { assert_eq!(got, expected); } + + #[test] + fn translate_ms_vec_with_recombination() { + // Parameters : k = 3, threshold = 2 + // + // Ref sequence : A,C,G,T,T,T,C,G,G,C,C,C + // Query sequence : A,C,G,C,G,G,T,T,T,C,C,C + // + // Result MS vector : 1,2,3,1,2,3,3,3,3,1,2,3 + // Expected output : M,M,R,R,M,M,M,M,R,R,M,M + + let input: Vec = vec![1,2,3,1,2,3,3,3,3,1,2,3]; + let expected: Vec = vec!['M','M','R','R','M','M','M','M','R','R','M','M']; + let got = super::translate_ms_vec(&input, 3, 2); + + assert_eq!(got, expected); + } } From f8f75a456d46b901b96212de6bebf025bbfc3a34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Tue, 24 Sep 2024 10:37:50 +0300 Subject: [PATCH 059/103] Handle multiple files in seq_files by indexing them all at once. --- src/main.rs | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/src/main.rs b/src/main.rs index 164ff01..ed6c633 100644 --- a/src/main.rs +++ b/src/main.rs @@ -53,10 +53,24 @@ fn main() { }; // TODO Handle --input-list in sablast build - // TODO Handle multiple inputs in sablast build + let mut inputs = seq_files.clone(); + info!("Reading input files..."); + let mut seqs: Vec> = Vec::new(); + inputs.iter().for_each(|file| { + let mut reader = needletail::parse_fastx_file(file).unwrap_or_else(|_| panic!("Expected valid fastX file at {}", file)); + loop { + let rec = reader.next(); + match rec { + Some(Ok(seqrec)) => { + seqs.push(seqrec.normalize(true).as_ref().to_vec()); + }, + _ => break + } + } + }); info!("Building SBWT index..."); - let (sbwt, lcs) = sablast::index::build_sbwt_from_file(&seq_files[0], &Some(sbwt_build_options)); + let (sbwt, lcs) = sablast::index::build_sbwt_from_vecs(&seqs, &Some(sbwt_build_options)); info!("Serializing SBWT index..."); sablast::index::serialize_sbwt(&output_prefix.as_ref().unwrap(), &sbwt, &lcs); From 62a897e5c662f4958038e775b4dab5d8e1628db8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Tue, 24 Sep 2024 10:51:46 +0300 Subject: [PATCH 060/103] Remove --input list from build and index everything to same file. --- src/cli.rs | 4 ---- src/main.rs | 6 ++---- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/src/cli.rs b/src/cli.rs index 185c60b..a9b098b 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -29,10 +29,6 @@ pub enum Commands { #[arg(group = "input", required = true)] seq_files: Vec, - // Input sequence list - #[arg(short = 'l', long = "input-list", group = "input", required = true, help_heading = "Input")] - input_list: Option, - // Outputs #[arg(short = 'o', long = "out-prefix", required = false, help_heading = "Output")] output_prefix: Option, diff --git a/src/main.rs b/src/main.rs index ed6c633..a440584 100644 --- a/src/main.rs +++ b/src/main.rs @@ -13,6 +13,7 @@ // use clap::Parser; use log::info; +use needletail::Sequence; // Command-line interface mod cli; @@ -36,7 +37,6 @@ fn main() { // Run the full pipeline Some(cli::Commands::Build { seq_files, - input_list, output_prefix, num_threads, mem_gb, @@ -51,12 +51,10 @@ fn main() { temp_dir: temp_dir.clone(), ..Default::default() }; - // TODO Handle --input-list in sablast build - let mut inputs = seq_files.clone(); info!("Reading input files..."); let mut seqs: Vec> = Vec::new(); - inputs.iter().for_each(|file| { + seq_files.iter().for_each(|file| { let mut reader = needletail::parse_fastx_file(file).unwrap_or_else(|_| panic!("Expected valid fastX file at {}", file)); loop { let rec = reader.next(); From 158864457fc736670403741c55fff425624afb2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Tue, 24 Sep 2024 11:04:23 +0300 Subject: [PATCH 061/103] Move sablast build implementation to lib.rs. --- src/lib.rs | 21 +++++++++++++++++++++ src/main.rs | 24 ++++-------------------- 2 files changed, 25 insertions(+), 20 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 78732ac..61d1f4d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -20,6 +20,27 @@ pub mod format; pub mod index; pub mod translate; +pub fn build( + seq_files: &Vec, + build_opts: index::BuildOpts, +) -> (sbwt::SbwtIndexVariant, sbwt::LcsArray) { + let mut seq_data: Vec> = Vec::new(); + seq_files.iter().for_each(|file| { + let mut reader = needletail::parse_fastx_file(file).unwrap_or_else(|_| panic!("Expected valid fastX file at {}", file)); + loop { + let rec = reader.next(); + match rec { + Some(Ok(seqrec)) => { + seq_data.push(seqrec.normalize(true).as_ref().to_vec()); + }, + _ => break + } + } + }); + + index::build_sbwt_from_vecs(&seq_data, &Some(build_opts)) +} + pub fn map( query_file: &String, sbwt: &sbwt::SbwtIndexVariant, diff --git a/src/main.rs b/src/main.rs index a440584..2695003 100644 --- a/src/main.rs +++ b/src/main.rs @@ -13,11 +13,11 @@ // use clap::Parser; use log::info; -use needletail::Sequence; // Command-line interface mod cli; +/// Initializes the logger with verbosity given in `log_max_level`. fn init_log(log_max_level: usize) { stderrlog::new() .module(module_path!()) @@ -28,13 +28,12 @@ fn init_log(log_max_level: usize) { .unwrap(); } -// Use `sablast` to list the available commands or `sablast ` to run. +/// Use `sablast` to list the available commands or `sablast ` to run. fn main() { let cli = cli::Cli::parse(); // Subcommands: match &cli.command { - // Run the full pipeline Some(cli::Commands::Build { seq_files, output_prefix, @@ -52,23 +51,8 @@ fn main() { ..Default::default() }; - info!("Reading input files..."); - let mut seqs: Vec> = Vec::new(); - seq_files.iter().for_each(|file| { - let mut reader = needletail::parse_fastx_file(file).unwrap_or_else(|_| panic!("Expected valid fastX file at {}", file)); - loop { - let rec = reader.next(); - match rec { - Some(Ok(seqrec)) => { - seqs.push(seqrec.normalize(true).as_ref().to_vec()); - }, - _ => break - } - } - }); - - info!("Building SBWT index..."); - let (sbwt, lcs) = sablast::index::build_sbwt_from_vecs(&seqs, &Some(sbwt_build_options)); + info!("Building SBWT index from {} files...", seq_files.len()); + let (sbwt, lcs) = sablast::build(seq_files, sbwt_build_options); info!("Serializing SBWT index..."); sablast::index::serialize_sbwt(&output_prefix.as_ref().unwrap(), &sbwt, &lcs); From 249743ea19267e4dbd111433f49edf9e4533f249 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Tue, 24 Sep 2024 11:31:16 +0300 Subject: [PATCH 062/103] Add documentation to build. --- src/lib.rs | 48 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index 61d1f4d..e2a4c9e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -20,8 +20,54 @@ pub mod format; pub mod index; pub mod translate; +/// Builds an SBWT index from some fasta or fastq files. +/// +/// Reads all sequence data in `seq_files` and builds an SBWT index +/// with the parameters and resources specified in `build_opts` (see +/// [index::BuildOpts] for details). +/// +/// All files and sequence data in `seq_files` are merged into the +/// same index. It is not possible extract the individual sequences +/// from the index after it has been built; use [map] with the TODO +/// options instead if you need to know which reference sequences the +/// alignments are for. +/// +/// TODO Describe map syntax in lib.rs documentation. +/// +/// Returns a tuple containing the built +/// [sbwt::SbwtIndexVariant](https://docs.rs/sbwt/latest/sbwt/enum.SbwtIndexVariant.html) +/// and +/// [sbwt::LcsArray](https://docs.rs/sbwt/latest/sbwt/struct.LcsArray.html). +/// +/// Panics if a file in `seq_files` is not readable or a valid FASTX +/// file. +/// +/// # Input format detection +/// The sequence data is read using +/// [needletail::parser::parse_fastx_file](https://docs.rs/needletail/latest/needletail/parser/fn.parse_fastx_file.html). +/// +/// Input file format (fasta or fastq) is detected automatically and +/// the files may be compressed in a +/// [DEFLATE-based](https://en.wikipedia.org/wiki/Deflate) format (.gz +/// files). +/// +/// [Bzip2](https://sourceware.org/bzip2/) and +/// [liblzma](https://tukaani.org/xz/) compression (.bz2 and .xz +/// files) can be enabled using the needletail features field in +/// sablast Cargo.toml if compiling from source. +/// +/// # Examples +/// ```rust +/// use sablast::build; +/// use sablast::index::BuildOpts; +/// +/// let inputs = vec!["tests/data/clbS.fna.gz".to_string(), "tests/data/NZ_CP058217.1_clbS.fna.gz".to_string()]; +/// +/// let (sbwt_index, lcs_array) = build(&inputs, BuildOpts::default()); +/// ``` +/// pub fn build( - seq_files: &Vec, + seq_files: &[String], build_opts: index::BuildOpts, ) -> (sbwt::SbwtIndexVariant, sbwt::LcsArray) { let mut seq_data: Vec> = Vec::new(); From 59be17c21a6a41e3a05ae33adb676b845cae80f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Tue, 24 Sep 2024 11:31:22 +0300 Subject: [PATCH 063/103] Minor fixes. --- src/main.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/main.rs b/src/main.rs index 2695003..541d42e 100644 --- a/src/main.rs +++ b/src/main.rs @@ -54,8 +54,9 @@ fn main() { info!("Building SBWT index from {} files...", seq_files.len()); let (sbwt, lcs) = sablast::build(seq_files, sbwt_build_options); - info!("Serializing SBWT index..."); - sablast::index::serialize_sbwt(&output_prefix.as_ref().unwrap(), &sbwt, &lcs); + info!("Serializing SBWT index to {}.sbwt ...", output_prefix.as_ref().unwrap()); + info!("Serializing LCS array to {}.lcs ...", output_prefix.as_ref().unwrap()); + sablast::index::serialize_sbwt(output_prefix.as_ref().unwrap(), &sbwt, &lcs); }, Some(cli::Commands::Map { From 98193b6c2b030c9d800dd77d73f0b6ccca589c5a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Tue, 24 Sep 2024 13:06:49 +0300 Subject: [PATCH 064/103] Implement multiple queries in map and parallelise over input files. --- Cargo.toml | 5 +++-- src/cli.rs | 18 ++++++++++-------- src/main.rs | 38 ++++++++++++++++++++++++-------------- 3 files changed, 37 insertions(+), 24 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 86afc89..ae278c6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,10 +15,11 @@ license = "MIT OR Apache-2.0" [dependencies] ## core needletail = "0.5.1" +rayon = "1" sbwt = "0.3.1" ## cli -clap = { version = "4.4.18", features = ["derive"] } +clap = { version = "4", features = ["derive"] } ## logging log = "0.4.20" @@ -26,4 +27,4 @@ stderrlog = "0.6.0" [dev-dependencies] ## tests -assert_approx_eq = "1.1.0" \ No newline at end of file +assert_approx_eq = "1" \ No newline at end of file diff --git a/src/cli.rs b/src/cli.rs index a9b098b..7d04b83 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -34,34 +34,36 @@ pub enum Commands { output_prefix: Option, // Resources - // Threads + // // Threads #[arg(short = 't', long = "threads", default_value_t = 1)] num_threads: usize, - // Memory in GB + // // Memory in GB #[arg(short = 'm', long = "memory", default_value_t = 4)] mem_gb: usize, - // Temporary directory + // // Temporary directory #[arg(long = "tmp-dir", required = false)] temp_dir: Option, + // Verbosity #[arg(long = "verbose", default_value_t = false)] verbose: bool, }, - // Map query against SBWT index + // Find indexed k-mers in a query Map { // Input fasta or fastq query file(s) #[arg(group = "input", required = true)] seq_files: Vec, - // Input sequence list - #[arg(short = 'l', long = "input-list", group = "input", required = true, help_heading = "Input")] - input_list: Option, - // Index name #[arg(short = 'i', long = "index", required = true, help_heading = "Index")] index_prefix: Option, + // Resources + // // Threads + #[arg(short = 't', long = "threads", default_value_t = 1)] + num_threads: usize, + // Verbosity #[arg(long = "verbose", default_value_t = false)] verbose: bool, diff --git a/src/main.rs b/src/main.rs index 541d42e..d05a930 100644 --- a/src/main.rs +++ b/src/main.rs @@ -11,8 +11,12 @@ // the MIT license, or , // at your option. // +use std::io::Write; + use clap::Parser; use log::info; +use rayon::iter::ParallelIterator; +use rayon::iter::IntoParallelRefIterator; // Command-line interface mod cli; @@ -61,29 +65,35 @@ fn main() { }, Some(cli::Commands::Map { seq_files, - input_list, index_prefix, + num_threads, verbose, }) => { init_log(if *verbose { 2 } else { 1 }); - info!("Loading SBWT index..."); + rayon::ThreadPoolBuilder::new() + .num_threads(*num_threads) + .thread_name(|i| format!("rayon-thread-{}", i)) + .build_global() + .unwrap(); + info!("Loading SBWT index..."); let (sbwt, lcs) = sablast::index::load_sbwt(index_prefix.as_ref().unwrap()); - // TODO Handle `--input-list in sablast map - - // TODO Query multiple inputs in sablast map info!("Querying SBWT index..."); - - let aln = sablast::map(&seq_files[0], &sbwt, &lcs); - let mut run_lengths: Vec<(usize, usize, usize, usize, char)> = sablast::format::run_lengths(&aln.0).iter().map(|x| (x.0, x.1, x.2, x.3, '+')).collect(); - let mut run_lengths_rev: Vec<(usize, usize, usize, usize, char)> = sablast::format::run_lengths(&aln.1).iter().map(|x| (x.0, x.1, x.2, x.3, '-')).collect(); - run_lengths.append(&mut run_lengths_rev); - - run_lengths.sort_by_key(|x| x.0); - println!("query\tref\tq.start\tq.end\tstrand\tlength\tmismatches"); - run_lengths.iter().for_each(|x| println!("{}\t{}\t{}\t{}\t{}\t{}\t{}", &seq_files[0], &index_prefix.clone().unwrap(), x.0, x.1, x.4, x.2 + x.3, x.3)); + seq_files.par_iter().for_each(|file| { + let aln = sablast::map(file, &sbwt, &lcs); + let mut run_lengths: Vec<(usize, usize, usize, usize, char)> = sablast::format::run_lengths(&aln.0).iter().map(|x| (x.0, x.1, x.2, x.3, '+')).collect(); + let mut run_lengths_rev: Vec<(usize, usize, usize, usize, char)> = sablast::format::run_lengths(&aln.1).iter().map(|x| (x.0, x.1, x.2, x.3, '-')).collect(); + run_lengths.append(&mut run_lengths_rev); + run_lengths.sort_by_key(|x| x.0); + run_lengths.iter().for_each(|x| { + let stdout = std::io::stdout(); + let _ = writeln!(&mut stdout.lock(), + "{}\t{}\t{}\t{}\t{}\t{}\t{}", + file, index_prefix.as_ref().unwrap(), x.0, x.1, x.4, x.2 + x.3, x.3); + }); + }); }, None => {} } From 0c1df9746a16c3fd18055ed55fa652765e7db77a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Tue, 24 Sep 2024 13:24:13 +0300 Subject: [PATCH 065/103] Move implementation of sablast map to find() in lib.rs. --- src/lib.rs | 16 ++++++++++++++++ src/main.rs | 11 +++++------ 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index e2a4c9e..45433e5 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -116,3 +116,19 @@ pub fn map( (translate::translate_ms_vec(&runs.0, k, threshold), translate::translate_ms_vec(&runs.1, k, threshold)) } + +pub fn find( + query_file: &String, + sbwt: &sbwt::SbwtIndexVariant, + lcs: &sbwt::LcsArray, +) -> Vec<(usize, usize, char, usize, usize)> { + let aln = map(query_file, &sbwt, &lcs); + + let mut run_lengths: Vec<(usize, usize, char, usize, usize)> = format::run_lengths(&aln.0).iter().map(|x| (x.0, x.1, '+', x.2 + x.3, x.3)).collect(); + let mut run_lengths_rev: Vec<(usize, usize, char, usize, usize)> = format::run_lengths(&aln.1).iter().map(|x| (x.0, x.1, '+', x.2 + x.3, x.3)).collect(); + + run_lengths.append(&mut run_lengths_rev); + run_lengths.sort_by_key(|x| x.0); + + run_lengths +} diff --git a/src/main.rs b/src/main.rs index d05a930..5fa82c1 100644 --- a/src/main.rs +++ b/src/main.rs @@ -82,16 +82,15 @@ fn main() { info!("Querying SBWT index..."); println!("query\tref\tq.start\tq.end\tstrand\tlength\tmismatches"); seq_files.par_iter().for_each(|file| { - let aln = sablast::map(file, &sbwt, &lcs); - let mut run_lengths: Vec<(usize, usize, usize, usize, char)> = sablast::format::run_lengths(&aln.0).iter().map(|x| (x.0, x.1, x.2, x.3, '+')).collect(); - let mut run_lengths_rev: Vec<(usize, usize, usize, usize, char)> = sablast::format::run_lengths(&aln.1).iter().map(|x| (x.0, x.1, x.2, x.3, '-')).collect(); - run_lengths.append(&mut run_lengths_rev); - run_lengths.sort_by_key(|x| x.0); + // Get local alignments + let run_lengths = sablast::find(file, &sbwt, &lcs); + + // Print results with query and ref name added run_lengths.iter().for_each(|x| { let stdout = std::io::stdout(); let _ = writeln!(&mut stdout.lock(), "{}\t{}\t{}\t{}\t{}\t{}\t{}", - file, index_prefix.as_ref().unwrap(), x.0, x.1, x.4, x.2 + x.3, x.3); + file, index_prefix.as_ref().unwrap(), x.0, x.1, x.2, x.3, x.4); }); }); }, From 9ee2e685ac15757f23dd47cca3fd4e17d155d327 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Tue, 24 Sep 2024 13:24:37 +0300 Subject: [PATCH 066/103] Rename sablast map to sablast find. --- src/cli.rs | 2 +- src/main.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cli.rs b/src/cli.rs index 7d04b83..8430cbf 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -50,7 +50,7 @@ pub enum Commands { }, // Find indexed k-mers in a query - Map { + Find { // Input fasta or fastq query file(s) #[arg(group = "input", required = true)] seq_files: Vec, diff --git a/src/main.rs b/src/main.rs index 5fa82c1..041c45f 100644 --- a/src/main.rs +++ b/src/main.rs @@ -63,7 +63,7 @@ fn main() { sablast::index::serialize_sbwt(output_prefix.as_ref().unwrap(), &sbwt, &lcs); }, - Some(cli::Commands::Map { + Some(cli::Commands::Find { seq_files, index_prefix, num_threads, From fccc315e43dbd7f1e7c28f8c794b7741ca2b9597 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Tue, 24 Sep 2024 13:40:09 +0300 Subject: [PATCH 067/103] Take &str instead of &String. --- src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 45433e5..d934aaf 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -88,7 +88,7 @@ pub fn build( } pub fn map( - query_file: &String, + query_file: &str, sbwt: &sbwt::SbwtIndexVariant, lcs: &sbwt::LcsArray, ) -> (Vec, Vec) { @@ -118,7 +118,7 @@ pub fn map( } pub fn find( - query_file: &String, + query_file: &str, sbwt: &sbwt::SbwtIndexVariant, lcs: &sbwt::LcsArray, ) -> Vec<(usize, usize, char, usize, usize)> { From 127fe64c46120a8340e094aa365b94daf39f43c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Tue, 24 Sep 2024 13:40:18 +0300 Subject: [PATCH 068/103] Add documentation to find. --- src/lib.rs | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/src/lib.rs b/src/lib.rs index d934aaf..5eb2756 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -117,6 +117,44 @@ pub fn map( translate::translate_ms_vec(&runs.1, k, threshold)) } +/// Finds the _k_-mers from an SBWT index in a query fasta or fastq file. +/// +/// Aligns the sequence data and its reverse complement in `query` +/// against the SBWT index `sbwt` and its LCS array `lcs` using +/// [map]. Then uses [format::run_lengths] to extract the local +/// alignments from the matching statistics. +/// +/// Returns a vector of tuples, where each element represents a local +/// alignment block and contains the following values: +/// 1. Start of local alignment block in query (1-based indexing). +/// 2. End of local alignment block in query. +/// 3. Whether local alignmentis on the original sequence ('+') or its reverse complement ('-'). +/// 4. Total length of the local alignment block. +/// 5. Number of mismatching characters and 1-character insertions in the block. +/// +/// # Input format detection +/// The sequence data is read using +/// [needletail::parser::parse_fastx_file](https://docs.rs/needletail/latest/needletail/parser/fn.parse_fastx_file.html). +/// +/// Input file format (fasta or fastq) is detected automatically and +/// the files may be compressed in a +/// [DEFLATE-based](https://en.wikipedia.org/wiki/Deflate) format (.gz +/// files). +/// +/// # Examples +/// ```rust +/// use sablast::build; +/// use sablast::find; +/// use sablast::index::BuildOpts; +/// +/// let reference = vec!["tests/data/clbS.fna.gz".to_string()]; +/// let (sbwt, lcs) = build(&reference, BuildOpts::default()); +/// +/// let query_file = "tests/data/NZ_CP058217.1_clbS.fna.gz"; +/// +/// let local_alignments = find(query_file, &sbwt, &lcs); +/// ``` +/// pub fn find( query_file: &str, sbwt: &sbwt::SbwtIndexVariant, From a7d046ef2d447b86ed74fe0dd3aab0d6ac5f26bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Tue, 24 Sep 2024 19:57:56 +0300 Subject: [PATCH 069/103] Rename map to matches. --- src/lib.rs | 9 ++++----- tests/map_clbs.rs | 2 +- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 5eb2756..23b511b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -28,7 +28,7 @@ pub mod translate; /// /// All files and sequence data in `seq_files` are merged into the /// same index. It is not possible extract the individual sequences -/// from the index after it has been built; use [map] with the TODO +/// from the index after it has been built; use [matches] with the TODO /// options instead if you need to know which reference sequences the /// alignments are for. /// @@ -87,7 +87,7 @@ pub fn build( index::build_sbwt_from_vecs(&seq_data, &Some(build_opts)) } -pub fn map( +pub fn matches( query_file: &str, sbwt: &sbwt::SbwtIndexVariant, lcs: &sbwt::LcsArray, @@ -97,7 +97,6 @@ pub fn map( (sbwt.k(), derandomize::random_match_threshold(sbwt.k(), sbwt.n_kmers(), 4_usize, 0.0000001_f64)) }, }; - // TODO handle multiple files and `input_list` let mut reader = needletail::parse_fastx_file(query_file).expect("valid path/file"); let Some(rec) = reader.next() else { panic!("Invalid query {}", query_file); }; @@ -121,7 +120,7 @@ pub fn map( /// /// Aligns the sequence data and its reverse complement in `query` /// against the SBWT index `sbwt` and its LCS array `lcs` using -/// [map]. Then uses [format::run_lengths] to extract the local +/// [matches]. Then uses [format::run_lengths] to extract the local /// alignments from the matching statistics. /// /// Returns a vector of tuples, where each element represents a local @@ -160,7 +159,7 @@ pub fn find( sbwt: &sbwt::SbwtIndexVariant, lcs: &sbwt::LcsArray, ) -> Vec<(usize, usize, char, usize, usize)> { - let aln = map(query_file, &sbwt, &lcs); + let aln = matches(query_file, sbwt, lcs); let mut run_lengths: Vec<(usize, usize, char, usize, usize)> = format::run_lengths(&aln.0).iter().map(|x| (x.0, x.1, '+', x.2 + x.3, x.3)).collect(); let mut run_lengths_rev: Vec<(usize, usize, char, usize, usize)> = format::run_lengths(&aln.1).iter().map(|x| (x.0, x.1, '+', x.2 + x.3, x.3)).collect(); diff --git a/tests/map_clbs.rs b/tests/map_clbs.rs index 25a392c..eb4c33a 100644 --- a/tests/map_clbs.rs +++ b/tests/map_clbs.rs @@ -16,7 +16,7 @@ fn map_nissle_against_clbs() { let (sbwt, lcs) = sablast::index::build_sbwt_from_file(&"tests/data/clbS.fna.gz".to_string(), &None); let expected = vec![(455, 967, 512, 1, '+')]; - let aln = sablast::map(&"tests/data/NZ_CP058217.1_clbS.fna.gz".to_string(), &sbwt, &lcs); + let aln = sablast::matches(&"tests/data/NZ_CP058217.1_clbS.fna.gz".to_string(), &sbwt, &lcs); let mut got: Vec<(usize, usize, usize, usize, char)> = sablast::format::run_lengths(&aln.0).iter().map(|x| (x.0, x.1, x.2, x.3, '+')).collect(); let mut run_lengths_rev: Vec<(usize, usize, usize, usize, char)> = sablast::format::run_lengths(&aln.1).iter().map(|x| (x.0, x.1, x.2, x.3, '-')).collect(); From af8c9fa1e52d5dcfd2827728e43e445bc8fb28e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Tue, 24 Sep 2024 19:58:06 +0300 Subject: [PATCH 070/103] Document matches. --- src/lib.rs | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/src/lib.rs b/src/lib.rs index 23b511b..be019df 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -87,6 +87,49 @@ pub fn build( index::build_sbwt_from_vecs(&seq_data, &Some(build_opts)) } +/// Matches a query fasta or fastq file against an SBWT index. +/// +/// Reads the sequence data in `query_file` and matches it and its +/// reverse complement against the SBWT index `sbwt` and its LCS array +/// `lcs` using [index::query_sbwt]. Then, derandomizes the resulting +/// _k_-bounded matching statistics vector using +/// [derandomize::derandomize_ms_vec] and translates the matching +/// statistics to a character representation of the alignment using +/// [translate::translate_ms_vec]. +/// +/// Returns a tuple of two vectors, the first one containing the +/// character representation of the alignment for the canonical strand +/// of the query sequence and the second for its reverse complement. +/// +/// Panics if the query file is not readable or if it's not a valid +/// FASTX file. +/// +/// # Output format +/// See the documentation for [translate]. +/// +/// # Input format detection +/// The sequence data is read using +/// [needletail::parser::parse_fastx_file](https://docs.rs/needletail/latest/needletail/parser/fn.parse_fastx_file.html). +/// +/// Input file format (fasta or fastq) is detected automatically and +/// the files may be compressed in a +/// [DEFLATE-based](https://en.wikipedia.org/wiki/Deflate) format (.gz +/// files). +/// +/// # Example +/// ```rust +/// use sablast::build; +/// use sablast::matches; +/// use sablast::index::BuildOpts; +/// +/// let reference = vec!["tests/data/clbS.fna.gz".to_string()]; +/// let (sbwt, lcs) = build(&reference, BuildOpts::default()); +/// +/// let query_file = "tests/data/NZ_CP058217.1_clbS.fna.gz"; +/// +/// let ms_vectors = matches(query_file, &sbwt, &lcs); +/// ``` +/// pub fn matches( query_file: &str, sbwt: &sbwt::SbwtIndexVariant, From 171b52d30e3fa4059026d3c4e5f35efdaab504e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Tue, 24 Sep 2024 19:58:17 +0300 Subject: [PATCH 071/103] Fix typos. --- src/derandomize.rs | 2 +- src/translate.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/derandomize.rs b/src/derandomize.rs index 564145e..dcd3d75 100644 --- a/src/derandomize.rs +++ b/src/derandomize.rs @@ -190,7 +190,7 @@ pub fn derandomize_ms_val( /// Derandomizes a sequence of noisy _k_-bounded matching statistics. /// -/// Iterates over a sequence of noisy _k_bounded matching statistics +/// Iterates over a sequence of noisy _k_-bounded matching statistics /// `ms` in reverse to identify values that are the result of random /// matching between _k_-mers of size `k` and an index that the lower /// bound `threshold` was calculated for. diff --git a/src/translate.rs b/src/translate.rs index 3f531cf..be4b00d 100644 --- a/src/translate.rs +++ b/src/translate.rs @@ -194,7 +194,7 @@ pub fn translate_ms_val( /// Translates a sequence of derandomized _k_-bounded matching statistics. /// -/// Iterates over a derandomized sequence of _k_bounded matching +/// Iterates over a derandomized sequence of _k_-bounded matching /// statistics `derand_ms` for _k_-mers with size `k` derandomized /// with the threshold `threshold`. /// From 73cfef3611c3c5b13e386aa5e8894e9f8d2f20fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Tue, 24 Sep 2024 19:59:51 +0300 Subject: [PATCH 072/103] Use matches in the test instead of map. --- tests/map_clbs.rs | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/tests/map_clbs.rs b/tests/map_clbs.rs index eb4c33a..09539aa 100644 --- a/tests/map_clbs.rs +++ b/tests/map_clbs.rs @@ -15,12 +15,8 @@ fn map_nissle_against_clbs() { let (sbwt, lcs) = sablast::index::build_sbwt_from_file(&"tests/data/clbS.fna.gz".to_string(), &None); - let expected = vec![(455, 967, 512, 1, '+')]; - let aln = sablast::matches(&"tests/data/NZ_CP058217.1_clbS.fna.gz".to_string(), &sbwt, &lcs); - - let mut got: Vec<(usize, usize, usize, usize, char)> = sablast::format::run_lengths(&aln.0).iter().map(|x| (x.0, x.1, x.2, x.3, '+')).collect(); - let mut run_lengths_rev: Vec<(usize, usize, usize, usize, char)> = sablast::format::run_lengths(&aln.1).iter().map(|x| (x.0, x.1, x.2, x.3, '-')).collect(); - got.append(&mut run_lengths_rev); + let expected = vec![(455, 967, '+', 513, 1)]; + let got = sablast::find(&"tests/data/NZ_CP058217.1_clbS.fna.gz".to_string(), &sbwt, &lcs); assert_eq!(got, expected); } From 29facacaf1becdeb81f2f699ee2375987ed4575e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Tue, 24 Sep 2024 20:01:13 +0300 Subject: [PATCH 073/103] Revert "Use matches in the test instead of map." This reverts commit 73cfef3611c3c5b13e386aa5e8894e9f8d2f20fd. --- tests/map_clbs.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/map_clbs.rs b/tests/map_clbs.rs index 09539aa..eb4c33a 100644 --- a/tests/map_clbs.rs +++ b/tests/map_clbs.rs @@ -15,8 +15,12 @@ fn map_nissle_against_clbs() { let (sbwt, lcs) = sablast::index::build_sbwt_from_file(&"tests/data/clbS.fna.gz".to_string(), &None); - let expected = vec![(455, 967, '+', 513, 1)]; - let got = sablast::find(&"tests/data/NZ_CP058217.1_clbS.fna.gz".to_string(), &sbwt, &lcs); + let expected = vec![(455, 967, 512, 1, '+')]; + let aln = sablast::matches(&"tests/data/NZ_CP058217.1_clbS.fna.gz".to_string(), &sbwt, &lcs); + + let mut got: Vec<(usize, usize, usize, usize, char)> = sablast::format::run_lengths(&aln.0).iter().map(|x| (x.0, x.1, x.2, x.3, '+')).collect(); + let mut run_lengths_rev: Vec<(usize, usize, usize, usize, char)> = sablast::format::run_lengths(&aln.1).iter().map(|x| (x.0, x.1, x.2, x.3, '-')).collect(); + got.append(&mut run_lengths_rev); assert_eq!(got, expected); } From 221958de240ef15a49bbb783627c4918acbb5812 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Tue, 24 Sep 2024 20:01:42 +0300 Subject: [PATCH 074/103] Call map via the new find syntax. --- tests/map_clbs.rs | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/tests/map_clbs.rs b/tests/map_clbs.rs index eb4c33a..09539aa 100644 --- a/tests/map_clbs.rs +++ b/tests/map_clbs.rs @@ -15,12 +15,8 @@ fn map_nissle_against_clbs() { let (sbwt, lcs) = sablast::index::build_sbwt_from_file(&"tests/data/clbS.fna.gz".to_string(), &None); - let expected = vec![(455, 967, 512, 1, '+')]; - let aln = sablast::matches(&"tests/data/NZ_CP058217.1_clbS.fna.gz".to_string(), &sbwt, &lcs); - - let mut got: Vec<(usize, usize, usize, usize, char)> = sablast::format::run_lengths(&aln.0).iter().map(|x| (x.0, x.1, x.2, x.3, '+')).collect(); - let mut run_lengths_rev: Vec<(usize, usize, usize, usize, char)> = sablast::format::run_lengths(&aln.1).iter().map(|x| (x.0, x.1, x.2, x.3, '-')).collect(); - got.append(&mut run_lengths_rev); + let expected = vec![(455, 967, '+', 513, 1)]; + let got = sablast::find(&"tests/data/NZ_CP058217.1_clbS.fna.gz".to_string(), &sbwt, &lcs); assert_eq!(got, expected); } From dc281f91fbe675a6570bd66c6cc86665184d1c5f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Wed, 25 Sep 2024 12:27:46 +0300 Subject: [PATCH 075/103] Take &[u8] in find, matches and handle input reading elsewhere. --- src/lib.rs | 39 ++++++++++----------------------------- src/main.rs | 9 ++++++++- 2 files changed, 18 insertions(+), 30 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index be019df..81db24a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -117,7 +117,7 @@ pub fn build( /// files). /// /// # Example -/// ```rust +/// ```compile_fail /// use sablast::build; /// use sablast::matches; /// use sablast::index::BuildOpts; @@ -131,32 +131,20 @@ pub fn build( /// ``` /// pub fn matches( - query_file: &str, + query_seq: &[u8], sbwt: &sbwt::SbwtIndexVariant, lcs: &sbwt::LcsArray, -) -> (Vec, Vec) { +) -> Vec { let (k, threshold) = match sbwt { SbwtIndexVariant::SubsetMatrix(ref sbwt) => { (sbwt.k(), derandomize::random_match_threshold(sbwt.k(), sbwt.n_kmers(), 4_usize, 0.0000001_f64)) }, }; - let mut reader = needletail::parse_fastx_file(query_file).expect("valid path/file"); - let Some(rec) = reader.next() else { panic!("Invalid query {}", query_file); }; - let seqrec = rec.expect("invalid_record"); - - let seq_fwd = seqrec.normalize(true); - let ms_fwd = index::query_sbwt(seq_fwd.sequence(), sbwt, lcs); - - let seq_rev = seq_fwd.reverse_complement(); - let ms_rev = index::query_sbwt(seq_rev.sequence(), sbwt, lcs); - - info!("Translating result..."); - let runs = (derandomize::derandomize_ms_vec(&ms_fwd, k, threshold), - derandomize::derandomize_ms_vec(&ms_rev, k, threshold)); + let noisy_ms = index::query_sbwt(query_seq, sbwt, lcs); + let derand_ms = derandomize::derandomize_ms_vec(&noisy_ms, k, threshold); - (translate::translate_ms_vec(&runs.0, k, threshold), - translate::translate_ms_vec(&runs.1, k, threshold)) + translate::translate_ms_vec(&derand_ms, k, threshold) } /// Finds the _k_-mers from an SBWT index in a query fasta or fastq file. @@ -184,7 +172,7 @@ pub fn matches( /// files). /// /// # Examples -/// ```rust +/// ```compile_fail /// use sablast::build; /// use sablast::find; /// use sablast::index::BuildOpts; @@ -198,17 +186,10 @@ pub fn matches( /// ``` /// pub fn find( - query_file: &str, + query_seq: &[u8], sbwt: &sbwt::SbwtIndexVariant, lcs: &sbwt::LcsArray, ) -> Vec<(usize, usize, char, usize, usize)> { - let aln = matches(query_file, sbwt, lcs); - - let mut run_lengths: Vec<(usize, usize, char, usize, usize)> = format::run_lengths(&aln.0).iter().map(|x| (x.0, x.1, '+', x.2 + x.3, x.3)).collect(); - let mut run_lengths_rev: Vec<(usize, usize, char, usize, usize)> = format::run_lengths(&aln.1).iter().map(|x| (x.0, x.1, '+', x.2 + x.3, x.3)).collect(); - - run_lengths.append(&mut run_lengths_rev); - run_lengths.sort_by_key(|x| x.0); - - run_lengths + let aln = matches(query_seq, sbwt, lcs); + format::run_lengths(&aln).iter().map(|x| (x.0, x.1, '+', x.2 + x.3, x.3)).collect() } diff --git a/src/main.rs b/src/main.rs index 041c45f..00f356e 100644 --- a/src/main.rs +++ b/src/main.rs @@ -15,6 +15,7 @@ use std::io::Write; use clap::Parser; use log::info; +use needletail::Sequence; use rayon::iter::ParallelIterator; use rayon::iter::IntoParallelRefIterator; @@ -82,8 +83,14 @@ fn main() { info!("Querying SBWT index..."); println!("query\tref\tq.start\tq.end\tstrand\tlength\tmismatches"); seq_files.par_iter().for_each(|file| { + + let mut reader = needletail::parse_fastx_file(file).expect("valid path/file"); + let Some(rec) = reader.next() else { panic!("Invalid query {}", file); }; + let seqrec = rec.expect("Valid fastX record"); + let seq = seqrec.normalize(true); + // Get local alignments - let run_lengths = sablast::find(file, &sbwt, &lcs); + let run_lengths = sablast::find(&seq, &sbwt, &lcs); // Print results with query and ref name added run_lengths.iter().for_each(|x| { From 6bcc2e346139c6655ae49976d121f1d2d66668d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Wed, 25 Sep 2024 12:28:16 +0300 Subject: [PATCH 076/103] Update test --- tests/map_clbs.rs | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tests/map_clbs.rs b/tests/map_clbs.rs index 09539aa..76872ed 100644 --- a/tests/map_clbs.rs +++ b/tests/map_clbs.rs @@ -13,10 +13,18 @@ // #[test] fn map_nissle_against_clbs() { + use needletail::Sequence; + let (sbwt, lcs) = sablast::index::build_sbwt_from_file(&"tests/data/clbS.fna.gz".to_string(), &None); let expected = vec![(455, 967, '+', 513, 1)]; - let got = sablast::find(&"tests/data/NZ_CP058217.1_clbS.fna.gz".to_string(), &sbwt, &lcs); + + let mut reader = needletail::parse_fastx_file("tests/data/NZ_CP058217.1_clbS.fna.gz".to_string()).expect("valid path/file"); + let Some(rec) = reader.next() else { panic!("Couldn't read from tests/data/NZ_CP058217.1_clbS.fna.gz") }; + let seqrec = rec.expect("Valid fastX record"); + let seq = seqrec.normalize(true); + + let got = sablast::find(&seq, &sbwt, &lcs); assert_eq!(got, expected); } From f5697306d7cbc770abe6bbe7b96fea4195da99c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Wed, 25 Sep 2024 13:17:06 +0300 Subject: [PATCH 077/103] Handle fragmented queries in sablast find. --- src/main.rs | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/src/main.rs b/src/main.rs index 00f356e..35e36b0 100644 --- a/src/main.rs +++ b/src/main.rs @@ -81,24 +81,26 @@ fn main() { let (sbwt, lcs) = sablast::index::load_sbwt(index_prefix.as_ref().unwrap()); info!("Querying SBWT index..."); - println!("query\tref\tq.start\tq.end\tstrand\tlength\tmismatches"); + println!("query\tref\tq.start\tq.end\tstrand\tlength\tmismatches\tin.contig"); seq_files.par_iter().for_each(|file| { let mut reader = needletail::parse_fastx_file(file).expect("valid path/file"); - let Some(rec) = reader.next() else { panic!("Invalid query {}", file); }; - let seqrec = rec.expect("Valid fastX record"); - let seq = seqrec.normalize(true); - - // Get local alignments - let run_lengths = sablast::find(&seq, &sbwt, &lcs); - - // Print results with query and ref name added - run_lengths.iter().for_each(|x| { - let stdout = std::io::stdout(); - let _ = writeln!(&mut stdout.lock(), - "{}\t{}\t{}\t{}\t{}\t{}\t{}", - file, index_prefix.as_ref().unwrap(), x.0, x.1, x.2, x.3, x.4); - }); + while let Some(rec) = reader.next() { + let seqrec = rec.expect("Valid fastX record"); + let seq = seqrec.normalize(true); + let contig = seqrec.id(); + + // Get local alignments + let run_lengths = sablast::find(&seq, &sbwt, &lcs); + + // Print results with query and ref name added + run_lengths.iter().for_each(|x| { + let stdout = std::io::stdout(); + let _ = writeln!(&mut stdout.lock(), + "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}", + file, index_prefix.as_ref().unwrap(), x.0, x.1, x.2, x.3, x.4, std::str::from_utf8(contig).expect("UTF-8")); + }); + } }); }, None => {} From 872b97aee261ad18dd2b84f3134af88835500851 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Wed, 25 Sep 2024 13:27:44 +0300 Subject: [PATCH 078/103] Don't add strand in find. --- src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 81db24a..b218475 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -189,7 +189,7 @@ pub fn find( query_seq: &[u8], sbwt: &sbwt::SbwtIndexVariant, lcs: &sbwt::LcsArray, -) -> Vec<(usize, usize, char, usize, usize)> { +) -> Vec<(usize, usize, usize, usize)> { let aln = matches(query_seq, sbwt, lcs); - format::run_lengths(&aln).iter().map(|x| (x.0, x.1, '+', x.2 + x.3, x.3)).collect() + format::run_lengths(&aln) } From f2a69b2d6f44e507747a2ec691ffd5f5c35ce29c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Wed, 25 Sep 2024 13:27:51 +0300 Subject: [PATCH 079/103] Add back reverse complements to sablast find. --- src/main.rs | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/main.rs b/src/main.rs index 35e36b0..99f87d3 100644 --- a/src/main.rs +++ b/src/main.rs @@ -87,11 +87,17 @@ fn main() { let mut reader = needletail::parse_fastx_file(file).expect("valid path/file"); while let Some(rec) = reader.next() { let seqrec = rec.expect("Valid fastX record"); - let seq = seqrec.normalize(true); let contig = seqrec.id(); + let seq = seqrec.normalize(true); + + // Get local alignments for forward strand + let mut run_lengths: Vec<(usize, usize, char, usize, usize)> = sablast::find(&seq, &sbwt, &lcs).iter().map(|x| (x.0, x.1, '+', x.2 + x.3, x.3)).collect(); + + // Add local alignments for reverse _complement + run_lengths.append(&mut sablast::find(&seq.reverse_complement(), &sbwt, &lcs).iter().map(|x| (x.0, x.1, '-', x.2 + x.3, x.3)).collect()); - // Get local alignments - let run_lengths = sablast::find(&seq, &sbwt, &lcs); + // Sort by q.start + run_lengths.sort_by_key(|x| x.0); // Print results with query and ref name added run_lengths.iter().for_each(|x| { From f0063d8f030aa9ec3885676ccae159f8c41b809f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Wed, 25 Sep 2024 13:46:13 +0300 Subject: [PATCH 080/103] Update documentation --- src/lib.rs | 63 ++++++++++++++++++------------------------------------ 1 file changed, 21 insertions(+), 42 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index b218475..5f3e1b2 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -89,17 +89,15 @@ pub fn build( /// Matches a query fasta or fastq file against an SBWT index. /// -/// Reads the sequence data in `query_file` and matches it and its -/// reverse complement against the SBWT index `sbwt` and its LCS array -/// `lcs` using [index::query_sbwt]. Then, derandomizes the resulting -/// _k_-bounded matching statistics vector using -/// [derandomize::derandomize_ms_vec] and translates the matching -/// statistics to a character representation of the alignment using -/// [translate::translate_ms_vec]. -/// -/// Returns a tuple of two vectors, the first one containing the -/// character representation of the alignment for the canonical strand -/// of the query sequence and the second for its reverse complement. +/// Queries the sequence data in `query_seq` against the SBWT index +/// `sbwt` and its LCS array `lcs` using [index::query_sbwt]. Then, +/// derandomizes the resulting _k_-bounded matching statistics vector +/// using [derandomize::derandomize_ms_vec] and translates the +/// matching statistics to a character representation of the alignment +/// using [translate::translate_ms_vec]. +/// +/// Returns a vector containing the character representation of the +/// alignment. /// /// Panics if the query file is not readable or if it's not a valid /// FASTX file. @@ -107,17 +105,8 @@ pub fn build( /// # Output format /// See the documentation for [translate]. /// -/// # Input format detection -/// The sequence data is read using -/// [needletail::parser::parse_fastx_file](https://docs.rs/needletail/latest/needletail/parser/fn.parse_fastx_file.html). -/// -/// Input file format (fasta or fastq) is detected automatically and -/// the files may be compressed in a -/// [DEFLATE-based](https://en.wikipedia.org/wiki/Deflate) format (.gz -/// files). -/// /// # Example -/// ```compile_fail +/// ```rust /// use sablast::build; /// use sablast::matches; /// use sablast::index::BuildOpts; @@ -125,9 +114,9 @@ pub fn build( /// let reference = vec!["tests/data/clbS.fna.gz".to_string()]; /// let (sbwt, lcs) = build(&reference, BuildOpts::default()); /// -/// let query_file = "tests/data/NZ_CP058217.1_clbS.fna.gz"; +/// let query = vec![b'G',b'T',b'G',b'A',b'C',b'T',b'A',b'T',b'G',b'A',b'G',b'G',b'A',b'T']; /// -/// let ms_vectors = matches(query_file, &sbwt, &lcs); +/// let ms_vectors = matches(&query, &sbwt, &lcs); /// ``` /// pub fn matches( @@ -149,30 +138,20 @@ pub fn matches( /// Finds the _k_-mers from an SBWT index in a query fasta or fastq file. /// -/// Aligns the sequence data and its reverse complement in `query` -/// against the SBWT index `sbwt` and its LCS array `lcs` using -/// [matches]. Then uses [format::run_lengths] to extract the local -/// alignments from the matching statistics. +/// Aligns the sequence data in `query_seq` against the SBWT index +/// `sbwt` and its LCS array `lcs` using [matches]. Then uses +/// [format::run_lengths] to extract the local alignments from the +/// matching statistics. /// /// Returns a vector of tuples, where each element represents a local /// alignment block and contains the following values: /// 1. Start of local alignment block in query (1-based indexing). /// 2. End of local alignment block in query. -/// 3. Whether local alignmentis on the original sequence ('+') or its reverse complement ('-'). -/// 4. Total length of the local alignment block. -/// 5. Number of mismatching characters and 1-character insertions in the block. -/// -/// # Input format detection -/// The sequence data is read using -/// [needletail::parser::parse_fastx_file](https://docs.rs/needletail/latest/needletail/parser/fn.parse_fastx_file.html). -/// -/// Input file format (fasta or fastq) is detected automatically and -/// the files may be compressed in a -/// [DEFLATE-based](https://en.wikipedia.org/wiki/Deflate) format (.gz -/// files). +/// 3. Number of matches in the block. +/// 4. Number of mismatches and 1-character insertions in the block. /// /// # Examples -/// ```compile_fail +/// ```rust /// use sablast::build; /// use sablast::find; /// use sablast::index::BuildOpts; @@ -180,9 +159,9 @@ pub fn matches( /// let reference = vec!["tests/data/clbS.fna.gz".to_string()]; /// let (sbwt, lcs) = build(&reference, BuildOpts::default()); /// -/// let query_file = "tests/data/NZ_CP058217.1_clbS.fna.gz"; +/// let query = vec![b'G',b'T',b'G',b'A',b'C',b'T',b'A',b'T',b'G',b'A',b'G',b'G',b'A',b'T']; /// -/// let local_alignments = find(query_file, &sbwt, &lcs); +/// let local_alignments = find(&query, &sbwt, &lcs); /// ``` /// pub fn find( From 989e4e8665dd69b859b780e06b5200d4550c2551 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Wed, 25 Sep 2024 13:46:43 +0300 Subject: [PATCH 081/103] Update test. --- tests/map_clbs.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/map_clbs.rs b/tests/map_clbs.rs index 76872ed..911c3fe 100644 --- a/tests/map_clbs.rs +++ b/tests/map_clbs.rs @@ -17,7 +17,7 @@ fn map_nissle_against_clbs() { let (sbwt, lcs) = sablast::index::build_sbwt_from_file(&"tests/data/clbS.fna.gz".to_string(), &None); - let expected = vec![(455, 967, '+', 513, 1)]; + let expected = vec![(455, 967, 512, 1)]; let mut reader = needletail::parse_fastx_file("tests/data/NZ_CP058217.1_clbS.fna.gz".to_string()).expect("valid path/file"); let Some(rec) = reader.next() else { panic!("Couldn't read from tests/data/NZ_CP058217.1_clbS.fna.gz") }; From 24d62bd3d5dba916b34aedee024a0824ec36fda0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Wed, 25 Sep 2024 13:57:40 +0300 Subject: [PATCH 082/103] Update examples --- src/lib.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 5f3e1b2..f2aed4a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -111,8 +111,8 @@ pub fn build( /// use sablast::matches; /// use sablast::index::BuildOpts; /// -/// let reference = vec!["tests/data/clbS.fna.gz".to_string()]; -/// let (sbwt, lcs) = build(&reference, BuildOpts::default()); +/// let reference: Vec> = vec![vec![b'A',b'A',b'A',b'G',b'A',b'A',b'C',b'C',b'A',b'-',b'T',b'C',b'A',b'G',b'G',b'G',b'C',b'G']]; +/// let (sbwt, lcs) = build(&reference, BuildOpts{ k: 3, ..Default::default() }); /// /// let query = vec![b'G',b'T',b'G',b'A',b'C',b'T',b'A',b'T',b'G',b'A',b'G',b'G',b'A',b'T']; /// @@ -156,8 +156,8 @@ pub fn matches( /// use sablast::find; /// use sablast::index::BuildOpts; /// -/// let reference = vec!["tests/data/clbS.fna.gz".to_string()]; -/// let (sbwt, lcs) = build(&reference, BuildOpts::default()); +/// let reference: Vec> = vec![vec![b'A',b'A',b'A',b'G',b'A',b'A',b'C',b'C',b'A',b'-',b'T',b'C',b'A',b'G',b'G',b'G',b'C',b'G']]; +/// let (sbwt, lcs) = build(&reference, BuildOpts{ k: 3, ..Default::default() }); /// /// let query = vec![b'G',b'T',b'G',b'A',b'C',b'T',b'A',b'T',b'G',b'A',b'G',b'G',b'A',b'T']; /// From 4421615c34fd87114c78e566b43fcf1d6d834048 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Wed, 25 Sep 2024 13:57:53 +0300 Subject: [PATCH 083/103] Take seq data instead of file names in build --- src/lib.rs | 22 +++------------------- 1 file changed, 3 insertions(+), 19 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index f2aed4a..d6d9e6e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -11,8 +11,6 @@ // the MIT license, or , // at your option. // -use log::info; -use needletail::Sequence; use sbwt::SbwtIndexVariant; pub mod derandomize; @@ -61,30 +59,16 @@ pub mod translate; /// use sablast::build; /// use sablast::index::BuildOpts; /// -/// let inputs = vec!["tests/data/clbS.fna.gz".to_string(), "tests/data/NZ_CP058217.1_clbS.fna.gz".to_string()]; +/// let inputs: Vec> = vec![vec![b'A',b'A',b'A',b'G',b'A',b'A',b'C',b'C',b'A',b'-',b'T',b'C',b'A',b'G',b'G',b'G',b'C',b'G']]; /// /// let (sbwt_index, lcs_array) = build(&inputs, BuildOpts::default()); /// ``` /// pub fn build( - seq_files: &[String], + seq_data: &[Vec], build_opts: index::BuildOpts, ) -> (sbwt::SbwtIndexVariant, sbwt::LcsArray) { - let mut seq_data: Vec> = Vec::new(); - seq_files.iter().for_each(|file| { - let mut reader = needletail::parse_fastx_file(file).unwrap_or_else(|_| panic!("Expected valid fastX file at {}", file)); - loop { - let rec = reader.next(); - match rec { - Some(Ok(seqrec)) => { - seq_data.push(seqrec.normalize(true).as_ref().to_vec()); - }, - _ => break - } - } - }); - - index::build_sbwt_from_vecs(&seq_data, &Some(build_opts)) + index::build_sbwt_from_vecs(seq_data, &Some(build_opts)) } /// Matches a query fasta or fastq file against an SBWT index. From 96a8de9f432520ded816c0bbf440d763ee7903eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Wed, 25 Sep 2024 13:58:07 +0300 Subject: [PATCH 084/103] Read input files in main. --- src/main.rs | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/src/main.rs b/src/main.rs index 99f87d3..ce7c9b2 100644 --- a/src/main.rs +++ b/src/main.rs @@ -57,7 +57,21 @@ fn main() { }; info!("Building SBWT index from {} files...", seq_files.len()); - let (sbwt, lcs) = sablast::build(seq_files, sbwt_build_options); + let mut seq_data: Vec> = Vec::new(); + seq_files.iter().for_each(|file| { + let mut reader = needletail::parse_fastx_file(file).unwrap_or_else(|_| panic!("Expected valid fastX file at {}", file)); + loop { + let rec = reader.next(); + match rec { + Some(Ok(seqrec)) => { + seq_data.push(seqrec.normalize(true).as_ref().to_vec()); + }, + _ => break + } + } + }); + + let (sbwt, lcs) = sablast::build(&seq_data, sbwt_build_options); info!("Serializing SBWT index to {}.sbwt ...", output_prefix.as_ref().unwrap()); info!("Serializing LCS array to {}.lcs ...", output_prefix.as_ref().unwrap()); From 85661e7122f3188184bd736284d94a0f0ff07df7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Wed, 25 Sep 2024 13:59:13 +0300 Subject: [PATCH 085/103] Move input format detection docs to main.rs --- src/lib.rs | 14 -------------- src/main.rs | 15 +++++++++++++++ 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index d6d9e6e..9420a43 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -40,20 +40,6 @@ pub mod translate; /// Panics if a file in `seq_files` is not readable or a valid FASTX /// file. /// -/// # Input format detection -/// The sequence data is read using -/// [needletail::parser::parse_fastx_file](https://docs.rs/needletail/latest/needletail/parser/fn.parse_fastx_file.html). -/// -/// Input file format (fasta or fastq) is detected automatically and -/// the files may be compressed in a -/// [DEFLATE-based](https://en.wikipedia.org/wiki/Deflate) format (.gz -/// files). -/// -/// [Bzip2](https://sourceware.org/bzip2/) and -/// [liblzma](https://tukaani.org/xz/) compression (.bz2 and .xz -/// files) can be enabled using the needletail features field in -/// sablast Cargo.toml if compiling from source. -/// /// # Examples /// ```rust /// use sablast::build; diff --git a/src/main.rs b/src/main.rs index ce7c9b2..6b8fc81 100644 --- a/src/main.rs +++ b/src/main.rs @@ -34,6 +34,21 @@ fn init_log(log_max_level: usize) { } /// Use `sablast` to list the available commands or `sablast ` to run. +/// +/// # Input format detection +/// The sequence data is read using +/// [needletail::parser::parse_fastx_file](https://docs.rs/needletail/latest/needletail/parser/fn.parse_fastx_file.html). +/// +/// Input file format (fasta or fastq) is detected automatically and +/// the files may be compressed in a +/// [DEFLATE-based](https://en.wikipedia.org/wiki/Deflate) format (.gz +/// files). +/// +/// [Bzip2](https://sourceware.org/bzip2/) and +/// [liblzma](https://tukaani.org/xz/) compression (.bz2 and .xz +/// files) can be enabled using the needletail features field in +/// sablast Cargo.toml if compiling from source. +/// fn main() { let cli = cli::Cli::parse(); From 8b3d6586cca1fa253398027e8f79c92797128894 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Wed, 25 Sep 2024 14:12:20 +0300 Subject: [PATCH 086/103] Remove build_sbwt_from_file and the needletail wrapper. --- src/index.rs | 71 ---------------------------------------------------- 1 file changed, 71 deletions(-) diff --git a/src/index.rs b/src/index.rs index b8c72bd..2a1294e 100644 --- a/src/index.rs +++ b/src/index.rs @@ -16,7 +16,6 @@ use std::ffi::OsString; use std::io::Write; use std::path::PathBuf; -use needletail::Sequence; use sbwt::BitPackedKmerSorting; use sbwt::SbwtIndexBuilder; use sbwt::SbwtIndexVariant; @@ -66,76 +65,6 @@ impl Default for BuildOpts { } } -struct FastxStreamer { - inner: Box, - record: Vec -} - -impl sbwt::SeqStream for FastxStreamer { - fn stream_next(&mut self) -> Option<&[u8]> { - let rec = self.inner.next(); - match rec { - Some(Ok(seqrec)) => { - // Remove newlines and non IUPAC characters - let normalized = seqrec.normalize(true); - self.record = normalized.as_ref().to_vec(); - Some(&self.record) - }, - _ => None, - } - } -} - -/// Builds an SBWT index and its LCS array from a fasta or fastq file. -/// -/// Streams all valid DNA sequences from `infile` to the SBWT API -/// calls to build the SBWT index and LCS array. Use the [BuildOpts] -/// argument `build_options` to control the options and resources -/// passed to the index builder. -/// -/// Returns a tuple containing the SBWT index and the LCS array. -/// -/// Requires write access to some temporary directory. Path can be set -/// using temp_dir in BuildOpts; defaults to $TMPDIR on Unix if not set. -/// -/// # Examples -/// ```rust -/// use sablast::index::*; -/// -/// // Inputs -/// let reference_file = "tests/data/clbS.fna.gz"; -/// -/// // Build the SBWT -/// let (sbwt, lcs) = build_sbwt_from_file(&reference_file, &Some(BuildOpts::default())); -/// ``` -/// -pub fn build_sbwt_from_file( - infile: &str, - build_options: &Option, -) -> (sbwt::SbwtIndexVariant, sbwt::LcsArray) { - // Get temp dir path from build_options, otherwise use whatever std::env::temp_dir() returns - let build_opts = if build_options.is_some() { build_options.clone().unwrap() } else { BuildOpts::default() }; - let temp_dir = if build_opts.temp_dir.is_some() { build_opts.temp_dir.unwrap() } else { std::env::temp_dir().to_str().unwrap().to_string() }; - - let algorithm = BitPackedKmerSorting::new() - .mem_gb(build_opts.mem_gb) - .dedup_batches(false) - .temp_dir(PathBuf::from(OsString::from(temp_dir)).as_path()); - - let reader = FastxStreamer{inner: needletail::parse_fastx_file(infile).expect("valid path/file"), record: Vec::new()}; - - let (sbwt, lcs) = SbwtIndexBuilder::new() - .k(build_opts.k) - .n_threads(build_opts.num_threads) - .add_rev_comp(build_opts.add_revcomp) - .algorithm(algorithm) - .build_lcs(true) - .precalc_length(build_opts.prefix_precalc) - .run(reader); - - (SbwtIndexVariant::SubsetMatrix(sbwt), lcs.unwrap()) -} - /// Builds an SBWT index and its LCS array from sequences in memory. /// /// Passes all character sequences in `slices` to the SBWT API calls From 5a8d8c7dec9ca43c1d62c4f96d2b977f14de0219 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Wed, 25 Sep 2024 14:14:29 +0300 Subject: [PATCH 087/103] Update --- tests/map_clbs.rs | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/tests/map_clbs.rs b/tests/map_clbs.rs index 911c3fe..0b748f3 100644 --- a/tests/map_clbs.rs +++ b/tests/map_clbs.rs @@ -15,7 +15,19 @@ fn map_nissle_against_clbs() { use needletail::Sequence; - let (sbwt, lcs) = sablast::index::build_sbwt_from_file(&"tests/data/clbS.fna.gz".to_string(), &None); + let mut seq_data: Vec> = Vec::new(); + let mut reader = needletail::parse_fastx_file("tests/data/clbS.fna.gz".to_string()).unwrap_or_else(|_| panic!("Expected valid fastX file at tests/data/clbS.fna.gz")); + loop { + let rec = reader.next(); + match rec { + Some(Ok(seqrec)) => { + seq_data.push(seqrec.normalize(true).as_ref().to_vec()); + }, + _ => break + } + } + + let (sbwt, lcs) = sablast::index::build_sbwt_from_vecs(&seq_data, &None); let expected = vec![(455, 967, 512, 1)]; From f6c6991d9f5b464fb1b23a1623f95e7d584b4bd7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Wed, 25 Sep 2024 14:44:19 +0300 Subject: [PATCH 088/103] Add todo to fix a bug --- src/translate.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/translate.rs b/src/translate.rs index be4b00d..45726dd 100644 --- a/src/translate.rs +++ b/src/translate.rs @@ -245,7 +245,8 @@ pub fn translate_ms_vec( let len = derand_ms.len(); let mut res = vec![' '; len]; - // Traverse the derandomized matchibng statistics + // Traverse the derandomized matching statistics + // TODO if the first character matches this currently marks it incorrectly as '-' for pos in 0..len { let prev: i64 = if pos > 1 { derand_ms[pos - 1] } else { 31 }; let curr: i64 = derand_ms[pos]; From f3461245c6c20d04af747e35d1311b66cedc9b4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Wed, 25 Sep 2024 15:20:43 +0300 Subject: [PATCH 089/103] Implement sablast map to map a query to a reference. --- src/cli.rs | 20 ++++++++++++++++++++ src/format.rs | 28 ++++++++++++++++++++++++++++ src/lib.rs | 16 ++++++++++++++++ src/main.rs | 39 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 103 insertions(+) diff --git a/src/cli.rs b/src/cli.rs index 8430cbf..aef4557 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -68,4 +68,24 @@ pub enum Commands { #[arg(long = "verbose", default_value_t = false)] verbose: bool, }, + + // Map a query or queries to a reference and return the alignment + Map { + // Input fasta or fastq query file(s) + #[arg(group = "input", required = true)] + query_files: Vec, + + // Reference fasta + #[arg(short = 'r', long = "reference", required = true, help_heading = "Input")] + ref_file: String, + + // Resources + // // Threads + #[arg(short = 't', long = "threads", default_value_t = 1)] + num_threads: usize, + + // Verbosity + #[arg(long = "verbose", default_value_t = false)] + verbose: bool, + }, } diff --git a/src/format.rs b/src/format.rs index d6e305c..ec3df1a 100644 --- a/src/format.rs +++ b/src/format.rs @@ -38,6 +38,34 @@ pub fn run_lengths( encodings } +pub fn run_lengths2( + aln: &[char], +) -> Vec<(usize, usize, usize, usize)> { + // Store run lengths as Vec<(start, end, matches, mismatches)> + let mut encodings: Vec<(usize, usize, usize, usize)> = Vec::new(); + + let mut i = 0; + let mut match_start: bool = false; + while i < aln.len() { + match_start = (aln[i] != '-' && aln[i] != ' ') && !match_start; + let mut discont: bool = false; + if match_start { + let start = i; + let mut matches: usize = 0; + while i < aln.len() && (aln[i] != '-' && aln[i] != ' ' && !discont) { + matches += (aln[i] == 'M' || aln[i] == 'R') as usize; + i += 1; + discont = aln[i - 1] == 'R'; + } + encodings.push((start + 1, i, matches, i - start - matches)); + match_start = false; + } else { + i += 1; + } + } + encodings +} + //////////////////////////////////////////////////////////////////////////////// // Tests // diff --git a/src/lib.rs b/src/lib.rs index 9420a43..8b21359 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -106,6 +106,22 @@ pub fn matches( translate::translate_ms_vec(&derand_ms, k, threshold) } +/// Maps a query sequence against a reference sequence. +/// +/// Builds an SBWT index from the sequence data in `query_seq` and +/// maps the data in `ref_seq` against it. +/// +pub fn map( + query_seq: &[u8], + ref_seq: &[u8], +) -> Vec { + let opts = index::BuildOpts { add_revcomp: true, ..Default::default() }; + let (sbwt_query, lcs_query) = index::build_sbwt_from_vecs(&[query_seq.to_vec()], &Some(opts)); + let aln = matches(ref_seq, &sbwt_query, &lcs_query); + + ref_seq.iter().zip(aln.iter()).map(|x| if *x.1 == 'M' || *x.1 == 'R' { *x.0 } else { b'-' }).collect() +} + /// Finds the _k_-mers from an SBWT index in a query fasta or fastq file. /// /// Aligns the sequence data in `query_seq` against the SBWT index diff --git a/src/main.rs b/src/main.rs index 6b8fc81..add0bf1 100644 --- a/src/main.rs +++ b/src/main.rs @@ -138,6 +138,45 @@ fn main() { } }); }, + Some(cli::Commands::Map { + query_files, + ref_file, + num_threads, + verbose, + }) => { + init_log(if *verbose { 2 } else { 1 }); + rayon::ThreadPoolBuilder::new() + .num_threads(*num_threads) + .thread_name(|i| format!("rayon-thread-{}", i)) + .build_global() + .unwrap(); + + // Ref will be concatenated, TODO could use pairs + let mut ref_data: Vec = Vec::new(); + let mut ref_reader = needletail::parse_fastx_file(ref_file).unwrap_or_else(|_| panic!("Expected valid fastX file at {}", ref_file)); + loop { + let rec = ref_reader.next(); + match rec { + Some(Ok(seqrec)) => { + ref_data.append(&mut seqrec.normalize(true).as_ref().to_vec()); + }, + _ => break + } + } + + query_files.par_iter().for_each(|file| { + let mut reader = needletail::parse_fastx_file(file).expect("valid path/file"); + while let Some(rec) = reader.next() { + let seqrec = rec.expect("Valid fastX record"); + let contig = seqrec.id(); + let query_data = seqrec.normalize(true); + + let res = sablast::map(&query_data, &ref_data); + println!(">{}", std::str::from_utf8(contig).expect("UTF-8")); + println!("{}", std::str::from_utf8(&res).expect("UTF-8")); + } + }); + }, None => {} } } From 112956a0ac4766ee0f9066a43fc6ed7475c4e950 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Wed, 25 Sep 2024 16:04:32 +0300 Subject: [PATCH 090/103] Only a single file at a time to sablast map. --- src/cli.rs | 2 +- src/main.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cli.rs b/src/cli.rs index aef4557..4d373ab 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -73,7 +73,7 @@ pub enum Commands { Map { // Input fasta or fastq query file(s) #[arg(group = "input", required = true)] - query_files: Vec, + query_file: String, // Reference fasta #[arg(short = 'r', long = "reference", required = true, help_heading = "Input")] diff --git a/src/main.rs b/src/main.rs index add0bf1..41d7e10 100644 --- a/src/main.rs +++ b/src/main.rs @@ -139,7 +139,7 @@ fn main() { }); }, Some(cli::Commands::Map { - query_files, + query_file , ref_file, num_threads, verbose, From f66759ee17874a348317e436a4246aa7073e67df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Wed, 25 Sep 2024 16:05:26 +0300 Subject: [PATCH 091/103] Move formatting to format.rs --- src/format.rs | 7 +++++++ src/lib.rs | 2 ++ 2 files changed, 9 insertions(+) diff --git a/src/format.rs b/src/format.rs index ec3df1a..eacf721 100644 --- a/src/format.rs +++ b/src/format.rs @@ -38,6 +38,13 @@ pub fn run_lengths( encodings } +pub fn relative_to_ref( + ref_seq: &[u8], + alignment: &Vec, +) -> Vec { + ref_seq.iter().zip(alignment.iter()).map(|x| if *x.1 == 'M' || *x.1 == 'R' { *x.0 } else { b'-' }).collect() +} + pub fn run_lengths2( aln: &[char], ) -> Vec<(usize, usize, usize, usize)> { diff --git a/src/lib.rs b/src/lib.rs index 8b21359..c1e6adb 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -120,6 +120,8 @@ pub fn map( let aln = matches(ref_seq, &sbwt_query, &lcs_query); ref_seq.iter().zip(aln.iter()).map(|x| if *x.1 == 'M' || *x.1 == 'R' { *x.0 } else { b'-' }).collect() + let aln = matches(ref_seq, &query_sbwt, &query_lcs); + format::relative_to_ref(ref_seq, &aln) } /// Finds the _k_-mers from an SBWT index in a query fasta or fastq file. From 5ce8b3e420689db5d99ca2f065bf2119309179a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Wed, 25 Sep 2024 16:05:42 +0300 Subject: [PATCH 092/103] Read all data from query in sablast map, only take 1 query at a time --- src/lib.rs | 8 ++------ src/main.rs | 35 ++++++++++++++++++++++------------- 2 files changed, 24 insertions(+), 19 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index c1e6adb..e9c1705 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -112,14 +112,10 @@ pub fn matches( /// maps the data in `ref_seq` against it. /// pub fn map( - query_seq: &[u8], ref_seq: &[u8], + query_sbwt: &sbwt::SbwtIndexVariant, + query_lcs: &sbwt::LcsArray, ) -> Vec { - let opts = index::BuildOpts { add_revcomp: true, ..Default::default() }; - let (sbwt_query, lcs_query) = index::build_sbwt_from_vecs(&[query_seq.to_vec()], &Some(opts)); - let aln = matches(ref_seq, &sbwt_query, &lcs_query); - - ref_seq.iter().zip(aln.iter()).map(|x| if *x.1 == 'M' || *x.1 == 'R' { *x.0 } else { b'-' }).collect() let aln = matches(ref_seq, &query_sbwt, &query_lcs); format::relative_to_ref(ref_seq, &aln) } diff --git a/src/main.rs b/src/main.rs index 41d7e10..1b87f1f 100644 --- a/src/main.rs +++ b/src/main.rs @@ -151,31 +151,40 @@ fn main() { .build_global() .unwrap(); - // Ref will be concatenated, TODO could use pairs - let mut ref_data: Vec = Vec::new(); + let mut ref_data: Vec> = Vec::new(); let mut ref_reader = needletail::parse_fastx_file(ref_file).unwrap_or_else(|_| panic!("Expected valid fastX file at {}", ref_file)); loop { let rec = ref_reader.next(); match rec { Some(Ok(seqrec)) => { - ref_data.append(&mut seqrec.normalize(true).as_ref().to_vec()); + ref_data.push(seqrec.normalize(true).as_ref().to_vec()); }, _ => break } } - query_files.par_iter().for_each(|file| { - let mut reader = needletail::parse_fastx_file(file).expect("valid path/file"); - while let Some(rec) = reader.next() { - let seqrec = rec.expect("Valid fastX record"); - let contig = seqrec.id(); - let query_data = seqrec.normalize(true); - - let res = sablast::map(&query_data, &ref_data); - println!(">{}", std::str::from_utf8(contig).expect("UTF-8")); - println!("{}", std::str::from_utf8(&res).expect("UTF-8")); + let mut query_data: Vec> = Vec::new(); + let mut query_reader = needletail::parse_fastx_file(query_file).unwrap_or_else(|_| panic!("Expected valid fastX file at {}", query_file)); + loop { + let rec = query_reader.next(); + match rec { + Some(Ok(seqrec)) => { + query_data.push(seqrec.normalize(true).as_ref().to_vec()); + }, + _ => break } + } + + let opts = sablast::index::BuildOpts { add_revcomp: true, ..Default::default() }; + let (sbwt, lcs) = sablast::index::build_sbwt_from_vecs(&query_data, &Some(opts)); + + let mut res: Vec = Vec::new(); + ref_data.iter().for_each(|ref_contig| { + res.append(&mut sablast::map(&ref_contig, &sbwt, &lcs)); }); + + println!(">{}", query_file); + println!("{}", std::str::from_utf8(&res).expect("UTF-8")); }, None => {} } From 75a0f53d1273b1e078ca42c642d725936c716080 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Wed, 25 Sep 2024 16:24:22 +0300 Subject: [PATCH 093/103] Move implementation of most file reads to read_fastx_file. --- src/main.rs | 54 +++++++++++++++++++++-------------------------------- 1 file changed, 21 insertions(+), 33 deletions(-) diff --git a/src/main.rs b/src/main.rs index 1b87f1f..f126ea7 100644 --- a/src/main.rs +++ b/src/main.rs @@ -22,6 +22,24 @@ use rayon::iter::IntoParallelRefIterator; // Command-line interface mod cli; +// Reads all sequence data from a fastX file +fn read_fastx_file( + file: &str, +) -> Vec> { + let mut seq_data: Vec> = Vec::new(); + let mut reader = needletail::parse_fastx_file(file).unwrap_or_else(|_| panic!("Expected valid fastX file at {}", file)); + loop { + let rec = reader.next(); + match rec { + Some(Ok(seqrec)) => { + seq_data.push(seqrec.normalize(true).as_ref().to_vec()); + }, + _ => break + } + } + seq_data +} + /// Initializes the logger with verbosity given in `log_max_level`. fn init_log(log_max_level: usize) { stderrlog::new() @@ -74,16 +92,7 @@ fn main() { info!("Building SBWT index from {} files...", seq_files.len()); let mut seq_data: Vec> = Vec::new(); seq_files.iter().for_each(|file| { - let mut reader = needletail::parse_fastx_file(file).unwrap_or_else(|_| panic!("Expected valid fastX file at {}", file)); - loop { - let rec = reader.next(); - match rec { - Some(Ok(seqrec)) => { - seq_data.push(seqrec.normalize(true).as_ref().to_vec()); - }, - _ => break - } - } + seq_data.append(&mut read_fastx_file(file)); }); let (sbwt, lcs) = sablast::build(&seq_data, sbwt_build_options); @@ -151,29 +160,8 @@ fn main() { .build_global() .unwrap(); - let mut ref_data: Vec> = Vec::new(); - let mut ref_reader = needletail::parse_fastx_file(ref_file).unwrap_or_else(|_| panic!("Expected valid fastX file at {}", ref_file)); - loop { - let rec = ref_reader.next(); - match rec { - Some(Ok(seqrec)) => { - ref_data.push(seqrec.normalize(true).as_ref().to_vec()); - }, - _ => break - } - } - - let mut query_data: Vec> = Vec::new(); - let mut query_reader = needletail::parse_fastx_file(query_file).unwrap_or_else(|_| panic!("Expected valid fastX file at {}", query_file)); - loop { - let rec = query_reader.next(); - match rec { - Some(Ok(seqrec)) => { - query_data.push(seqrec.normalize(true).as_ref().to_vec()); - }, - _ => break - } - } + let ref_data = read_fastx_file(ref_file); + let query_data = read_fastx_file(query_file); let opts = sablast::index::BuildOpts { add_revcomp: true, ..Default::default() }; let (sbwt, lcs) = sablast::index::build_sbwt_from_vecs(&query_data, &Some(opts)); From b16cf1a641d6e7d36273c2523119aa37f1497c2e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Wed, 25 Sep 2024 16:29:02 +0300 Subject: [PATCH 094/103] Document map --- src/lib.rs | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index e9c1705..d92b70d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -108,8 +108,26 @@ pub fn matches( /// Maps a query sequence against a reference sequence. /// -/// Builds an SBWT index from the sequence data in `query_seq` and -/// maps the data in `ref_seq` against it. +/// Maps the sequence data in `ref_seq` against the SBWT index +/// `query_sbwt` and `query_lcs` and converts the alignment to a +/// mapping relative to `ref_seq`. +/// +/// Return the reference sequence with characters that are not present +/// in the query masked with a '-'. +/// +/// # Examples +/// ```rust +/// use sablast::build; +/// use sablast::map; +/// use sablast::index::BuildOpts; +/// +/// let query: Vec> = vec![vec![b'A',b'A',b'A',b'G',b'A',b'A',b'C',b'C',b'A',b'-',b'T',b'C',b'A',b'G',b'G',b'G',b'C',b'G']]; +/// let (sbwt_query, lcs_query) = build(&query, BuildOpts{ k: 3, ..Default::default() }); +/// +/// let reference = vec![b'G',b'T',b'G',b'A',b'C',b'T',b'A',b'T',b'G',b'A',b'G',b'G',b'A',b'T']; +/// +/// let alignment = map(&reference, &sbwt_query, &lcs_query); +/// ``` /// pub fn map( ref_seq: &[u8], From e72dbc764dc431783761cd38812d9f800bb0d0a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Wed, 25 Sep 2024 16:29:22 +0300 Subject: [PATCH 095/103] Map doesn't use rayon so remove it. --- src/main.rs | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/main.rs b/src/main.rs index f126ea7..7f6ad41 100644 --- a/src/main.rs +++ b/src/main.rs @@ -154,11 +154,6 @@ fn main() { verbose, }) => { init_log(if *verbose { 2 } else { 1 }); - rayon::ThreadPoolBuilder::new() - .num_threads(*num_threads) - .thread_name(|i| format!("rayon-thread-{}", i)) - .build_global() - .unwrap(); let ref_data = read_fastx_file(ref_file); let query_data = read_fastx_file(query_file); From 9d5dcfd14529778ff8bb56c6380307710b99dbbf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Wed, 25 Sep 2024 16:46:11 +0300 Subject: [PATCH 096/103] Fix derandomizing the first matching statistic. --- src/derandomize.rs | 2 +- src/translate.rs | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/derandomize.rs b/src/derandomize.rs index dcd3d75..1512e2a 100644 --- a/src/derandomize.rs +++ b/src/derandomize.rs @@ -221,7 +221,7 @@ pub fn derandomize_ms_vec( // Traverse the matching statistics in reverse. derand_ms[len - 1] = if noisy_ms[len - 1] > threshold { noisy_ms[len - 1]} else { 0 } as i64; - for i in 2..len { + for i in 2..(len + 1) { derand_ms[len - i] = derandomize_ms_val(noisy_ms[len - i], derand_ms[len - i + 1], threshold, k); } diff --git a/src/translate.rs b/src/translate.rs index 45726dd..d048bdf 100644 --- a/src/translate.rs +++ b/src/translate.rs @@ -246,7 +246,6 @@ pub fn translate_ms_vec( let mut res = vec![' '; len]; // Traverse the derandomized matching statistics - // TODO if the first character matches this currently marks it incorrectly as '-' for pos in 0..len { let prev: i64 = if pos > 1 { derand_ms[pos - 1] } else { 31 }; let curr: i64 = derand_ms[pos]; From 4234bf2511d2148df7a9b648b06c10ffc0f20648 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Wed, 25 Sep 2024 16:51:36 +0300 Subject: [PATCH 097/103] Add hidden asserts to documentation tests. --- src/derandomize.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/derandomize.rs b/src/derandomize.rs index 1512e2a..5042cd6 100644 --- a/src/derandomize.rs +++ b/src/derandomize.rs @@ -26,6 +26,7 @@ /// /// # Examples /// ```rust +/// # use assert_approx_eq::assert_approx_eq; /// use sablast::derandomize::log_rm_max_cdf; /// /// let alphabet_size = 4; @@ -33,6 +34,7 @@ /// /// let res = log_rm_max_cdf(10, alphabet_size, n_kmers); /// // `res` is -4.825812199808644 +/// # assert_approx_eq!(res, -4.825812199808644, 1e-8); /// ``` /// pub fn log_rm_max_cdf( @@ -69,6 +71,7 @@ pub fn log_rm_max_cdf( /// /// let threshold = random_match_threshold(k, n_kmers, alphabet_size, max_error_prob); /// // `threshold` is 15 +/// # assert_eq!(threshold, 15); /// ``` pub fn random_match_threshold( k: usize, @@ -116,6 +119,7 @@ pub fn random_match_threshold( /// /// let derand_ms = derandomize_ms_val(3, 3, 2, 3); /// // `derand_ms` is 3 +/// # assert_eq!(derand_ms, 3); /// ``` /// /// ## Noisy MS has only noise @@ -130,6 +134,7 @@ pub fn random_match_threshold( /// /// let derand_ms = derandomize_ms_val(2, -1, 2, 3); /// // `derand_ms` is -2 +/// # assert_eq!(derand_ms, -2); /// ``` /// /// ## Noisy MS is at beginning of a full _k_-mer match @@ -144,6 +149,7 @@ pub fn random_match_threshold( /// /// let derand_ms = derandomize_ms_val(3, -1, 2, 3); /// // `derand_ms` is 3 +/// # assert_eq!(derand_ms, 3); /// ``` /// /// ## Noisy MS is at beginning of a partial _k_-mer match @@ -158,6 +164,7 @@ pub fn random_match_threshold( /// /// let derand_ms = derandomize_ms_val(3, -1, 2, 4); /// // `derand_ms` is 3 +/// # assert_eq!(derand_ms, 3); /// ``` /// pub fn derandomize_ms_val( @@ -205,6 +212,7 @@ pub fn derandomize_ms_val( /// /// let derand_ms = derandomize_ms_vec(&noisy_ms, k, threshold); /// // `derand_ms` has [0,1,2,3,1,2,3,0,1,2,3,-1,0,1,2,3,-1,0] +/// # assert_eq!(derand_ms, vec![0,1,2,3,1,2,3,0,1,2,3,-1,0,1,2,3,-1,0]); /// ``` /// pub fn derandomize_ms_vec( From 2d7149581db25e341716bf3cebc0f18cf40577e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Wed, 25 Sep 2024 16:57:26 +0300 Subject: [PATCH 098/103] Add hidden asserts to documentation tests. --- src/translate.rs | 40 +++++++++++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 9 deletions(-) diff --git a/src/translate.rs b/src/translate.rs index d048bdf..b0d4af9 100644 --- a/src/translate.rs +++ b/src/translate.rs @@ -50,7 +50,10 @@ /// // Testing this pos : | /// // Expected output : M,M,M,M,M,M /// -/// translate_ms_val(1, 2, 3, 2); +/// let translated = translate_ms_val(1, 2, 3, 2); +/// // `translated` has ('M', ' ') +/// # assert_eq!(translated.0, 'M'); +/// # assert_eq!(translated.1, ' '); /// ``` /// /// ## Query with a single mismatch @@ -66,7 +69,10 @@ /// // Testing this pos : | /// // Expected output : M,M,M,X,M,M,M /// -/// translate_ms_val(0, 1, 3, 2); +/// let translated = translate_ms_val(0, 1, 3, 2); +/// // `translated` has ('X', ' ') +/// # assert_eq!(translated.0, 'X'); +/// # assert_eq!(translated.1, ' '); /// ``` /// /// ## Query with a single insertion: @@ -80,7 +86,10 @@ /// // Testing this pos : | /// // Expected output : M,M,M,X,M,M,M /// -/// translate_ms_val(0, 1, 3, 2); +/// let translated = translate_ms_val(0, 1, 3, 2); +/// // `translated` has ('X', ' ') +/// # assert_eq!(translated.0, 'X'); +/// # assert_eq!(translated.1, ' '); /// ``` /// /// Note that this case is identical to the query with a single @@ -99,7 +108,10 @@ /// // Testing this pos : | /// // Expected output : M,M,M, -,-,M,M,M /// -/// translate_ms_val(-1, 0, 3, 2); +/// let translated = translate_ms_val(-1, 0, 3, 2); +/// // `translated` has ('-', ' ') +/// # assert_eq!(translated.0, '-'); +/// # assert_eq!(translated.1, ' '); /// /// ``` /// ## Query with a deletion @@ -116,7 +128,10 @@ /// // Testing this pos : | /// // Expected output : M,M,R,R,M,M /// -/// translate_ms_val(3, 1, 2, 2); +/// let translated = translate_ms_val(3, 1, 2, 2); +/// // `translated` has ('R', 'R') +/// # assert_eq!(translated.0, 'R'); +/// # assert_eq!(translated.1, 'R'); /// ``` /// /// Although in this case two characters have been deleted from the @@ -138,7 +153,10 @@ /// // Testing this pos : | /// // Expected output : M,M,R,R,M,M,M,M,R,R,M,M /// -/// translate_ms_val(3, 1, 3, 2); +/// let translated = translate_ms_val(3, 1, 3, 2); +/// // `translated` has ('R', 'R') +/// # assert_eq!(translated.0, 'R'); +/// # assert_eq!(translated.1, 'R'); /// ``` /// /// Note how the two regions with the consecutive 'R's are similar to @@ -214,7 +232,9 @@ pub fn translate_ms_val( /// // Expected output : X,M,M,R, R,M,M,X,M,M,M, -,-,M,M,M, -,- /// /// let input: Vec = vec![0,1,2,3,1,2,3,0,1,2,3,-1,0,1,2,3,-1,0]; -/// translate_ms_vec(&input, 3, 2); +/// let translated = translate_ms_vec(&input, 3, 2); +/// // `translated` has ['X','M','M','R','R','M','M','X','M','M','M','-','-','M','M','M','-','-'] +/// # assert_eq!(translated, vec!['X','M','M','R','R','M','M','X','M','M','M','-','-','M','M','M','-','-']); /// ``` /// /// ## Translate a MS vector with recombination @@ -229,8 +249,10 @@ pub fn translate_ms_val( /// // Result MS vector : 1,2,3,1,2,3,3,3,3,1,2,3 /// // Expected output : M,M,R,R,M,M,M,M,R,R,M,M /// -/// let input: Vec = vec![0,1,2,3,1,2,3,0,1,2,3,-1,0,1,2,3,-1,0]; -/// translate_ms_vec(&input, 3, 2); +/// let input: Vec = vec![1,2,3,1,2,3,3,3,3,1,2,3,]; +/// let translated = translate_ms_vec(&input, 3, 2); +/// // `translated` has ['M','M','R','R','M','M','M','M','R','R','M','M'] +/// # assert_eq!(translated, vec!['M','M','R','R','M','M','M','M','R','R','M','M']); /// ``` /// pub fn translate_ms_vec( From 0948da9321a708764e41d4a548ed4d66a6106065 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Wed, 25 Sep 2024 17:05:53 +0300 Subject: [PATCH 099/103] Add hidden asserts where possible. --- src/index.rs | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/index.rs b/src/index.rs index 2a1294e..cc9cd9d 100644 --- a/src/index.rs +++ b/src/index.rs @@ -193,6 +193,16 @@ pub fn serialize_sbwt( /// /// // Load index /// let (sbwt_loaded, lcs_loaded) = load_sbwt(&index_prefix); +/// # assert_eq!(lcs, lcs_loaded); +/// # match sbwt_loaded { +/// # sbwt::SbwtIndexVariant::SubsetMatrix(ref loaded) => { +/// # match sbwt_loaded { +/// # sbwt::SbwtIndexVariant::SubsetMatrix(ref built) => { +/// # assert_eq!(built, loaded); +/// # }, +/// # }; +/// # } +/// # } /// ``` /// pub fn load_sbwt( @@ -235,6 +245,8 @@ pub fn load_sbwt( /// /// // Run query /// let ms = query_sbwt(&query, &sbwt, &lcs); +/// // `ms` has [1,2,2,3,2,2,3,2,1,2,3,1,1,1,2,3,1,2] +/// # assert_eq!(ms, vec![1,2,2,3,2,2,3,2,1,2,3,1,1,1,2,3,1,2]); /// ``` /// pub fn query_sbwt( From 7485d652996ca57a3c4feaf4156b8ce9678fdbaa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Wed, 25 Sep 2024 17:11:29 +0300 Subject: [PATCH 100/103] Add hidden asserts. --- src/lib.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/lib.rs b/src/lib.rs index d92b70d..8d7687e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -87,6 +87,8 @@ pub fn build( /// let query = vec![b'G',b'T',b'G',b'A',b'C',b'T',b'A',b'T',b'G',b'A',b'G',b'G',b'A',b'T']; /// /// let ms_vectors = matches(&query, &sbwt, &lcs); +/// // `ms_vectors` has ['-','-','-','-','-','-','-','-','-','M','M','M','-','-'] +/// # assert_eq!(ms_vectors, vec!['-','-','-','-','-','-','-','-','-','M','M','M','-','-']); /// ``` /// pub fn matches( @@ -127,6 +129,8 @@ pub fn matches( /// let reference = vec![b'G',b'T',b'G',b'A',b'C',b'T',b'A',b'T',b'G',b'A',b'G',b'G',b'A',b'T']; /// /// let alignment = map(&reference, &sbwt_query, &lcs_query); +/// // `ms_vectors` has [45,45,45,45,45,45,45,45,45,65,71,71,45,45] +/// # assert_eq!(alignment, vec![45,45,45,45,45,45,45,45,45,65,71,71,45,45]); /// ``` /// pub fn map( @@ -164,6 +168,8 @@ pub fn map( /// let query = vec![b'G',b'T',b'G',b'A',b'C',b'T',b'A',b'T',b'G',b'A',b'G',b'G',b'A',b'T']; /// /// let local_alignments = find(&query, &sbwt, &lcs); +/// // `local_alignments` has [(10, 12, 3, 0)] +/// # assert_eq!(local_alignments, vec![(10, 12, 3, 0)]); /// ``` /// pub fn find( From 1147b8acc6ea71905215750ba22d3425159c9f5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Wed, 25 Sep 2024 18:05:34 +0300 Subject: [PATCH 101/103] Disable compression in needletail. --- Cargo.toml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index ae278c6..50aa935 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,7 +14,9 @@ license = "MIT OR Apache-2.0" [dependencies] ## core -needletail = "0.5.1" +# TODO Re-enable reading compressed sequences in needletail +# This requires resolving the libllzma linker issue in build_artifacts.yml +needletail = { version = "0.5.1", default-features = false } rayon = "1" sbwt = "0.3.1" From b24463be92a2d70d44a4f1284717f840b181ec2d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Wed, 25 Sep 2024 18:05:45 +0300 Subject: [PATCH 102/103] Comment out the test. --- tests/map_clbs.rs | 58 +++++++++++++++++++++++------------------------ 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/tests/map_clbs.rs b/tests/map_clbs.rs index 0b748f3..7402580 100644 --- a/tests/map_clbs.rs +++ b/tests/map_clbs.rs @@ -11,32 +11,32 @@ // the MIT license, or , // at your option. // -#[test] -fn map_nissle_against_clbs() { - use needletail::Sequence; - - let mut seq_data: Vec> = Vec::new(); - let mut reader = needletail::parse_fastx_file("tests/data/clbS.fna.gz".to_string()).unwrap_or_else(|_| panic!("Expected valid fastX file at tests/data/clbS.fna.gz")); - loop { - let rec = reader.next(); - match rec { - Some(Ok(seqrec)) => { - seq_data.push(seqrec.normalize(true).as_ref().to_vec()); - }, - _ => break - } - } - - let (sbwt, lcs) = sablast::index::build_sbwt_from_vecs(&seq_data, &None); - - let expected = vec![(455, 967, 512, 1)]; - - let mut reader = needletail::parse_fastx_file("tests/data/NZ_CP058217.1_clbS.fna.gz".to_string()).expect("valid path/file"); - let Some(rec) = reader.next() else { panic!("Couldn't read from tests/data/NZ_CP058217.1_clbS.fna.gz") }; - let seqrec = rec.expect("Valid fastX record"); - let seq = seqrec.normalize(true); - - let got = sablast::find(&seq, &sbwt, &lcs); - - assert_eq!(got, expected); -} +// #[test] +// fn map_nissle_against_clbs() { +// use needletail::Sequence; + +// let mut seq_data: Vec> = Vec::new(); +// let mut reader = needletail::parse_fastx_file("tests/data/clbS.fna.gz".to_string()).unwrap_or_else(|_| panic!("Expected valid fastX file at tests/data/clbS.fna.gz")); +// loop { +// let rec = reader.next(); +// match rec { +// Some(Ok(seqrec)) => { +// seq_data.push(seqrec.normalize(true).as_ref().to_vec()); +// }, +// _ => break +// } +// } + +// let (sbwt, lcs) = sablast::index::build_sbwt_from_vecs(&seq_data, &None); + +// let expected = vec![(455, 967, 512, 1)]; + +// let mut reader = needletail::parse_fastx_file("tests/data/NZ_CP058217.1_clbS.fna.gz".to_string()).expect("valid path/file"); +// let Some(rec) = reader.next() else { panic!("Couldn't read from tests/data/NZ_CP058217.1_clbS.fna.gz") }; +// let seqrec = rec.expect("Valid fastX record"); +// let seq = seqrec.normalize(true); + +// let got = sablast::find(&seq, &sbwt, &lcs); + +// assert_eq!(got, expected); +// } From 8ba9862843aeb940c13743098a591e1765584206 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tommi=20M=C3=A4klin?= Date: Wed, 25 Sep 2024 18:05:58 +0300 Subject: [PATCH 103/103] Add reminder to implement refine_translation. --- src/translate.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/translate.rs b/src/translate.rs index b0d4af9..c49cc18 100644 --- a/src/translate.rs +++ b/src/translate.rs @@ -287,6 +287,14 @@ pub fn translate_ms_vec( res } +// TODO Implement refining the translated MS vectors +// 1. Search for Xs +// 2. Extract 2*k+1 region centered on the X. +// 3. Map region to query. +// 4. Resolve SNP vs insertion. +// 5. If SNP get the base. +pub fn refine_translation() { todo!() } + //////////////////////////////////////////////////////////////////////////////// // Tests //