From 2f4c623653091e5488db26b61211d553cab6ac45 Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Wed, 6 Sep 2023 15:55:49 +0200 Subject: [PATCH 1/6] fix: return best alignment when band extension hits max_band_limit --- packages_rs/nextclade/src/align/align.rs | 33 ++++++++++++------- .../nextclade/src/align/seed_alignment.rs | 11 ++----- 2 files changed, 24 insertions(+), 20 deletions(-) diff --git a/packages_rs/nextclade/src/align/align.rs b/packages_rs/nextclade/src/align/align.rs index 705f295a3..c2b2b849b 100644 --- a/packages_rs/nextclade/src/align/align.rs +++ b/packages_rs/nextclade/src/align/align.rs @@ -1,3 +1,5 @@ +use std::cmp::max; + use crate::align::backtrace::{backtrace, AlignmentOutput}; use crate::align::band_2d::Stripe; use crate::align::band_2d::{full_matrix, simple_stripes}; @@ -9,6 +11,7 @@ use crate::alphabet::aa::Aa; use crate::alphabet::letter::Letter; use crate::alphabet::nuc::Nuc; use crate::make_error; +use bio::alignment; use eyre::{Report, WrapErr}; use log::{info, trace}; @@ -62,28 +65,36 @@ pub fn align_nuc( let mut terminal_bandwidth = params.terminal_bandwidth as isize; let mut excess_bandwidth = params.excess_bandwidth as isize; - let mut allowed_mismatches = params.allowed_mismatches as isize; - + let mut minimal_bandwidth = max(1, params.allowed_mismatches as isize); + let max_band_area = params.max_band_area; let mut attempt = 0; + let mut alignment: AlignmentOutput; loop { - let stripes = create_alignment_band( + let (stripes, band_area) = create_alignment_band( &seed_matches, qry_len as isize, ref_len as isize, terminal_bandwidth, excess_bandwidth, - allowed_mismatches, - params.max_band_area, - )?; + minimal_bandwidth, + ); + if band_area > max_band_area { + if attempt == 0 { + return make_error!("Alignment matrix size {band_area} exceeds maximum value {max_band_area}. The threshold can be adjusted using CLI flag '--max-band-area' or using 'maxBandArea' field in the dataset's virus_properties.json"); + } + // return previously calculated alignment + return Ok(alignment); + } - let mut alignment = align_pairwise(&qry_seq, ref_seq, gap_open_close, params, &stripes); + alignment = align_pairwise(&qry_seq, ref_seq, gap_open_close, params, &stripes); alignment.is_reverse_complement = is_reverse_complement; - if alignment.hit_boundary { - terminal_bandwidth *= 2; - excess_bandwidth *= 2; - allowed_mismatches *= 2; + if alignment.hit_boundary && minimal_bandwidth > 0 { + // double bandwidth parameters or increase to one if 0 + terminal_bandwidth = max(2 * terminal_bandwidth, 1); + excess_bandwidth = max(2 * excess_bandwidth, 1); + minimal_bandwidth = max(2 * minimal_bandwidth, 1); attempt += 1; info!("When processing sequence #{index} '{seq_name}': In nucleotide alignment: Band boundary is hit on attempt {}. Retrying with relaxed parameters. Alignment score was: {}", attempt, alignment.alignment_score); } else { diff --git a/packages_rs/nextclade/src/align/seed_alignment.rs b/packages_rs/nextclade/src/align/seed_alignment.rs index d3877e581..810bd1fe0 100644 --- a/packages_rs/nextclade/src/align/seed_alignment.rs +++ b/packages_rs/nextclade/src/align/seed_alignment.rs @@ -215,8 +215,7 @@ pub fn create_alignment_band( terminal_bandwidth: isize, excess_bandwidth: isize, minimal_bandwidth: isize, - max_band_area: usize, -) -> Result, Report> { +) -> (Vec, usize) { // This function steps through the chained seeds and determines and appropriate band // defined via stripes in query coordinates. These bands will later be chopped to reachable ranges @@ -306,13 +305,7 @@ pub fn create_alignment_band( // write_stripes_to_file(&stripes, "stripes.csv"); // trim stripes to reachable regions - let (regularized_stripes, band_area) = regularize_stripes(stripes, qry_len as usize); - - if band_area > max_band_area { - return make_error!("Alignment matrix size {band_area} exceeds maximum value {max_band_area}. The threshold can be adjusted using CLI flag '--max-band-area' or using 'maxBandArea' field in the dataset's virus_properties.json"); - } - - Ok(regularized_stripes) + regularize_stripes(stripes, qry_len as usize) } #[derive(Clone, Copy, Debug)] From 75d4038efc31bfce429bf63e64bc0eebaaff769f Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Wed, 6 Sep 2023 15:58:12 +0200 Subject: [PATCH 2/6] chore: remove unused import --- packages_rs/nextclade/src/align/align.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/packages_rs/nextclade/src/align/align.rs b/packages_rs/nextclade/src/align/align.rs index c2b2b849b..d470fe084 100644 --- a/packages_rs/nextclade/src/align/align.rs +++ b/packages_rs/nextclade/src/align/align.rs @@ -1,5 +1,3 @@ -use std::cmp::max; - use crate::align::backtrace::{backtrace, AlignmentOutput}; use crate::align::band_2d::Stripe; use crate::align::band_2d::{full_matrix, simple_stripes}; @@ -11,9 +9,9 @@ use crate::alphabet::aa::Aa; use crate::alphabet::letter::Letter; use crate::alphabet::nuc::Nuc; use crate::make_error; -use bio::alignment; use eyre::{Report, WrapErr}; use log::{info, trace}; +use std::cmp::max; fn align_pairwise>( qry_seq: &[T], From b318dcc0ddde6818a32907850f5e2817ff654873 Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Wed, 6 Sep 2023 17:15:40 +0200 Subject: [PATCH 3/6] chore: remove unused argument in tests --- packages_rs/nextclade/src/align/seed_alignment.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/packages_rs/nextclade/src/align/seed_alignment.rs b/packages_rs/nextclade/src/align/seed_alignment.rs index 810bd1fe0..5c8ee9e61 100644 --- a/packages_rs/nextclade/src/align/seed_alignment.rs +++ b/packages_rs/nextclade/src/align/seed_alignment.rs @@ -413,7 +413,6 @@ mod tests { let max_indel = 100; let qry_len = 30; let ref_len = 40; - let max_band_area = 500_000_000; let result = create_alignment_band( &seed_matches, @@ -422,7 +421,6 @@ mod tests { terminal_bandwidth, excess_bandwidth, allowed_mismatches, - max_band_area, ); Ok(()) From 7ef77d736d782ae70e24c688f2fede89fe3e546a Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Wed, 6 Sep 2023 17:25:07 +0200 Subject: [PATCH 4/6] fix: avoid uninitialized alignment --- packages_rs/nextclade/src/align/align.rs | 55 ++++++++++++------------ 1 file changed, 27 insertions(+), 28 deletions(-) diff --git a/packages_rs/nextclade/src/align/align.rs b/packages_rs/nextclade/src/align/align.rs index d470fe084..5c21a1141 100644 --- a/packages_rs/nextclade/src/align/align.rs +++ b/packages_rs/nextclade/src/align/align.rs @@ -66,9 +66,28 @@ pub fn align_nuc( let mut minimal_bandwidth = max(1, params.allowed_mismatches as isize); let max_band_area = params.max_band_area; let mut attempt = 0; - let mut alignment: AlignmentOutput; - loop { + let (stripes, band_area) = create_alignment_band( + &seed_matches, + qry_len as isize, + ref_len as isize, + terminal_bandwidth, + excess_bandwidth, + minimal_bandwidth, + ); + if band_area > max_band_area { + return make_error!("Alignment matrix size {band_area} exceeds maximum value {max_band_area}. The threshold can be adjusted using CLI flag '--max-band-area' or using 'maxBandArea' field in the dataset's virus_properties.json"); + } + + let mut alignment = align_pairwise(&qry_seq, ref_seq, gap_open_close, params, &stripes); + + while alignment.hit_boundary && attempt < params.max_alignment_attempts { + // double bandwidth parameters or increase to one if 0 + terminal_bandwidth = max(2 * terminal_bandwidth, 1); + excess_bandwidth = max(2 * excess_bandwidth, 1); + minimal_bandwidth = max(2 * minimal_bandwidth, 1); + attempt += 1; + // make new band let (stripes, band_area) = create_alignment_band( &seed_matches, qry_len as isize, @@ -77,36 +96,16 @@ pub fn align_nuc( excess_bandwidth, minimal_bandwidth, ); + // discard stripes and break to return previous alignment if band_area > max_band_area { - if attempt == 0 { - return make_error!("Alignment matrix size {band_area} exceeds maximum value {max_band_area}. The threshold can be adjusted using CLI flag '--max-band-area' or using 'maxBandArea' field in the dataset's virus_properties.json"); - } - // return previously calculated alignment - return Ok(alignment); + break; } - + // realign alignment = align_pairwise(&qry_seq, ref_seq, gap_open_close, params, &stripes); - alignment.is_reverse_complement = is_reverse_complement; - - if alignment.hit_boundary && minimal_bandwidth > 0 { - // double bandwidth parameters or increase to one if 0 - terminal_bandwidth = max(2 * terminal_bandwidth, 1); - excess_bandwidth = max(2 * excess_bandwidth, 1); - minimal_bandwidth = max(2 * minimal_bandwidth, 1); - attempt += 1; - info!("When processing sequence #{index} '{seq_name}': In nucleotide alignment: Band boundary is hit on attempt {}. Retrying with relaxed parameters. Alignment score was: {}", attempt, alignment.alignment_score); - } else { - if attempt > 0 { - info!("When processing sequence #{index} '{seq_name}': In nucleotide alignment: Succeeded without hitting band boundary on attempt {}. Alignment score was: {}", attempt+1, alignment.alignment_score); - } - return Ok(alignment); - } - - if attempt > params.max_alignment_attempts { - info!("When processing sequence #{index} '{seq_name}': In nucleotide alignment: Attempted to relax band parameters {attempt} times, but still hitting the band boundary. Alignment score was: {}", alignment.alignment_score); - return Ok(alignment); - } } + + alignment.is_reverse_complement = is_reverse_complement; + Ok(alignment) } /// align amino acids using a fixed bandwidth banded alignment while penalizing terminal indels From bd074789f06f1222368fa5ce399ab09d4ce97ef0 Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Wed, 6 Sep 2023 18:06:28 +0200 Subject: [PATCH 5/6] chore: add log messages --- packages_rs/nextclade/src/align/align.rs | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/packages_rs/nextclade/src/align/align.rs b/packages_rs/nextclade/src/align/align.rs index 5c21a1141..fbdc72d07 100644 --- a/packages_rs/nextclade/src/align/align.rs +++ b/packages_rs/nextclade/src/align/align.rs @@ -82,6 +82,7 @@ pub fn align_nuc( let mut alignment = align_pairwise(&qry_seq, ref_seq, gap_open_close, params, &stripes); while alignment.hit_boundary && attempt < params.max_alignment_attempts { + info!("When processing sequence #{index} '{seq_name}': In nucleotide alignment: Band boundary is hit on attempt {}. Retrying with relaxed parameters. Alignment score was: {}", attempt+1, alignment.alignment_score); // double bandwidth parameters or increase to one if 0 terminal_bandwidth = max(2 * terminal_bandwidth, 1); excess_bandwidth = max(2 * excess_bandwidth, 1); @@ -103,7 +104,17 @@ pub fn align_nuc( // realign alignment = align_pairwise(&qry_seq, ref_seq, gap_open_close, params, &stripes); } - + // report success/failure of broadening of band width + if alignment.hit_boundary { + info!("When processing sequence #{index} '{seq_name}': In nucleotide alignment: Attempted to relax band parameters {attempt} times, but still hitting the band boundary. Returning last attempt with score: {}", alignment.alignment_score); + if band_area > max_band_area { + info!( + "When processing sequence #{index} '{seq_name}': final band area {band_area} exceeded the cutoff {max_band_area}" + ); + } + } else if attempt > 0 { + info!("When processing sequence #{index} '{seq_name}': In nucleotide alignment: Succeeded without hitting band boundary on attempt {}. Alignment score was: {}", attempt+1, alignment.alignment_score); + } alignment.is_reverse_complement = is_reverse_complement; Ok(alignment) } From 867a0e66f99ff5987a3639b4a4bdd478963fac9d Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Thu, 7 Sep 2023 09:40:02 +0200 Subject: [PATCH 6/6] fix: reassign rather than redefine band_area in attempt loop to allow reporting after the loop --- packages_rs/nextclade/src/align/align.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages_rs/nextclade/src/align/align.rs b/packages_rs/nextclade/src/align/align.rs index fbdc72d07..37e97bfa2 100644 --- a/packages_rs/nextclade/src/align/align.rs +++ b/packages_rs/nextclade/src/align/align.rs @@ -67,7 +67,7 @@ pub fn align_nuc( let max_band_area = params.max_band_area; let mut attempt = 0; - let (stripes, band_area) = create_alignment_band( + let (mut stripes, mut band_area) = create_alignment_band( &seed_matches, qry_len as isize, ref_len as isize, @@ -89,7 +89,7 @@ pub fn align_nuc( minimal_bandwidth = max(2 * minimal_bandwidth, 1); attempt += 1; // make new band - let (stripes, band_area) = create_alignment_band( + (stripes, band_area) = create_alignment_band( &seed_matches, qry_len as isize, ref_len as isize,