Skip to content

Commit

Permalink
Merge pull request #1251 from nextstrain/fix/ingore-non-acgt-in-tree-ops
Browse files Browse the repository at this point in the history
  • Loading branch information
ivan-aksamentov authored Sep 12, 2023
2 parents 52dd4f8 + b642cc7 commit 894409f
Show file tree
Hide file tree
Showing 4 changed files with 48 additions and 22 deletions.
16 changes: 16 additions & 0 deletions packages_rs/nextclade/src/analyze/divergence.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
use crate::analyze::nuc_sub::NucSub;
use crate::coord::range::NucRefGlobalRange;
use crate::tree::params::TreeBuilderParams;
use crate::tree::tree::DivergenceUnits;

/// Calculate number of nuc muts, only considering ACGT characters
Expand All @@ -24,3 +26,17 @@ pub fn calculate_branch_length(

this_div
}

/// Calculate nuc mut score
pub fn score_nuc_muts(nuc_muts: &[NucSub], masked_ranges: &[NucRefGlobalRange], params: &TreeBuilderParams) -> f64 {
// Only consider ACGT characters
let nuc_muts = nuc_muts.iter().filter(|m| m.ref_nuc.is_acgt() && m.qry_nuc.is_acgt());

// Split away masked mutations
let (masked_muts, muts): (Vec<_>, Vec<_>) =
nuc_muts.partition(|m| masked_ranges.iter().any(|range| range.contains(m.pos)));

let n_muts = muts.len() as f64;
let n_masked_muts = masked_muts.len() as f64;
n_muts + n_masked_muts * params.masked_muts_weight
}
2 changes: 1 addition & 1 deletion packages_rs/nextclade/src/run/nextclade_wasm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,7 @@ impl Nextclade {

pub fn get_output_trees(&mut self, results: Vec<NextcladeOutputs>) -> Result<Option<OutputTrees>, Report> {
if let Some(graph) = &mut self.graph {
graph_attach_new_nodes_in_place(graph, results, self.ref_seq.len(), &self.params.tree_builder)?;
graph_attach_new_nodes_in_place(graph, results, self.ref_seq.len(), &self.params.tree_builder)?;
let auspice = convert_graph_to_auspice_tree(graph)?;
let nwk = convert_graph_to_nwk_string(graph)?;
Ok(Some(OutputTrees { auspice, nwk }))
Expand Down
4 changes: 4 additions & 0 deletions packages_rs/nextclade/src/tree/params.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,17 @@ pub struct TreeBuilderParams {
#[clap(long)]
#[clap(num_args=0..=1, default_missing_value = "true")]
pub without_greedy_tree_builder: bool,

#[clap(long)]
pub masked_muts_weight: f64,
}

#[allow(clippy::derivable_impls)]
impl Default for TreeBuilderParams {
fn default() -> Self {
Self {
without_greedy_tree_builder: false,
masked_muts_weight: 0.05,
}
}
}
48 changes: 27 additions & 21 deletions packages_rs/nextclade/src/tree/tree_builder.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
use crate::analyze::aa_del::AaDel;
use crate::analyze::aa_sub::AaSub;
use crate::analyze::divergence::{calculate_branch_length, count_nuc_muts};
use crate::analyze::divergence::{calculate_branch_length, count_nuc_muts, score_nuc_muts};
use crate::analyze::find_private_nuc_mutations::BranchMutations;
use crate::analyze::nuc_del::NucDel;
use crate::analyze::nuc_sub::NucSub;
use crate::coord::range::NucRefGlobalRange;
use crate::graph::node::{GraphNodeKey, Node};
use crate::tree::params::TreeBuilderParams;
use crate::tree::split_muts::{difference_of_muts, split_muts, union_of_muts, SplitMutsResult};
Expand Down Expand Up @@ -86,7 +87,7 @@ pub fn graph_attach_new_node_in_place(
} else {
// for the attachment on the reference tree ('result') fine tune the position
// on the updated graph to minimize the number of private mutations
finetune_nearest_node(graph, result.nearest_node_id, &mutations_seq)?
finetune_nearest_node(graph, result.nearest_node_id, &mutations_seq, params)?
};

// add the new node at the fine tuned position while accounting for shared mutations
Expand All @@ -106,22 +107,25 @@ pub fn finetune_nearest_node(
graph: &AuspiceGraph,
nearest_node_key: GraphNodeKey,
seq_private_mutations: &BranchMutations,
params: &TreeBuilderParams,
) -> Result<(GraphNodeKey, BranchMutations), Report> {
let masked_ranges = graph.data.meta.placement_mask_ranges();
let mut best_node = graph.get_node(nearest_node_key)?;
let mut private_mutations = seq_private_mutations.clone();

loop {
// Check how many mutations are shared with the branch leading to the current_best_node or any of its children
let (candidate_node, candidate_split, n_shared_muts) = find_shared_muts(graph, best_node, &private_mutations)
.wrap_err_with(|| {
let (candidate_node, candidate_split, shared_muts_score) =
find_shared_muts(graph, best_node, &private_mutations, masked_ranges, params).wrap_err_with(|| {
format!(
"When calculating shared mutations against the current best node '{}'",
best_node.payload().name
)
})?;

// Check if the new candidate node is better than the current best
match find_better_node_maybe(graph, best_node, candidate_node, &candidate_split, n_shared_muts) {
let left_muts_score = score_nuc_muts(&candidate_split.left.nuc_muts, masked_ranges, params);
match find_better_node_maybe(graph, best_node, candidate_node, shared_muts_score, left_muts_score) {
None => break,
Some(better_node) => best_node = better_node,
}
Expand All @@ -143,15 +147,17 @@ fn find_shared_muts<'g>(
graph: &'g AuspiceGraph,
best_node: &'g Node<AuspiceGraphNodePayload>,
private_mutations: &BranchMutations,
) -> Result<(&'g Node<AuspiceGraphNodePayload>, SplitMutsResult, usize), Report> {
let (mut candidate_split, mut n_shared_muts) = if best_node.is_root() {
masked_ranges: &[NucRefGlobalRange],
params: &TreeBuilderParams,
) -> Result<(&'g Node<AuspiceGraphNodePayload>, SplitMutsResult, f64), Report> {
let (mut candidate_split, mut shared_muts_score) = if best_node.is_root() {
// Don't include node if node is root as we don't attach nodes above the root
let candidate_split = SplitMutsResult {
left: private_mutations.clone(),
right: BranchMutations::default(),
left: BranchMutations::default(),
right: private_mutations.clone(),
shared: BranchMutations::default(),
};
(candidate_split, 0)
(candidate_split, 0.0)
} else {
let candidate_split = split_muts(&best_node.payload().tmp.private_mutations.invert(), private_mutations)
.wrap_err_with(|| {
Expand All @@ -160,8 +166,8 @@ fn find_shared_muts<'g>(
best_node.payload().name
)
})?;
let n_shared_muts = count_nuc_muts(&candidate_split.shared.nuc_muts);
(candidate_split, n_shared_muts)
let shared_muts_score = score_nuc_muts(&candidate_split.shared.nuc_muts, masked_ranges, params);
(candidate_split, shared_muts_score)
};

// Check all child nodes for shared mutations
Expand All @@ -173,14 +179,14 @@ fn find_shared_muts<'g>(
child.payload().name
)
})?;
let child_n_shared_muts = count_nuc_muts(&child_split.shared.nuc_muts);
if child_n_shared_muts > n_shared_muts {
n_shared_muts = child_n_shared_muts;
let child_shared_muts_score = score_nuc_muts(&child_split.shared.nuc_muts, masked_ranges, params);
if child_shared_muts_score > shared_muts_score {
shared_muts_score = child_shared_muts_score;
candidate_split = child_split;
candidate_node = child;
}
}
Ok((candidate_node, candidate_split, n_shared_muts))
Ok((candidate_node, candidate_split, shared_muts_score))
}

/// Find out if the candidate node is better than the current best (with caveats).
Expand All @@ -189,18 +195,18 @@ fn find_better_node_maybe<'g>(
graph: &'g AuspiceGraph,
best_node: &'g Node<AuspiceGraphNodePayload>,
candidate_node: &'g Node<AuspiceGraphNodePayload>,
candidate_split: &SplitMutsResult,
n_shared_muts: usize,
shared_muts_score: f64,
left_muts_score: f64,
) -> Option<&'g Node<AuspiceGraphNodePayload>> {
if candidate_node == best_node {
// best node is the node itself. Move up the tree if all mutations between
// the candidate node and its parent are also in the private mutations.
// This covers the case where the candidate is a leaf with zero length branch
// as the .left.nuc_muts is emtpy in that case
if candidate_split.left.nuc_muts.is_empty() {
// as the .left.nuc_muts is empty in that case
if left_muts_score == 0.0 {
return graph.parent_of(candidate_node);
}
} else if n_shared_muts > 0 {
} else if shared_muts_score > 0.0 {
// candidate node is child node, move to child node if there are shared mutations
// this should always be the case if the candidate node != best_node
return Some(candidate_node);
Expand Down

0 comments on commit 894409f

Please sign in to comment.