From 6eee1d4dbdd429d19506f48c4fab4eb4f4842295 Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Fri, 8 Sep 2023 18:02:50 +0200 Subject: [PATCH 01/19] fix: zero length branches can still have aa mutations even without nuc muts --- packages_rs/nextclade/src/tree/tree_builder.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/packages_rs/nextclade/src/tree/tree_builder.rs b/packages_rs/nextclade/src/tree/tree_builder.rs index 6880458a64..549cee0a11 100644 --- a/packages_rs/nextclade/src/tree/tree_builder.rs +++ b/packages_rs/nextclade/src/tree/tree_builder.rs @@ -193,6 +193,12 @@ pub fn finetune_nearest_node( current_best_node.payload().name ) })?; + private_mutations = union_of_muts(&private_mutations, &best_split_result.left.invert()).wrap_err_with(|| { + format!( + "When calculating union of mutations between query sequence and the candidate child node '{}'", + graph.get_node(best_node.key()).expect("Node not found").payload().name + ) + })?; current_best_node = graph .parent_of_by_key(best_node.key()) .ok_or_else(|| make_internal_report!("Parent node is expected, but not found"))?; From e4c995d626e91a1c33179633b24cdee00e8ee371 Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Fri, 8 Sep 2023 18:08:51 +0200 Subject: [PATCH 02/19] fix: adjust error message --- packages_rs/nextclade/src/tree/tree_builder.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages_rs/nextclade/src/tree/tree_builder.rs b/packages_rs/nextclade/src/tree/tree_builder.rs index 549cee0a11..63a67c5b3d 100644 --- a/packages_rs/nextclade/src/tree/tree_builder.rs +++ b/packages_rs/nextclade/src/tree/tree_builder.rs @@ -195,7 +195,7 @@ pub fn finetune_nearest_node( })?; private_mutations = union_of_muts(&private_mutations, &best_split_result.left.invert()).wrap_err_with(|| { format!( - "When calculating union of mutations between query sequence and the candidate child node '{}'", + "When calculating union of mutations between query sequence the zero-length parent node '{}'", graph.get_node(best_node.key()).expect("Node not found").payload().name ) })?; From 7a11528bce1f85ac2f193c92fa842ae77baa8d91 Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Fri, 8 Sep 2023 22:52:41 +0200 Subject: [PATCH 03/19] docs: add comments --- .../nextclade/src/tree/tree_builder.rs | 28 +++++++++++++------ 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/packages_rs/nextclade/src/tree/tree_builder.rs b/packages_rs/nextclade/src/tree/tree_builder.rs index 63a67c5b3d..20fe8faea1 100644 --- a/packages_rs/nextclade/src/tree/tree_builder.rs +++ b/packages_rs/nextclade/src/tree/tree_builder.rs @@ -105,7 +105,15 @@ pub fn finetune_nearest_node( let mut current_best_node = graph.get_node(nearest_node_key)?; let mut private_mutations = seq_private_mutations.clone(); + // the following loop moves the new sequences, defined by its set of private mutations + // along the tree starting at the `nearest_node`. As the new sequence is moved, the + // private mutations are updated. This is repeated until the number of private mutations (nuc) + // can not be reduced further by moving the node. At the end of the loop, the nearest node + // is either the closest possible point, or this closest point is along the branch leading + // to the nearest_node. loop { + // in each iteration, check how many mutation are shared with the branch leading to + // the current_best_node or any of its children (loop further below). let mut best_node = current_best_node; let (mut best_split_result, mut n_shared_muts) = if current_best_node.is_root() { // don't include node if node is root as we don't attach nodes above the root @@ -130,6 +138,7 @@ pub fn finetune_nearest_node( (best_split_result, n_shared_muts) }; + // check all child nodes for shared mutations for child in graph.iter_children_of(current_best_node) { let tmp_split_result = split_muts(&child.payload().tmp.private_mutations, &private_mutations).wrap_err_with(|| { @@ -146,6 +155,7 @@ pub fn finetune_nearest_node( } } + // if shared mutations are found, the current_best_node is updated if n_shared_muts > 0 { if best_node.key() == current_best_node.key() && best_split_result.left.nuc_muts.is_empty() { // All mutations from the parent to the node are shared with private mutations. Move up to the parent. @@ -160,16 +170,18 @@ pub fn finetune_nearest_node( // The best node is child current_best_node = graph.get_node(best_node.key())?; } - //subtract the shared mutations from the private mutations struct + // update the private mutations to match the new 'current_best_node'. This involves + // in step 1 subtracting the shared mutations from the private mutations struct private_mutations = difference_of_muts(&private_mutations, &best_split_result.shared).wrap_err_with(|| { format!( "When calculating difference of mutations between query sequence and the candidate child node '{}'", current_best_node.payload().name ) })?; - // add the inverted remaining mutations on that branch - // even if there are no left-over nuc_subs because they are shared, there can be - // changes in the amino acid sequences due to mutations in the same codon that still need handling + // in step 2 we need to add the inverted remaining mutations on that branch. + // Not that this can be necessary even if there are no left-over nuc_subs. + // Amino acid mutations can be decoupled from the their nucleotide mutations or + // changes in the amino acid sequences due to mutations in the same codon still need handling private_mutations = union_of_muts(&private_mutations, &best_split_result.left.invert()).wrap_err_with(|| { format!( "When calculating union of mutations between query sequence and the candidate child node '{}'", @@ -183,10 +195,8 @@ pub fn finetune_nearest_node( // In this case, a leaf identical to its parent in terms of nuc_subs. this happens when we add // auxiliary nodes. - // Mutation subtraction is still necessary because there might be shared mutations even if there are no `nuc_subs`. - // FIXME: This relies on `is_leaf`. In that case, there is only one entry in `shared_muts_neighbors` - // and the `max_shared_muts` is automatically the `current_best_node.key()`. Less error prone would be - // to fetch the shared muts corresponding to current_best_node.key() + // Mutation subtraction is still necessary because there might be shared mutations + // even if there are no `nuc_subs`. private_mutations = difference_of_muts(&private_mutations, &best_split_result.shared).wrap_err_with(|| { format!( "When subtracting mutations from zero-length parent node '{}'", @@ -195,7 +205,7 @@ pub fn finetune_nearest_node( })?; private_mutations = union_of_muts(&private_mutations, &best_split_result.left.invert()).wrap_err_with(|| { format!( - "When calculating union of mutations between query sequence the zero-length parent node '{}'", + "When calculating union of mutations between the query sequence and the zero-length parent node '{}'", graph.get_node(best_node.key()).expect("Node not found").payload().name ) })?; From b19e40274299805875b77287e404e981fccc7edb Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Fri, 8 Sep 2023 22:54:45 +0200 Subject: [PATCH 04/19] refactor: simplify node name retrieval --- packages_rs/nextclade/src/tree/tree_builder.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages_rs/nextclade/src/tree/tree_builder.rs b/packages_rs/nextclade/src/tree/tree_builder.rs index 20fe8faea1..66c997ad47 100644 --- a/packages_rs/nextclade/src/tree/tree_builder.rs +++ b/packages_rs/nextclade/src/tree/tree_builder.rs @@ -185,7 +185,7 @@ pub fn finetune_nearest_node( private_mutations = union_of_muts(&private_mutations, &best_split_result.left.invert()).wrap_err_with(|| { format!( "When calculating union of mutations between query sequence and the candidate child node '{}'", - graph.get_node(best_node.key()).expect("Node not found").payload().name + best_node.payload().name ) })?; } else if current_best_node.is_leaf() @@ -206,7 +206,7 @@ pub fn finetune_nearest_node( private_mutations = union_of_muts(&private_mutations, &best_split_result.left.invert()).wrap_err_with(|| { format!( "When calculating union of mutations between the query sequence and the zero-length parent node '{}'", - graph.get_node(best_node.key()).expect("Node not found").payload().name + best_node.payload().name ) })?; current_best_node = graph From 15757ece18f4e57931c27306e18c2ae229b2cfdb Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Fri, 8 Sep 2023 22:59:54 +0200 Subject: [PATCH 05/19] refactor: reduce code duplication in tree fine-tuning --- .../nextclade/src/tree/tree_builder.rs | 53 +++++++------------ 1 file changed, 18 insertions(+), 35 deletions(-) diff --git a/packages_rs/nextclade/src/tree/tree_builder.rs b/packages_rs/nextclade/src/tree/tree_builder.rs index 66c997ad47..5dc31e6386 100644 --- a/packages_rs/nextclade/src/tree/tree_builder.rs +++ b/packages_rs/nextclade/src/tree/tree_builder.rs @@ -170,51 +170,34 @@ pub fn finetune_nearest_node( // The best node is child current_best_node = graph.get_node(best_node.key())?; } - // update the private mutations to match the new 'current_best_node'. This involves - // in step 1 subtracting the shared mutations from the private mutations struct - private_mutations = difference_of_muts(&private_mutations, &best_split_result.shared).wrap_err_with(|| { - format!( - "When calculating difference of mutations between query sequence and the candidate child node '{}'", - current_best_node.payload().name - ) - })?; - // in step 2 we need to add the inverted remaining mutations on that branch. - // Not that this can be necessary even if there are no left-over nuc_subs. - // Amino acid mutations can be decoupled from the their nucleotide mutations or - // changes in the amino acid sequences due to mutations in the same codon still need handling - private_mutations = union_of_muts(&private_mutations, &best_split_result.left.invert()).wrap_err_with(|| { - format!( - "When calculating union of mutations between query sequence and the candidate child node '{}'", - best_node.payload().name - ) - })?; } else if current_best_node.is_leaf() && !current_best_node.is_root() && current_best_node.payload().tmp.private_mutations.nuc_muts.is_empty() { - // In this case, a leaf identical to its parent in terms of nuc_subs. this happens when we add - // auxiliary nodes. - - // Mutation subtraction is still necessary because there might be shared mutations - // even if there are no `nuc_subs`. - private_mutations = difference_of_muts(&private_mutations, &best_split_result.shared).wrap_err_with(|| { - format!( - "When subtracting mutations from zero-length parent node '{}'", - current_best_node.payload().name - ) - })?; - private_mutations = union_of_muts(&private_mutations, &best_split_result.left.invert()).wrap_err_with(|| { - format!( - "When calculating union of mutations between the query sequence and the zero-length parent node '{}'", - best_node.payload().name - ) - })?; current_best_node = graph .parent_of_by_key(best_node.key()) .ok_or_else(|| make_internal_report!("Parent node is expected, but not found"))?; } else { break; } + // update the private mutations to match the new 'current_best_node'. This involves + // in step 1 subtracting the shared mutations from the private mutations struct + private_mutations = difference_of_muts(&private_mutations, &best_split_result.shared).wrap_err_with(|| { + format!( + "When calculating difference of mutations between query sequence and the branch leading to the next attachment point '{}'", + current_best_node.payload().name + ) + })?; + // in step 2 we need to add the inverted remaining mutations on that branch. + // Not that this can be necessary even if there are no left-over nuc_subs. + // Amino acid mutations can be decoupled from the their nucleotide mutations or + // changes in the amino acid sequences due to mutations in the same codon still need handling + private_mutations = union_of_muts(&private_mutations, &best_split_result.left.invert()).wrap_err_with(|| { + format!( + "When calculating union of mutations between query sequence and the branch leading to the next attachment point '{}'", + best_node.payload().name + ) + })?; } Ok((current_best_node.key(), private_mutations)) } From 491100d44eaa108e7d5ef4ec20fdc0f4bb7af858 Mon Sep 17 00:00:00 2001 From: ivan-aksamentov Date: Mon, 11 Sep 2023 05:46:42 +0200 Subject: [PATCH 06/19] refactor: extract find_shared_muts() function --- .../nextclade/src/tree/tree_builder.rs | 91 ++++++++++--------- 1 file changed, 50 insertions(+), 41 deletions(-) diff --git a/packages_rs/nextclade/src/tree/tree_builder.rs b/packages_rs/nextclade/src/tree/tree_builder.rs index 5dc31e6386..329411e8ab 100644 --- a/packages_rs/nextclade/src/tree/tree_builder.rs +++ b/packages_rs/nextclade/src/tree/tree_builder.rs @@ -4,7 +4,7 @@ use crate::analyze::divergence::{calculate_branch_length, count_nuc_muts}; use crate::analyze::find_private_nuc_mutations::BranchMutations; use crate::analyze::nuc_del::NucDel; use crate::analyze::nuc_sub::NucSub; -use crate::graph::node::GraphNodeKey; +use crate::graph::node::{GraphNodeKey, Node}; use crate::make_internal_report; use crate::tree::params::TreeBuilderParams; use crate::tree::split_muts::{difference_of_muts, split_muts, union_of_muts, SplitMutsResult}; @@ -114,46 +114,7 @@ pub fn finetune_nearest_node( loop { // in each iteration, check how many mutation are shared with the branch leading to // the current_best_node or any of its children (loop further below). - let mut best_node = current_best_node; - let (mut best_split_result, mut n_shared_muts) = if current_best_node.is_root() { - // don't include node if node is root as we don't attach nodes above the root - let best_split_result = SplitMutsResult { - left: private_mutations.clone(), - right: BranchMutations::default(), - shared: BranchMutations::default(), - }; - (best_split_result, 0) - } else { - let best_split_result = split_muts( - ¤t_best_node.payload().tmp.private_mutations.invert(), - &private_mutations, - ) - .wrap_err_with(|| { - format!( - "When splitting mutations between query sequence and the nearest node '{}'", - current_best_node.payload().name - ) - })?; - let n_shared_muts = count_nuc_muts(&best_split_result.shared.nuc_muts); - (best_split_result, n_shared_muts) - }; - - // check all child nodes for shared mutations - for child in graph.iter_children_of(current_best_node) { - let tmp_split_result = - split_muts(&child.payload().tmp.private_mutations, &private_mutations).wrap_err_with(|| { - format!( - "When splitting mutations between query sequence and the child node '{}'", - child.payload().name - ) - })?; - let tmp_n_shared_muts = count_nuc_muts(&tmp_split_result.shared.nuc_muts); - if tmp_n_shared_muts > n_shared_muts { - n_shared_muts = tmp_n_shared_muts; - best_split_result = tmp_split_result; - best_node = child; - } - } + let (best_node, best_split_result, n_shared_muts) = find_shared_muts(graph, current_best_node, &private_mutations)?; // if shared mutations are found, the current_best_node is updated if n_shared_muts > 0 { @@ -202,6 +163,54 @@ pub fn finetune_nearest_node( Ok((current_best_node.key(), private_mutations)) } +fn find_shared_muts<'g>( + graph: &'g AuspiceGraph, + current_best_node: &'g Node, + private_mutations: &BranchMutations, +) -> Result<(&'g Node, SplitMutsResult, usize), Report> { + let mut best_node = current_best_node; + let (mut best_split_result, mut n_shared_muts) = if current_best_node.is_root() { + // don't include node if node is root as we don't attach nodes above the root + let best_split_result = SplitMutsResult { + left: private_mutations.clone(), + right: BranchMutations::default(), + shared: BranchMutations::default(), + }; + (best_split_result, 0) + } else { + let best_split_result = split_muts( + ¤t_best_node.payload().tmp.private_mutations.invert(), + private_mutations, + ) + .wrap_err_with(|| { + format!( + "When splitting mutations between query sequence and the nearest node '{}'", + current_best_node.payload().name + ) + })?; + let n_shared_muts = count_nuc_muts(&best_split_result.shared.nuc_muts); + (best_split_result, n_shared_muts) + }; + + // check all child nodes for shared mutations + for child in graph.iter_children_of(current_best_node) { + let tmp_split_result = + split_muts(&child.payload().tmp.private_mutations, private_mutations).wrap_err_with(|| { + format!( + "When splitting mutations between query sequence and the child node '{}'", + child.payload().name + ) + })?; + let tmp_n_shared_muts = count_nuc_muts(&tmp_split_result.shared.nuc_muts); + if tmp_n_shared_muts > n_shared_muts { + n_shared_muts = tmp_n_shared_muts; + best_split_result = tmp_split_result; + best_node = child; + } + } + Ok((best_node, best_split_result, n_shared_muts)) +} + pub fn attach_to_internal_node( graph: &mut AuspiceGraph, nearest_node_id: GraphNodeKey, From c1567c6d48923a17a76c76349a37bb024f499146 Mon Sep 17 00:00:00 2001 From: ivan-aksamentov Date: Mon, 11 Sep 2023 07:06:35 +0200 Subject: [PATCH 07/19] refactor: extract update_private_mutations() function --- .../nextclade/src/tree/tree_builder.rs | 48 ++++++++++++------- 1 file changed, 30 insertions(+), 18 deletions(-) diff --git a/packages_rs/nextclade/src/tree/tree_builder.rs b/packages_rs/nextclade/src/tree/tree_builder.rs index 329411e8ab..bca54e8832 100644 --- a/packages_rs/nextclade/src/tree/tree_builder.rs +++ b/packages_rs/nextclade/src/tree/tree_builder.rs @@ -141,24 +141,8 @@ pub fn finetune_nearest_node( } else { break; } - // update the private mutations to match the new 'current_best_node'. This involves - // in step 1 subtracting the shared mutations from the private mutations struct - private_mutations = difference_of_muts(&private_mutations, &best_split_result.shared).wrap_err_with(|| { - format!( - "When calculating difference of mutations between query sequence and the branch leading to the next attachment point '{}'", - current_best_node.payload().name - ) - })?; - // in step 2 we need to add the inverted remaining mutations on that branch. - // Not that this can be necessary even if there are no left-over nuc_subs. - // Amino acid mutations can be decoupled from the their nucleotide mutations or - // changes in the amino acid sequences due to mutations in the same codon still need handling - private_mutations = union_of_muts(&private_mutations, &best_split_result.left.invert()).wrap_err_with(|| { - format!( - "When calculating union of mutations between query sequence and the branch leading to the next attachment point '{}'", - best_node.payload().name - ) - })?; + + private_mutations = update_private_mutations(&private_mutations, current_best_node, &best_split_result)?; } Ok((current_best_node.key(), private_mutations)) } @@ -211,6 +195,34 @@ fn find_shared_muts<'g>( Ok((best_node, best_split_result, n_shared_muts)) } +/// Update private mutations to match the new best_node +fn update_private_mutations( + private_mutations: &BranchMutations, + current_best_node: &Node, + best_split_result: &SplitMutsResult, +) -> Result { + // Step 1: subtract shared mutations from private mutations + let private_mutations = difference_of_muts(private_mutations, &best_split_result.shared).wrap_err_with(|| { + format!( + "When calculating difference of mutations between query sequence and the branch leading to the next attachment point '{}'", + current_best_node.payload().name + ) + })?; + + // Step 2: We need to add the inverted remaining mutations on that branch. + // Note that this can be necessary even if there are no left-over nuc_subs. + // Amino acid mutations can be decoupled from the their nucleotide mutations or + // changes in the amino acid sequences due to mutations in the same codon still need handling. + let private_mutations = union_of_muts(&private_mutations, &best_split_result.left.invert()).wrap_err_with(|| { + format!( + "When calculating union of mutations between query sequence and the branch leading to the next attachment point '{}'", + current_best_node.payload().name + ) + })?; + + Ok(private_mutations) +} + pub fn attach_to_internal_node( graph: &mut AuspiceGraph, nearest_node_id: GraphNodeKey, From 309fc377b13bea43dc77802c283f191d555d8bdc Mon Sep 17 00:00:00 2001 From: ivan-aksamentov Date: Mon, 11 Sep 2023 07:23:19 +0200 Subject: [PATCH 08/19] refactor: extract find_better_node() function --- .../nextclade/src/tree/tree_builder.rs | 65 ++++++++++++------- 1 file changed, 41 insertions(+), 24 deletions(-) diff --git a/packages_rs/nextclade/src/tree/tree_builder.rs b/packages_rs/nextclade/src/tree/tree_builder.rs index bca54e8832..b5d3d1fbc7 100644 --- a/packages_rs/nextclade/src/tree/tree_builder.rs +++ b/packages_rs/nextclade/src/tree/tree_builder.rs @@ -116,30 +116,9 @@ pub fn finetune_nearest_node( // the current_best_node or any of its children (loop further below). let (best_node, best_split_result, n_shared_muts) = find_shared_muts(graph, current_best_node, &private_mutations)?; - // if shared mutations are found, the current_best_node is updated - if n_shared_muts > 0 { - if best_node.key() == current_best_node.key() && best_split_result.left.nuc_muts.is_empty() { - // All mutations from the parent to the node are shared with private mutations. Move up to the parent. - // FIXME: what if there's no parent? - current_best_node = graph - .parent_of_by_key(best_node.key()) - .ok_or_else(|| make_internal_report!("Parent node is expected, but not found"))?; - } else if best_node.key() == current_best_node.key() { - // The best node is the current node. Break. - break; - } else { - // The best node is child - current_best_node = graph.get_node(best_node.key())?; - } - } else if current_best_node.is_leaf() - && !current_best_node.is_root() - && current_best_node.payload().tmp.private_mutations.nuc_muts.is_empty() - { - current_best_node = graph - .parent_of_by_key(best_node.key()) - .ok_or_else(|| make_internal_report!("Parent node is expected, but not found"))?; - } else { - break; + match find_better_node(graph, current_best_node, best_node, &best_split_result, n_shared_muts)? { + None => break, + Some(better_node) => current_best_node = better_node, } private_mutations = update_private_mutations(&private_mutations, current_best_node, &best_split_result)?; @@ -147,6 +126,44 @@ pub fn finetune_nearest_node( Ok((current_best_node.key(), private_mutations)) } +fn find_better_node<'g>( + graph: &'g AuspiceGraph, + current_best_node: &'g Node, + best_node: &'g Node, + best_split_result: &SplitMutsResult, + n_shared_muts: usize, +) -> Result>, Report> { + // if shared mutations are found, the current_best_node is updated + Ok(if n_shared_muts > 0 { + if best_node.key() == current_best_node.key() && best_split_result.left.nuc_muts.is_empty() { + // All mutations from the parent to the node are shared with private mutations. Move up to the parent. + // FIXME: what if there's no parent? + Some( + graph + .parent_of(best_node) + .ok_or_else(|| make_internal_report!("Parent node is expected, but not found"))?, + ) + } else if best_node.key() == current_best_node.key() { + // The best node is the current node. Break. + None + } else { + // The best node is child + Some(graph.get_node(best_node.key())?) + } + } else if current_best_node.is_leaf() + && !current_best_node.is_root() + && current_best_node.payload().tmp.private_mutations.nuc_muts.is_empty() + { + Some( + graph + .parent_of(best_node) + .ok_or_else(|| make_internal_report!("Parent node is expected, but not found"))?, + ) + } else { + None + }) +} + fn find_shared_muts<'g>( graph: &'g AuspiceGraph, current_best_node: &'g Node, From 1c2c00ed00f5d42622d2efb6b3b5a7ce35bc7960 Mon Sep 17 00:00:00 2001 From: ivan-aksamentov Date: Mon, 11 Sep 2023 07:35:19 +0200 Subject: [PATCH 09/19] refactor: extract find_better_node_maybe() function --- .../nextclade/src/tree/tree_builder.rs | 83 ++++++++++--------- 1 file changed, 44 insertions(+), 39 deletions(-) diff --git a/packages_rs/nextclade/src/tree/tree_builder.rs b/packages_rs/nextclade/src/tree/tree_builder.rs index b5d3d1fbc7..df4f269523 100644 --- a/packages_rs/nextclade/src/tree/tree_builder.rs +++ b/packages_rs/nextclade/src/tree/tree_builder.rs @@ -116,52 +116,17 @@ pub fn finetune_nearest_node( // the current_best_node or any of its children (loop further below). let (best_node, best_split_result, n_shared_muts) = find_shared_muts(graph, current_best_node, &private_mutations)?; - match find_better_node(graph, current_best_node, best_node, &best_split_result, n_shared_muts)? { + // Check if the new candidate node is better than the current best + match find_better_node_maybe(graph, current_best_node, best_node, &best_split_result, n_shared_muts)? { None => break, Some(better_node) => current_best_node = better_node, } + // Update query mutations to adjust for its new position private_mutations = update_private_mutations(&private_mutations, current_best_node, &best_split_result)?; } - Ok((current_best_node.key(), private_mutations)) -} -fn find_better_node<'g>( - graph: &'g AuspiceGraph, - current_best_node: &'g Node, - best_node: &'g Node, - best_split_result: &SplitMutsResult, - n_shared_muts: usize, -) -> Result>, Report> { - // if shared mutations are found, the current_best_node is updated - Ok(if n_shared_muts > 0 { - if best_node.key() == current_best_node.key() && best_split_result.left.nuc_muts.is_empty() { - // All mutations from the parent to the node are shared with private mutations. Move up to the parent. - // FIXME: what if there's no parent? - Some( - graph - .parent_of(best_node) - .ok_or_else(|| make_internal_report!("Parent node is expected, but not found"))?, - ) - } else if best_node.key() == current_best_node.key() { - // The best node is the current node. Break. - None - } else { - // The best node is child - Some(graph.get_node(best_node.key())?) - } - } else if current_best_node.is_leaf() - && !current_best_node.is_root() - && current_best_node.payload().tmp.private_mutations.nuc_muts.is_empty() - { - Some( - graph - .parent_of(best_node) - .ok_or_else(|| make_internal_report!("Parent node is expected, but not found"))?, - ) - } else { - None - }) + Ok((current_best_node.key(), private_mutations)) } fn find_shared_muts<'g>( @@ -212,6 +177,46 @@ fn find_shared_muts<'g>( Ok((best_node, best_split_result, n_shared_muts)) } +/// Find out if the candidate node is better than the current best (with caveats) +/// Return a better node or `None` (if the current best node is to be preserved). +fn find_better_node_maybe<'g>( + graph: &'g AuspiceGraph, + current_best_node: &'g Node, + best_node: &'g Node, + best_split_result: &SplitMutsResult, + n_shared_muts: usize, +) -> Result>, Report> { + // if shared mutations are found, the current_best_node is updated + Ok(if n_shared_muts > 0 { + if best_node.key() == current_best_node.key() && best_split_result.left.nuc_muts.is_empty() { + // Caveat: all mutations from the parent to the node are shared with private mutations. Move up to the parent. + // FIXME: what if there's no parent? + Some( + graph + .parent_of(best_node) + .ok_or_else(|| make_internal_report!("Parent node is expected, but not found"))?, + ) + } else if best_node.key() == current_best_node.key() { + // The best node is the current node. Break. + None + } else { + // The best node is child + Some(graph.get_node(best_node.key())?) + } + } else if current_best_node.is_leaf() + && !current_best_node.is_root() + && current_best_node.payload().tmp.private_mutations.nuc_muts.is_empty() + { + Some( + graph + .parent_of(best_node) + .ok_or_else(|| make_internal_report!("Parent node is expected, but not found"))?, + ) + } else { + None + }) +} + /// Update private mutations to match the new best_node fn update_private_mutations( private_mutations: &BranchMutations, From 2ac268eb3fa7729ede5e88229ebd0985c0fbe97b Mon Sep 17 00:00:00 2001 From: ivan-aksamentov Date: Mon, 11 Sep 2023 07:37:54 +0200 Subject: [PATCH 10/19] feat: break from loop if no parent node, rather than throwing --- packages_rs/nextclade/src/tree/tree_builder.rs | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/packages_rs/nextclade/src/tree/tree_builder.rs b/packages_rs/nextclade/src/tree/tree_builder.rs index df4f269523..65ee7c66dd 100644 --- a/packages_rs/nextclade/src/tree/tree_builder.rs +++ b/packages_rs/nextclade/src/tree/tree_builder.rs @@ -191,11 +191,7 @@ fn find_better_node_maybe<'g>( if best_node.key() == current_best_node.key() && best_split_result.left.nuc_muts.is_empty() { // Caveat: all mutations from the parent to the node are shared with private mutations. Move up to the parent. // FIXME: what if there's no parent? - Some( - graph - .parent_of(best_node) - .ok_or_else(|| make_internal_report!("Parent node is expected, but not found"))?, - ) + graph.parent_of(best_node) } else if best_node.key() == current_best_node.key() { // The best node is the current node. Break. None @@ -207,11 +203,7 @@ fn find_better_node_maybe<'g>( && !current_best_node.is_root() && current_best_node.payload().tmp.private_mutations.nuc_muts.is_empty() { - Some( - graph - .parent_of(best_node) - .ok_or_else(|| make_internal_report!("Parent node is expected, but not found"))?, - ) + graph.parent_of(best_node) } else { None }) From 71c61eeb2a40a275a44fee8a1df534a6ab4ac4aa Mon Sep 17 00:00:00 2001 From: ivan-aksamentov Date: Mon, 11 Sep 2023 07:39:42 +0200 Subject: [PATCH 11/19] fix: copypasted strings --- packages_rs/nextclade/src/tree/split_muts.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages_rs/nextclade/src/tree/split_muts.rs b/packages_rs/nextclade/src/tree/split_muts.rs index 50493824e5..3057150cc5 100644 --- a/packages_rs/nextclade/src/tree/split_muts.rs +++ b/packages_rs/nextclade/src/tree/split_muts.rs @@ -244,9 +244,9 @@ where pub fn difference_of_muts(left: &BranchMutations, right: &BranchMutations) -> Result { Ok(BranchMutations { nuc_muts: difference(&left.nuc_muts, &right.nuc_muts) - .wrap_err("When calculating union of private nucleotide substitutions")?, + .wrap_err("When calculating difference of private nucleotide substitutions")?, aa_muts: difference_of_aa_muts(&left.aa_muts, &right.aa_muts) - .wrap_err("When calculating union of private aminoacid mutations")?, + .wrap_err("When calculating difference of private aminoacid mutations")?, }) } From b4f42142718c23cffdc19a08a7b9db26fb8633bf Mon Sep 17 00:00:00 2001 From: ivan-aksamentov Date: Mon, 11 Sep 2023 07:43:50 +0200 Subject: [PATCH 12/19] refactor: simplify comparison --- packages_rs/nextclade/src/tree/tree_builder.rs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/packages_rs/nextclade/src/tree/tree_builder.rs b/packages_rs/nextclade/src/tree/tree_builder.rs index 65ee7c66dd..8e931e2790 100644 --- a/packages_rs/nextclade/src/tree/tree_builder.rs +++ b/packages_rs/nextclade/src/tree/tree_builder.rs @@ -5,7 +5,6 @@ use crate::analyze::find_private_nuc_mutations::BranchMutations; use crate::analyze::nuc_del::NucDel; use crate::analyze::nuc_sub::NucSub; use crate::graph::node::{GraphNodeKey, Node}; -use crate::make_internal_report; use crate::tree::params::TreeBuilderParams; use crate::tree::split_muts::{difference_of_muts, split_muts, union_of_muts, SplitMutsResult}; use crate::tree::tree::{AuspiceGraph, AuspiceGraphEdgePayload, AuspiceGraphNodePayload, TreeBranchAttrsLabels}; @@ -188,11 +187,10 @@ fn find_better_node_maybe<'g>( ) -> Result>, Report> { // if shared mutations are found, the current_best_node is updated Ok(if n_shared_muts > 0 { - if best_node.key() == current_best_node.key() && best_split_result.left.nuc_muts.is_empty() { + if best_node == current_best_node && best_split_result.left.nuc_muts.is_empty() { // Caveat: all mutations from the parent to the node are shared with private mutations. Move up to the parent. - // FIXME: what if there's no parent? graph.parent_of(best_node) - } else if best_node.key() == current_best_node.key() { + } else if best_node == current_best_node { // The best node is the current node. Break. None } else { From c574491a25dae61d07d299cf8ca090f0bc8ed9b6 Mon Sep 17 00:00:00 2001 From: ivan-aksamentov Date: Mon, 11 Sep 2023 07:47:55 +0200 Subject: [PATCH 13/19] refactor: update comments --- .../nextclade/src/tree/tree_builder.rs | 27 ++++++++++--------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/packages_rs/nextclade/src/tree/tree_builder.rs b/packages_rs/nextclade/src/tree/tree_builder.rs index 8e931e2790..a3e5de4645 100644 --- a/packages_rs/nextclade/src/tree/tree_builder.rs +++ b/packages_rs/nextclade/src/tree/tree_builder.rs @@ -96,6 +96,12 @@ pub fn graph_attach_new_node_in_place( Ok(()) } +/// Moves the new sequences, defined by its set of private mutations +/// along the tree starting at the `nearest_node`. As the new sequence is moved, the +/// private mutations are updated. This is repeated until the number of private mutations (nuc) +/// can not be reduced further by moving the node. At the end of the loop, the nearest node +/// is either the closest possible point, or this closest point is along the branch leading +/// to the nearest_node. pub fn finetune_nearest_node( graph: &AuspiceGraph, nearest_node_key: GraphNodeKey, @@ -104,15 +110,8 @@ pub fn finetune_nearest_node( let mut current_best_node = graph.get_node(nearest_node_key)?; let mut private_mutations = seq_private_mutations.clone(); - // the following loop moves the new sequences, defined by its set of private mutations - // along the tree starting at the `nearest_node`. As the new sequence is moved, the - // private mutations are updated. This is repeated until the number of private mutations (nuc) - // can not be reduced further by moving the node. At the end of the loop, the nearest node - // is either the closest possible point, or this closest point is along the branch leading - // to the nearest_node. loop { - // in each iteration, check how many mutation are shared with the branch leading to - // the current_best_node or any of its children (loop further below). + // Check how many mutations are shared with the branch leading to the current_best_node or any of its children let (best_node, best_split_result, n_shared_muts) = find_shared_muts(graph, current_best_node, &private_mutations)?; // Check if the new candidate node is better than the current best @@ -121,21 +120,23 @@ pub fn finetune_nearest_node( Some(better_node) => current_best_node = better_node, } - // Update query mutations to adjust for its new position + // Update query mutations to adjust for the new position of the placed node private_mutations = update_private_mutations(&private_mutations, current_best_node, &best_split_result)?; } Ok((current_best_node.key(), private_mutations)) } +/// Check how many mutations are shared with the branch leading to the current_best_node or any of its children fn find_shared_muts<'g>( graph: &'g AuspiceGraph, current_best_node: &'g Node, private_mutations: &BranchMutations, ) -> Result<(&'g Node, SplitMutsResult, usize), Report> { let mut best_node = current_best_node; + let (mut best_split_result, mut n_shared_muts) = if current_best_node.is_root() { - // don't include node if node is root as we don't attach nodes above the root + // Don't include node if node is root as we don't attach nodes above the root let best_split_result = SplitMutsResult { left: private_mutations.clone(), right: BranchMutations::default(), @@ -157,7 +158,7 @@ fn find_shared_muts<'g>( (best_split_result, n_shared_muts) }; - // check all child nodes for shared mutations + // Check all child nodes for shared mutations for child in graph.iter_children_of(current_best_node) { let tmp_split_result = split_muts(&child.payload().tmp.private_mutations, private_mutations).wrap_err_with(|| { @@ -176,7 +177,7 @@ fn find_shared_muts<'g>( Ok((best_node, best_split_result, n_shared_muts)) } -/// Find out if the candidate node is better than the current best (with caveats) +/// Find out if the candidate node is better than the current best (with caveats). /// Return a better node or `None` (if the current best node is to be preserved). fn find_better_node_maybe<'g>( graph: &'g AuspiceGraph, @@ -207,7 +208,7 @@ fn find_better_node_maybe<'g>( }) } -/// Update private mutations to match the new best_node +/// Update private mutations to match the new best node fn update_private_mutations( private_mutations: &BranchMutations, current_best_node: &Node, From 0cb9c0181c9f3a518f7105032adad68973af0999 Mon Sep 17 00:00:00 2001 From: ivan-aksamentov Date: Mon, 11 Sep 2023 08:05:51 +0200 Subject: [PATCH 14/19] feat: improve error handling --- .../nextclade/src/tree/tree_builder.rs | 44 ++++++++++--------- 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/packages_rs/nextclade/src/tree/tree_builder.rs b/packages_rs/nextclade/src/tree/tree_builder.rs index a3e5de4645..b6d7b3d7d0 100644 --- a/packages_rs/nextclade/src/tree/tree_builder.rs +++ b/packages_rs/nextclade/src/tree/tree_builder.rs @@ -112,16 +112,27 @@ pub fn finetune_nearest_node( loop { // Check how many mutations are shared with the branch leading to the current_best_node or any of its children - let (best_node, best_split_result, n_shared_muts) = find_shared_muts(graph, current_best_node, &private_mutations)?; + let (best_node, best_split_result, n_shared_muts) = find_shared_muts(graph, current_best_node, &private_mutations) + .wrap_err_with(|| { + format!( + "When calculating shared mutations against the current best node '{}'", + current_best_node.payload().name + ) + })?; // Check if the new candidate node is better than the current best - match find_better_node_maybe(graph, current_best_node, best_node, &best_split_result, n_shared_muts)? { + match find_better_node_maybe(graph, current_best_node, best_node, &best_split_result, n_shared_muts) { None => break, Some(better_node) => current_best_node = better_node, } // Update query mutations to adjust for the new position of the placed node - private_mutations = update_private_mutations(&private_mutations, current_best_node, &best_split_result)?; + private_mutations = update_private_mutations(&private_mutations, &best_split_result).wrap_err_with(|| { + format!( + "When updating private mutations against the current best node '{}'", + current_best_node.payload().name + ) + })?; } Ok((current_best_node.key(), private_mutations)) @@ -185,9 +196,9 @@ fn find_better_node_maybe<'g>( best_node: &'g Node, best_split_result: &SplitMutsResult, n_shared_muts: usize, -) -> Result>, Report> { +) -> Option<&'g Node> { // if shared mutations are found, the current_best_node is updated - Ok(if n_shared_muts > 0 { + if n_shared_muts > 0 { if best_node == current_best_node && best_split_result.left.nuc_muts.is_empty() { // Caveat: all mutations from the parent to the node are shared with private mutations. Move up to the parent. graph.parent_of(best_node) @@ -196,7 +207,7 @@ fn find_better_node_maybe<'g>( None } else { // The best node is child - Some(graph.get_node(best_node.key())?) + Some(best_node) } } else if current_best_node.is_leaf() && !current_best_node.is_root() @@ -205,33 +216,26 @@ fn find_better_node_maybe<'g>( graph.parent_of(best_node) } else { None - }) + } } /// Update private mutations to match the new best node fn update_private_mutations( private_mutations: &BranchMutations, - current_best_node: &Node, best_split_result: &SplitMutsResult, ) -> Result { // Step 1: subtract shared mutations from private mutations - let private_mutations = difference_of_muts(private_mutations, &best_split_result.shared).wrap_err_with(|| { - format!( - "When calculating difference of mutations between query sequence and the branch leading to the next attachment point '{}'", - current_best_node.payload().name - ) - })?; + let private_mutations = difference_of_muts(private_mutations, &best_split_result.shared).wrap_err( + "When calculating difference of mutations between query sequence and the branch leading to the next attachment point" + )?; // Step 2: We need to add the inverted remaining mutations on that branch. // Note that this can be necessary even if there are no left-over nuc_subs. // Amino acid mutations can be decoupled from the their nucleotide mutations or // changes in the amino acid sequences due to mutations in the same codon still need handling. - let private_mutations = union_of_muts(&private_mutations, &best_split_result.left.invert()).wrap_err_with(|| { - format!( - "When calculating union of mutations between query sequence and the branch leading to the next attachment point '{}'", - current_best_node.payload().name - ) - })?; + let private_mutations = union_of_muts(&private_mutations, &best_split_result.left.invert()).wrap_err( + "When calculating union of mutations between query sequence and the branch leading to the next attachment point.", + )?; Ok(private_mutations) } From a44299945d47ae77d9115059bbbbb40564509c1f Mon Sep 17 00:00:00 2001 From: ivan-aksamentov Date: Mon, 11 Sep 2023 08:33:02 +0200 Subject: [PATCH 15/19] refactor: change terminology of var names in tree finetuning --- .../nextclade/src/tree/tree_builder.rs | 93 +++++++++---------- 1 file changed, 43 insertions(+), 50 deletions(-) diff --git a/packages_rs/nextclade/src/tree/tree_builder.rs b/packages_rs/nextclade/src/tree/tree_builder.rs index b6d7b3d7d0..b051f101d2 100644 --- a/packages_rs/nextclade/src/tree/tree_builder.rs +++ b/packages_rs/nextclade/src/tree/tree_builder.rs @@ -107,113 +107,106 @@ pub fn finetune_nearest_node( nearest_node_key: GraphNodeKey, seq_private_mutations: &BranchMutations, ) -> Result<(GraphNodeKey, BranchMutations), Report> { - let mut current_best_node = graph.get_node(nearest_node_key)?; + let mut best_node = graph.get_node(nearest_node_key)?; let mut private_mutations = seq_private_mutations.clone(); loop { // Check how many mutations are shared with the branch leading to the current_best_node or any of its children - let (best_node, best_split_result, n_shared_muts) = find_shared_muts(graph, current_best_node, &private_mutations) + let (candidate_node, candidate_split, n_shared_muts) = find_shared_muts(graph, best_node, &private_mutations) .wrap_err_with(|| { format!( "When calculating shared mutations against the current best node '{}'", - current_best_node.payload().name + best_node.payload().name ) })?; // Check if the new candidate node is better than the current best - match find_better_node_maybe(graph, current_best_node, best_node, &best_split_result, n_shared_muts) { + match find_better_node_maybe(graph, best_node, candidate_node, &candidate_split, n_shared_muts) { None => break, - Some(better_node) => current_best_node = better_node, + Some(better_node) => best_node = better_node, } // Update query mutations to adjust for the new position of the placed node - private_mutations = update_private_mutations(&private_mutations, &best_split_result).wrap_err_with(|| { + private_mutations = update_private_mutations(&private_mutations, &candidate_split).wrap_err_with(|| { format!( "When updating private mutations against the current best node '{}'", - current_best_node.payload().name + best_node.payload().name ) })?; } - Ok((current_best_node.key(), private_mutations)) + Ok((best_node.key(), private_mutations)) } /// Check how many mutations are shared with the branch leading to the current_best_node or any of its children fn find_shared_muts<'g>( graph: &'g AuspiceGraph, - current_best_node: &'g Node, + best_node: &'g Node, private_mutations: &BranchMutations, ) -> Result<(&'g Node, SplitMutsResult, usize), Report> { - let mut best_node = current_best_node; - - let (mut best_split_result, mut n_shared_muts) = if current_best_node.is_root() { + let (mut candidate_split, mut n_shared_muts) = if best_node.is_root() { // Don't include node if node is root as we don't attach nodes above the root - let best_split_result = SplitMutsResult { + let candidate_split = SplitMutsResult { left: private_mutations.clone(), right: BranchMutations::default(), shared: BranchMutations::default(), }; - (best_split_result, 0) + (candidate_split, 0) } else { - let best_split_result = split_muts( - ¤t_best_node.payload().tmp.private_mutations.invert(), - private_mutations, - ) - .wrap_err_with(|| { - format!( - "When splitting mutations between query sequence and the nearest node '{}'", - current_best_node.payload().name - ) - })?; - let n_shared_muts = count_nuc_muts(&best_split_result.shared.nuc_muts); - (best_split_result, n_shared_muts) - }; - - // Check all child nodes for shared mutations - for child in graph.iter_children_of(current_best_node) { - let tmp_split_result = - split_muts(&child.payload().tmp.private_mutations, private_mutations).wrap_err_with(|| { + let candidate_split = split_muts(&best_node.payload().tmp.private_mutations.invert(), private_mutations) + .wrap_err_with(|| { format!( - "When splitting mutations between query sequence and the child node '{}'", - child.payload().name + "When splitting mutations between query sequence and the nearest node '{}'", + best_node.payload().name ) })?; - let tmp_n_shared_muts = count_nuc_muts(&tmp_split_result.shared.nuc_muts); - if tmp_n_shared_muts > n_shared_muts { - n_shared_muts = tmp_n_shared_muts; - best_split_result = tmp_split_result; - best_node = child; + let n_shared_muts = count_nuc_muts(&candidate_split.shared.nuc_muts); + (candidate_split, n_shared_muts) + }; + + // Check all child nodes for shared mutations + let mut candidate_node = best_node; + for child in graph.iter_children_of(best_node) { + let child_split = split_muts(&child.payload().tmp.private_mutations, private_mutations).wrap_err_with(|| { + format!( + "When splitting mutations between query sequence and the child node '{}'", + child.payload().name + ) + })?; + let child_n_shared_muts = count_nuc_muts(&child_split.shared.nuc_muts); + if child_n_shared_muts > n_shared_muts { + n_shared_muts = child_n_shared_muts; + candidate_split = child_split; + candidate_node = child; } } - Ok((best_node, best_split_result, n_shared_muts)) + Ok((candidate_node, candidate_split, n_shared_muts)) } /// Find out if the candidate node is better than the current best (with caveats). /// Return a better node or `None` (if the current best node is to be preserved). fn find_better_node_maybe<'g>( graph: &'g AuspiceGraph, - current_best_node: &'g Node, best_node: &'g Node, - best_split_result: &SplitMutsResult, + candidate_node: &'g Node, + candidate_split: &SplitMutsResult, n_shared_muts: usize, ) -> Option<&'g Node> { // if shared mutations are found, the current_best_node is updated if n_shared_muts > 0 { - if best_node == current_best_node && best_split_result.left.nuc_muts.is_empty() { + if candidate_node == best_node && candidate_split.left.nuc_muts.is_empty() { // Caveat: all mutations from the parent to the node are shared with private mutations. Move up to the parent. - graph.parent_of(best_node) - } else if best_node == current_best_node { + graph.parent_of(candidate_node) + } else if candidate_node == best_node { // The best node is the current node. Break. None } else { // The best node is child - Some(best_node) + Some(candidate_node) } - } else if current_best_node.is_leaf() - && !current_best_node.is_root() - && current_best_node.payload().tmp.private_mutations.nuc_muts.is_empty() + } else if best_node.is_leaf() && !best_node.is_root() && best_node.payload().tmp.private_mutations.nuc_muts.is_empty() { - graph.parent_of(best_node) + graph.parent_of(candidate_node) } else { None } From 52dd4f87b77789c3f4503ffe2bfdb5f821e127fc Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Mon, 11 Sep 2023 15:01:08 +0200 Subject: [PATCH 16/19] refactor: simplify find_better_node_maybe by checking first whether the candidate_node equals the current best node, the set of conditionals gets simplified and multiple previous conditions are combined into one. There are now three possible results: - move to the parent node. Happens whenever best_node==candidate_node and all mutations that lead to the candidate_node are also found in the private mutations of best_node - move to a child (candidate_node!=best_node). This only happens when there is at least one mutation shared. - stay: in this case, the final placing is either a direct child of candidate_node, or splits the branch leading to it. --- .../nextclade/src/tree/tree_builder.rs | 29 +++++++++---------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/packages_rs/nextclade/src/tree/tree_builder.rs b/packages_rs/nextclade/src/tree/tree_builder.rs index b051f101d2..04616d660c 100644 --- a/packages_rs/nextclade/src/tree/tree_builder.rs +++ b/packages_rs/nextclade/src/tree/tree_builder.rs @@ -192,24 +192,21 @@ fn find_better_node_maybe<'g>( candidate_split: &SplitMutsResult, n_shared_muts: usize, ) -> Option<&'g Node> { - // if shared mutations are found, the current_best_node is updated - if n_shared_muts > 0 { - if candidate_node == best_node && candidate_split.left.nuc_muts.is_empty() { - // Caveat: all mutations from the parent to the node are shared with private mutations. Move up to the parent. - graph.parent_of(candidate_node) - } else if candidate_node == best_node { - // The best node is the current node. Break. - None - } else { - // The best node is child - Some(candidate_node) + if candidate_node == best_node { + // best node is the node itself. Move up the tree if all mutations between + // the candidate node and its parent are also in the private mutations. + // This covers the case where the candidate is a leaf with zero length branch + // as the .left.nuc_muts is emtpy in that case + if candidate_split.left.nuc_muts.is_empty() { + return graph.parent_of(candidate_node); } - } else if best_node.is_leaf() && !best_node.is_root() && best_node.payload().tmp.private_mutations.nuc_muts.is_empty() - { - graph.parent_of(candidate_node) - } else { - None + } else if n_shared_muts > 0 { + // candidate node is child node, move to child node if there are shared mutations + // this should always be the case if the candidate node != best_node + return Some(candidate_node); } + // no improvement possible. Return None to stay + None } /// Update private mutations to match the new best node From 41f79fb6d04b29e1c1765fdd60b222717f07bd1c Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Mon, 11 Sep 2023 15:56:30 +0200 Subject: [PATCH 17/19] fix: only consider acgt mutations when moving up the tree - replace `.left.nuc_muts.is_empty()` with a check for `n_left_muts==0` which is calculated using the same function as `n_shared_muts`. - fix left/right error is split result when attaching to the root --- packages_rs/nextclade/src/tree/tree_builder.rs | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/packages_rs/nextclade/src/tree/tree_builder.rs b/packages_rs/nextclade/src/tree/tree_builder.rs index 04616d660c..a6e992a3ad 100644 --- a/packages_rs/nextclade/src/tree/tree_builder.rs +++ b/packages_rs/nextclade/src/tree/tree_builder.rs @@ -121,7 +121,8 @@ pub fn finetune_nearest_node( })?; // Check if the new candidate node is better than the current best - match find_better_node_maybe(graph, best_node, candidate_node, &candidate_split, n_shared_muts) { + let n_left_muts = count_nuc_muts(&candidate_split.left.nuc_muts); + match find_better_node_maybe(graph, best_node, candidate_node, n_shared_muts, n_left_muts) { None => break, Some(better_node) => best_node = better_node, } @@ -147,8 +148,8 @@ fn find_shared_muts<'g>( let (mut candidate_split, mut n_shared_muts) = if best_node.is_root() { // Don't include node if node is root as we don't attach nodes above the root let candidate_split = SplitMutsResult { - left: private_mutations.clone(), - right: BranchMutations::default(), + left: BranchMutations::default(), + right: private_mutations.clone(), shared: BranchMutations::default(), }; (candidate_split, 0) @@ -189,15 +190,15 @@ fn find_better_node_maybe<'g>( graph: &'g AuspiceGraph, best_node: &'g Node, candidate_node: &'g Node, - candidate_split: &SplitMutsResult, n_shared_muts: usize, + n_left_muts: usize, ) -> Option<&'g Node> { if candidate_node == best_node { // best node is the node itself. Move up the tree if all mutations between // the candidate node and its parent are also in the private mutations. // This covers the case where the candidate is a leaf with zero length branch // as the .left.nuc_muts is emtpy in that case - if candidate_split.left.nuc_muts.is_empty() { + if n_left_muts == 0 { return graph.parent_of(candidate_node); } } else if n_shared_muts > 0 { From d25c973c778362c970129a58c3cab8030e83b48a Mon Sep 17 00:00:00 2001 From: ivan-aksamentov Date: Mon, 11 Sep 2023 18:09:40 +0200 Subject: [PATCH 18/19] feat: account for placement mask, compare nodes based on mut score --- .../nextclade/src/analyze/divergence.rs | 17 +++++++ .../nextclade/src/run/nextclade_wasm.rs | 2 +- packages_rs/nextclade/src/tree/params.rs | 4 ++ .../nextclade/src/tree/tree_builder.rs | 46 +++++++++++-------- 4 files changed, 48 insertions(+), 21 deletions(-) diff --git a/packages_rs/nextclade/src/analyze/divergence.rs b/packages_rs/nextclade/src/analyze/divergence.rs index 381a509abe..d54c2231cc 100644 --- a/packages_rs/nextclade/src/analyze/divergence.rs +++ b/packages_rs/nextclade/src/analyze/divergence.rs @@ -1,4 +1,6 @@ use crate::analyze::nuc_sub::NucSub; +use crate::coord::range::NucRefGlobalRange; +use crate::tree::params::TreeBuilderParams; use crate::tree::tree::DivergenceUnits; /// Calculate number of nuc muts, only considering ACGT characters @@ -24,3 +26,18 @@ pub fn calculate_branch_length( this_div } + +/// Calculate nuc mut score +pub fn score_nuc_muts(nuc_muts: &[NucSub], masked_ranges: &[NucRefGlobalRange], params: &TreeBuilderParams) -> f64 { + // Only consider ACGT characters + let nuc_muts = nuc_muts.iter().filter(|m| m.ref_nuc.is_acgt() && m.qry_nuc.is_acgt()); + + // Split away masked mutations + let (muts, masked_muts): (Vec<_>, Vec<_>) = + nuc_muts.partition(|m| masked_ranges.iter().any(|range| range.contains(m.pos))); + + let n_muts = muts.iter().count() as f64; + let n_masked_muts = masked_muts.iter().count() as f64; + + return n_muts + n_masked_muts * params.masked_muts_weight; +} diff --git a/packages_rs/nextclade/src/run/nextclade_wasm.rs b/packages_rs/nextclade/src/run/nextclade_wasm.rs index a8fcbbc77e..0873e8537f 100644 --- a/packages_rs/nextclade/src/run/nextclade_wasm.rs +++ b/packages_rs/nextclade/src/run/nextclade_wasm.rs @@ -257,7 +257,7 @@ impl Nextclade { pub fn get_output_trees(&mut self, results: Vec) -> Result, Report> { if let Some(graph) = &mut self.graph { - graph_attach_new_nodes_in_place(graph, results, self.ref_seq.len(), &self.params.tree_builder)?; + graph_attach_new_nodes_in_place(graph, results, self.ref_seq.len(), &self.params.tree_builder)?; let auspice = convert_graph_to_auspice_tree(graph)?; let nwk = convert_graph_to_nwk_string(graph)?; Ok(Some(OutputTrees { auspice, nwk })) diff --git a/packages_rs/nextclade/src/tree/params.rs b/packages_rs/nextclade/src/tree/params.rs index 4171ee034b..b3b05803bc 100644 --- a/packages_rs/nextclade/src/tree/params.rs +++ b/packages_rs/nextclade/src/tree/params.rs @@ -14,6 +14,9 @@ pub struct TreeBuilderParams { #[clap(long)] #[clap(num_args=0..=1, default_missing_value = "true")] pub without_greedy_tree_builder: bool, + + #[clap(long)] + pub masked_muts_weight: f64, } #[allow(clippy::derivable_impls)] @@ -21,6 +24,7 @@ impl Default for TreeBuilderParams { fn default() -> Self { Self { without_greedy_tree_builder: false, + masked_muts_weight: 0.5, } } } diff --git a/packages_rs/nextclade/src/tree/tree_builder.rs b/packages_rs/nextclade/src/tree/tree_builder.rs index a6e992a3ad..84f74563c8 100644 --- a/packages_rs/nextclade/src/tree/tree_builder.rs +++ b/packages_rs/nextclade/src/tree/tree_builder.rs @@ -1,9 +1,10 @@ use crate::analyze::aa_del::AaDel; use crate::analyze::aa_sub::AaSub; -use crate::analyze::divergence::{calculate_branch_length, count_nuc_muts}; +use crate::analyze::divergence::{calculate_branch_length, count_nuc_muts, score_nuc_muts}; use crate::analyze::find_private_nuc_mutations::BranchMutations; use crate::analyze::nuc_del::NucDel; use crate::analyze::nuc_sub::NucSub; +use crate::coord::range::NucRefGlobalRange; use crate::graph::node::{GraphNodeKey, Node}; use crate::tree::params::TreeBuilderParams; use crate::tree::split_muts::{difference_of_muts, split_muts, union_of_muts, SplitMutsResult}; @@ -86,7 +87,7 @@ pub fn graph_attach_new_node_in_place( } else { // for the attachment on the reference tree ('result') fine tune the position // on the updated graph to minimize the number of private mutations - finetune_nearest_node(graph, result.nearest_node_id, &mutations_seq)? + finetune_nearest_node(graph, result.nearest_node_id, &mutations_seq, params)? }; // add the new node at the fine tuned position while accounting for shared mutations @@ -106,14 +107,17 @@ pub fn finetune_nearest_node( graph: &AuspiceGraph, nearest_node_key: GraphNodeKey, seq_private_mutations: &BranchMutations, + params: &TreeBuilderParams, ) -> Result<(GraphNodeKey, BranchMutations), Report> { + let masked_ranges = graph.data.meta.placement_mask_ranges(); + let mut best_node = graph.get_node(nearest_node_key)?; let mut private_mutations = seq_private_mutations.clone(); loop { // Check how many mutations are shared with the branch leading to the current_best_node or any of its children - let (candidate_node, candidate_split, n_shared_muts) = find_shared_muts(graph, best_node, &private_mutations) - .wrap_err_with(|| { + let (candidate_node, candidate_split, shared_muts_score) = + find_shared_muts(graph, best_node, &private_mutations, masked_ranges, params).wrap_err_with(|| { format!( "When calculating shared mutations against the current best node '{}'", best_node.payload().name @@ -121,8 +125,8 @@ pub fn finetune_nearest_node( })?; // Check if the new candidate node is better than the current best - let n_left_muts = count_nuc_muts(&candidate_split.left.nuc_muts); - match find_better_node_maybe(graph, best_node, candidate_node, n_shared_muts, n_left_muts) { + let left_muts_score = score_nuc_muts(&candidate_split.left.nuc_muts, masked_ranges, params); + match find_better_node_maybe(graph, best_node, candidate_node, shared_muts_score, left_muts_score) { None => break, Some(better_node) => best_node = better_node, } @@ -144,15 +148,17 @@ fn find_shared_muts<'g>( graph: &'g AuspiceGraph, best_node: &'g Node, private_mutations: &BranchMutations, -) -> Result<(&'g Node, SplitMutsResult, usize), Report> { - let (mut candidate_split, mut n_shared_muts) = if best_node.is_root() { + masked_ranges: &[NucRefGlobalRange], + params: &TreeBuilderParams, +) -> Result<(&'g Node, SplitMutsResult, f64), Report> { + let (mut candidate_split, mut shared_muts_score) = if best_node.is_root() { // Don't include node if node is root as we don't attach nodes above the root let candidate_split = SplitMutsResult { left: BranchMutations::default(), right: private_mutations.clone(), shared: BranchMutations::default(), }; - (candidate_split, 0) + (candidate_split, 0.0) } else { let candidate_split = split_muts(&best_node.payload().tmp.private_mutations.invert(), private_mutations) .wrap_err_with(|| { @@ -161,8 +167,8 @@ fn find_shared_muts<'g>( best_node.payload().name ) })?; - let n_shared_muts = count_nuc_muts(&candidate_split.shared.nuc_muts); - (candidate_split, n_shared_muts) + let shared_muts_score = score_nuc_muts(&candidate_split.shared.nuc_muts, masked_ranges, params); + (candidate_split, shared_muts_score) }; // Check all child nodes for shared mutations @@ -174,14 +180,14 @@ fn find_shared_muts<'g>( child.payload().name ) })?; - let child_n_shared_muts = count_nuc_muts(&child_split.shared.nuc_muts); - if child_n_shared_muts > n_shared_muts { - n_shared_muts = child_n_shared_muts; + let child_shared_muts_score = score_nuc_muts(&candidate_split.shared.nuc_muts, masked_ranges, params); + if child_shared_muts_score > shared_muts_score { + shared_muts_score = child_shared_muts_score; candidate_split = child_split; candidate_node = child; } } - Ok((candidate_node, candidate_split, n_shared_muts)) + Ok((candidate_node, candidate_split, shared_muts_score)) } /// Find out if the candidate node is better than the current best (with caveats). @@ -190,18 +196,18 @@ fn find_better_node_maybe<'g>( graph: &'g AuspiceGraph, best_node: &'g Node, candidate_node: &'g Node, - n_shared_muts: usize, - n_left_muts: usize, + shared_muts_score: f64, + left_muts_score: f64, ) -> Option<&'g Node> { if candidate_node == best_node { // best node is the node itself. Move up the tree if all mutations between // the candidate node and its parent are also in the private mutations. // This covers the case where the candidate is a leaf with zero length branch - // as the .left.nuc_muts is emtpy in that case - if n_left_muts == 0 { + // as the .left.nuc_muts is empty in that case + if left_muts_score == 0.0 { return graph.parent_of(candidate_node); } - } else if n_shared_muts > 0 { + } else if shared_muts_score > 0.0 { // candidate node is child node, move to child node if there are shared mutations // this should always be the case if the candidate node != best_node return Some(candidate_node); From ddd238952d754304821583f4198d18eec83e730f Mon Sep 17 00:00:00 2001 From: Richard Neher Date: Mon, 11 Sep 2023 22:51:17 +0200 Subject: [PATCH 19/19] feat: fix splitting into masked+unmasked and split correct set --- packages_rs/nextclade/src/analyze/divergence.rs | 9 ++++----- packages_rs/nextclade/src/tree/params.rs | 2 +- packages_rs/nextclade/src/tree/tree_builder.rs | 3 +-- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/packages_rs/nextclade/src/analyze/divergence.rs b/packages_rs/nextclade/src/analyze/divergence.rs index d54c2231cc..bf389863d9 100644 --- a/packages_rs/nextclade/src/analyze/divergence.rs +++ b/packages_rs/nextclade/src/analyze/divergence.rs @@ -33,11 +33,10 @@ pub fn score_nuc_muts(nuc_muts: &[NucSub], masked_ranges: &[NucRefGlobalRange], let nuc_muts = nuc_muts.iter().filter(|m| m.ref_nuc.is_acgt() && m.qry_nuc.is_acgt()); // Split away masked mutations - let (muts, masked_muts): (Vec<_>, Vec<_>) = + let (masked_muts, muts): (Vec<_>, Vec<_>) = nuc_muts.partition(|m| masked_ranges.iter().any(|range| range.contains(m.pos))); - let n_muts = muts.iter().count() as f64; - let n_masked_muts = masked_muts.iter().count() as f64; - - return n_muts + n_masked_muts * params.masked_muts_weight; + let n_muts = muts.len() as f64; + let n_masked_muts = masked_muts.len() as f64; + n_muts + n_masked_muts * params.masked_muts_weight } diff --git a/packages_rs/nextclade/src/tree/params.rs b/packages_rs/nextclade/src/tree/params.rs index b3b05803bc..d751d2324c 100644 --- a/packages_rs/nextclade/src/tree/params.rs +++ b/packages_rs/nextclade/src/tree/params.rs @@ -24,7 +24,7 @@ impl Default for TreeBuilderParams { fn default() -> Self { Self { without_greedy_tree_builder: false, - masked_muts_weight: 0.5, + masked_muts_weight: 0.05, } } } diff --git a/packages_rs/nextclade/src/tree/tree_builder.rs b/packages_rs/nextclade/src/tree/tree_builder.rs index 84f74563c8..6955ca65ae 100644 --- a/packages_rs/nextclade/src/tree/tree_builder.rs +++ b/packages_rs/nextclade/src/tree/tree_builder.rs @@ -110,7 +110,6 @@ pub fn finetune_nearest_node( params: &TreeBuilderParams, ) -> Result<(GraphNodeKey, BranchMutations), Report> { let masked_ranges = graph.data.meta.placement_mask_ranges(); - let mut best_node = graph.get_node(nearest_node_key)?; let mut private_mutations = seq_private_mutations.clone(); @@ -180,7 +179,7 @@ fn find_shared_muts<'g>( child.payload().name ) })?; - let child_shared_muts_score = score_nuc_muts(&candidate_split.shared.nuc_muts, masked_ranges, params); + let child_shared_muts_score = score_nuc_muts(&child_split.shared.nuc_muts, masked_ranges, params); if child_shared_muts_score > shared_muts_score { shared_muts_score = child_shared_muts_score; candidate_split = child_split;