From 4581a12275a4bc35599dd5fcf545cee0048c27a7 Mon Sep 17 00:00:00 2001
From: ivan-aksamentov <ivan.aksamentov@gmail.com>
Date: Mon, 7 Oct 2024 17:54:26 +0200
Subject: [PATCH] fix: vote for split node's clade to prevent mismatch

After the query node placement is adjusted during the [greedy tree building](https://docs.nextstrain.org/projects/nextclade/en/stable/user/algorithm/03-phylogenetic-placement.html#tree-building), sometimes the branch needs to be split and a new internal node inserted.

Currently we copy the clade of this internal node from the attachment target node. However, this is not always correct and can lead to a mismatch between the clade of the query node and that of the new internal node.

Here I add a voting mechanism (simply a mode) among the clades involved: those of the parent, target and query nodes.
---
 packages/nextclade/src/tree/tree_builder.rs | 26 ++++++++++++++++++++-
 packages/nextclade/src/utils/mod.rs         |  1 +
 packages/nextclade/src/utils/stats.rs       | 13 +++++++++++
 3 files changed, 39 insertions(+), 1 deletion(-)
 create mode 100644 packages/nextclade/src/utils/stats.rs
diff --git a/packages/nextclade/src/tree/tree_builder.rs b/packages/nextclade/src/tree/tree_builder.rs
index f9a50fd9d..1d05bad84 100644
--- a/packages/nextclade/src/tree/tree_builder.rs
+++ b/packages/nextclade/src/tree/tree_builder.rs
@@ -9,11 +9,14 @@ use crate::graph::node::{GraphNodeKey, Node};
 use crate::io::json::{json_stringify, JsonPretty};
 use crate::tree::params::TreeBuilderParams;
 use crate::tree::split_muts::{difference_of_muts, split_muts, union_of_muts, SplitMutsResult};
-use crate::tree::tree::{AuspiceGraph, AuspiceGraphEdgePayload, AuspiceGraphNodePayload, TreeBranchAttrsLabels};
+use crate::tree::tree::{
+  AuspiceGraph, AuspiceGraphEdgePayload, AuspiceGraphNodePayload, TreeBranchAttrsLabels, TreeNodeAttr,
+};
 use crate::tree::tree_attach_new_nodes::create_new_auspice_node;
 use crate::tree::tree_preprocess::add_auspice_metadata_in_place;
 use crate::types::outputs::NextcladeOutputs;
 use crate::utils::collections::concat_to_vec;
+use crate::utils::stats::mode;
 use eyre::{Report, WrapErr};
 use itertools::Itertools;
 use serde_json::json;
@@ -474,6 +477,9 @@ pub fn knit_into_graph(
       }
       set_branch_attrs_aa_labels(&mut new_internal_node);
 
+      // Vote for the most plausible clade
+      new_internal_node.node_attrs.clade_membership = vote_for_clade(graph, target_node, result);
+
       new_internal_node.name = {
         let qry_name = &result.seq_name;
         let qry_index = &result.index;
@@ -536,3 +542,21 @@ fn set_branch_attrs_aa_labels(node: &mut AuspiceGraphNodePayload) {
     });
   }
 }
+
+/// Vote for the most plausible clade of the new internal node inserted when a branch is split.
+///
+/// The candidates are the clades of the parent of `target_node` (if it has a parent), of the query
+/// sequence and of `target_node` itself. Missing clades do not take part in the vote. The most frequent
+/// candidate wins; `None` is returned when no candidate has a clade.
+fn vote_for_clade(
+  graph: &AuspiceGraph,
+  target_node: &Node<AuspiceGraphNodePayload>,
+  result: &NextcladeOutputs,
+) -> Option<TreeNodeAttr> {
+  let parent_clade = graph.parent_of(target_node).and_then(|parent| parent.payload().clade());
+  let target_clade = target_node.payload().clade();
+  let candidates = [&parent_clade, &result.clade, &target_clade];
+
+  // `flatten()` drops the candidates that have no clade
+  mode(candidates.into_iter().flatten()).map(|clade| TreeNodeAttr::new(clade))
+}
diff --git a/packages/nextclade/src/utils/mod.rs b/packages/nextclade/src/utils/mod.rs
index 1b6f01f18..276f73aa8 100644
--- a/packages/nextclade/src/utils/mod.rs
+++ b/packages/nextclade/src/utils/mod.rs
@@ -10,6 +10,7 @@ pub mod info;
 pub mod map;
 pub mod num;
 pub mod option;
+pub mod stats;
 pub mod string;
 pub mod vec2d;
 pub mod wraparound;
diff --git a/packages/nextclade/src/utils/stats.rs b/packages/nextclade/src/utils/stats.rs
new file mode 100644
index 000000000..35b82db0c
--- /dev/null
+++ b/packages/nextclade/src/utils/stats.rs
@@ -0,0 +1,13 @@
+use itertools::Itertools;
+use std::hash::Hash;
+
+/// Calculate mode (the most frequently occurring element) of an iterator.
+///
+/// In case of a tie, the element that occurs first in the input wins, which keeps the result deterministic.
+/// Returns `None` if the iterator is empty.
+pub fn mode<T: Hash + Eq + Clone>(items: impl IntoIterator<Item = T>) -> Option<T> {
+  let items = items.into_iter().collect_vec();
+  let counts = items.iter().counts();
+  // `max_by_key` keeps the last maximum, so scan in reverse to make the earliest element win ties
+  items.iter().rev().max_by_key(|item| counts[item]).cloned()
+}