From 36bc43f588b23dc2fb360181839e373c70c5ce8f Mon Sep 17 00:00:00 2001 From: ivan-aksamentov Date: Tue, 3 Oct 2023 10:32:25 +0200 Subject: [PATCH] fix(cli): prevent sort command from generating duplicate sequences The `nextclade sort` subcommand has been writing duplicate output sequences, one for each dataset suggestion, due to invalid calculation of possible dataset name prefixes: the prefixes were deduplicated separately for each suggested dataset rather than across all suggestions for a sequence. Here I fix that. --- .../src/cli/nextclade_seq_sort.rs | 47 +++++++++++-------- 1 file changed, 28 insertions(+), 19 deletions(-) diff --git a/packages_rs/nextclade-cli/src/cli/nextclade_seq_sort.rs b/packages_rs/nextclade-cli/src/cli/nextclade_seq_sort.rs index 847a3d1c4..3d864617e 100644 --- a/packages_rs/nextclade-cli/src/cli/nextclade_seq_sort.rs +++ b/packages_rs/nextclade-cli/src/cli/nextclade_seq_sort.rs @@ -187,8 +187,6 @@ fn writer_thread( } for dataset in datasets { - let name = &dataset.name; - results_csv.map_mut_fallible(|results_csv| { results_csv.write(&SeqSortCsvEntry { seq_name: &record.fasta_record.seq_name, @@ -197,24 +195,22 @@ fn writer_thread( num_hits: Some(dataset.n_hits), }) })?; + } - let names = name - .split('/') - .scan(PathBuf::new(), |name, component| { - *name = name.join(component); - Some(name.clone()) - }) - .unique() - .map(path_to_string) - .collect::, Report>>()?; - - for name in names { - let filepath = get_filepath(&name, &template, output_dir)?; - - if let Some(filepath) = filepath { - let writer = get_or_insert_writer(&mut writers, filepath)?; - writer.write(&record.fasta_record.seq_name, &record.fasta_record.seq, false)?; - } + let names = datasets + .iter() + .map(|dataset| get_all_prefix_names(&dataset.name)) + .collect::>, Report>>()? 
+ .into_iter() + .flatten() + .unique(); + + for name in names { + let filepath = get_filepath(&name, &template, output_dir)?; + + if let Some(filepath) = filepath { + let writer = get_or_insert_writer(&mut writers, filepath)?; + writer.write(&record.fasta_record.seq_name, &record.fasta_record.seq, false)?; } } } @@ -224,6 +220,19 @@ fn writer_thread( Ok(()) } +pub fn get_all_prefix_names(name: impl AsRef) -> Result, Report> { + name + .as_ref() + .split('/') + .scan(PathBuf::new(), |name, component| { + *name = name.join(component); + Some(name.clone()) + }) + .unique() + .map(path_to_string) + .collect() +} + struct StatsPrinter { enabled: bool, stats: BTreeMap,