Skip to content

Commit

Permalink
fix(cli): prevent sort command from generating duplicate sequences
Browse files Browse the repository at this point in the history
The `nextclade sort` subcommand has been writing duplicate output sequences, one for each dataset suggestion, due to an incorrect calculation of the possible dataset name prefixes. This commit fixes that.
  • Loading branch information
ivan-aksamentov committed Oct 3, 2023
1 parent aaac7ce commit 36bc43f
Showing 1 changed file with 28 additions and 19 deletions.
47 changes: 28 additions & 19 deletions packages_rs/nextclade-cli/src/cli/nextclade_seq_sort.rs
Original file line number Diff line number Diff line change
Expand Up @@ -187,8 +187,6 @@ fn writer_thread(
}

for dataset in datasets {
let name = &dataset.name;

results_csv.map_mut_fallible(|results_csv| {
results_csv.write(&SeqSortCsvEntry {
seq_name: &record.fasta_record.seq_name,
Expand All @@ -197,24 +195,22 @@ fn writer_thread(
num_hits: Some(dataset.n_hits),
})
})?;
}

let names = name
.split('/')
.scan(PathBuf::new(), |name, component| {
*name = name.join(component);
Some(name.clone())
})
.unique()
.map(path_to_string)
.collect::<Result<Vec<String>, Report>>()?;

for name in names {
let filepath = get_filepath(&name, &template, output_dir)?;

if let Some(filepath) = filepath {
let writer = get_or_insert_writer(&mut writers, filepath)?;
writer.write(&record.fasta_record.seq_name, &record.fasta_record.seq, false)?;
}
let names = datasets
.iter()
.map(|dataset| get_all_prefix_names(&dataset.name))
.collect::<Result<Vec<Vec<String>>, Report>>()?
.into_iter()
.flatten()
.unique();

for name in names {
let filepath = get_filepath(&name, &template, output_dir)?;

if let Some(filepath) = filepath {
let writer = get_or_insert_writer(&mut writers, filepath)?;
writer.write(&record.fasta_record.seq_name, &record.fasta_record.seq, false)?;
}
}
}
Expand All @@ -224,6 +220,19 @@ fn writer_thread(
Ok(())
}

/// Builds every cumulative `/`-separated prefix of `name`, shortest first.
///
/// The name is split on `'/'` and the components are appended one by one to a
/// growing path; a snapshot of that path is recorded after each component.
/// For example, `a/b/c` produces the paths `a`, `a/b` and `a/b/c` (joined with
/// the platform's path separator). Snapshots that compare equal to one already
/// recorded are dropped, keeping the first occurrence.
///
/// # Errors
///
/// Each recorded path is converted with `path_to_string`; the first failing
/// conversion is returned as the error.
pub fn get_all_prefix_names(name: impl AsRef<str>) -> Result<Vec<String>, Report> {
  let mut current = PathBuf::new();
  let mut prefixes: Vec<PathBuf> = Vec::new();

  for component in name.as_ref().split('/') {
    // Grow the running path by one component and remember it if it is new.
    current.push(component);
    if !prefixes.contains(&current) {
      prefixes.push(current.clone());
    }
  }

  prefixes.into_iter().map(path_to_string).collect()
}

struct StatsPrinter {
enabled: bool,
stats: BTreeMap<String, usize>,
Expand Down

0 comments on commit 36bc43f

Please sign in to comment.