diff --git a/packages_rs/nextclade-cli/src/cli/nextclade_cli.rs b/packages_rs/nextclade-cli/src/cli/nextclade_cli.rs index d8007488e..d67635e4b 100644 --- a/packages_rs/nextclade-cli/src/cli/nextclade_cli.rs +++ b/packages_rs/nextclade-cli/src/cli/nextclade_cli.rs @@ -12,6 +12,7 @@ use itertools::Itertools; use lazy_static::lazy_static; use nextclade::io::fs::add_extension; use nextclade::run::params::NextcladeInputParamsOptional; +use nextclade::sort::params::NextcladeSeqSortParams; use nextclade::utils::global_init::setup_logger; use nextclade::{getenv, make_error}; use std::fmt::Debug; @@ -671,6 +672,12 @@ pub struct NextcladeSeqSortArgs { #[clap(hide_long_help = true, hide_short_help = true)] pub output_dir: Option, + #[clap(flatten, next_help_heading = " Algorithm")] + pub search_params: NextcladeSeqSortParams, + + #[clap(flatten, next_help_heading = " Other")] + pub other_params: NextcladeRunOtherParams, + /// Use custom dataset server. /// /// You can host your own dataset server, with one or more datasets, grouped into dataset collections, and use this server to provide datasets to users of Nextclade CLI and Nextclade Web. Refer to Nextclade dataset documentation for more details. @@ -681,10 +688,6 @@ pub struct NextcladeSeqSortArgs { #[clap(flatten)] pub proxy_config: ProxyConfig, - - /// Number of processing jobs. If not specified, all available CPU threads will be used. - #[clap(global = false, long, short = 'j', default_value_t = num_cpus::get())] - pub jobs: usize, } fn generate_completions(shell: &str) -> Result<(), Report> { diff --git a/packages_rs/nextclade-cli/src/cli/nextclade_seq_sort.rs b/packages_rs/nextclade-cli/src/cli/nextclade_seq_sort.rs index bb78f8f5b..eb7b1f4f5 100644 --- a/packages_rs/nextclade-cli/src/cli/nextclade_seq_sort.rs +++ b/packages_rs/nextclade-cli/src/cli/nextclade_seq_sort.rs @@ -1,4 +1,4 @@ -use crate::cli::nextclade_cli::NextcladeSeqSortArgs; +use crate::cli::nextclade_cli::{NextcladeRunOtherParams, NextcladeSeqSortArgs}; use crate::dataset::dataset_download::download_datasets_index_json; use crate::io::http_client::HttpClient; use eyre::{Report, WrapErr}; @@ -8,6 +8,7 @@ use nextclade::io::fasta::{FastaReader, FastaRecord}; use nextclade::make_error; use nextclade::sort::minimizer_index::{MinimizerIndexJson, MINIMIZER_INDEX_ALGO_VERSION}; use nextclade::sort::minimizer_search::{run_minimizer_search, MinimizerSearchResult}; +use nextclade::sort::params::NextcladeSeqSortParams; use nextclade::utils::string::truncate; #[derive(Debug, Clone)] @@ -65,7 +66,8 @@ pub fn run(args: &NextcladeSeqSortArgs, minimizer_index: &MinimizerIndexJson) -> let NextcladeSeqSortArgs { input_fastas, output_dir, - jobs, + search_params, + other_params: NextcladeRunOtherParams { jobs }, .. } = args; @@ -100,7 +102,7 @@ pub fn run(args: &NextcladeSeqSortArgs, minimizer_index: &MinimizerIndexJson) -> for fasta_record in &fasta_receiver { info!("Processing sequence '{}'", fasta_record.seq_name); - let result = run_minimizer_search(&fasta_record, minimizer_index) + let result = run_minimizer_search(&fasta_record, minimizer_index, search_params) .wrap_err_with(|| { format!( "When processing sequence #{} '{}'", diff --git a/packages_rs/nextclade/src/sort/minimizer_index.rs b/packages_rs/nextclade/src/sort/minimizer_index.rs index 144ff6262..291b88ce8 100644 --- a/packages_rs/nextclade/src/sort/minimizer_index.rs +++ b/packages_rs/nextclade/src/sort/minimizer_index.rs @@ -25,7 +25,7 @@ pub struct MinimizerIndexJson { pub version: String, - pub params: MinimizerParams, + pub params: MinimizerIndexParams, #[schemars(with = "BTreeMap")] #[serde(serialize_with = "serde_serialize_minimizers")] @@ -67,7 +67,7 @@ pub fn serde_deserialize_minimizers<'de, D: Deserializer<'de>>(deserializer: D) #[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)] #[serde(rename_all = "camelCase")] -pub struct MinimizerParams { +pub struct MinimizerIndexParams { pub k: i64, pub cutoff: i64, diff --git a/packages_rs/nextclade/src/sort/minimizer_search.rs b/packages_rs/nextclade/src/sort/minimizer_search.rs index ce13e45a1..112ca1a08 100644 --- a/packages_rs/nextclade/src/sort/minimizer_search.rs +++ b/packages_rs/nextclade/src/sort/minimizer_search.rs @@ -1,5 +1,6 @@ use crate::io::fasta::FastaRecord; -use crate::sort::minimizer_index::{MinimizerIndexJson, MinimizerParams}; +use crate::sort::minimizer_index::{MinimizerIndexJson, MinimizerIndexParams}; +use crate::sort::params::NextcladeSeqSortParams; use eyre::Report; use itertools::Itertools; use schemars::JsonSchema; @@ -20,6 +21,7 @@ pub struct MinimizerSearchResult { pub fn run_minimizer_search( fasta_record: &FastaRecord, index: &MinimizerIndexJson, + params: &NextcladeSeqSortParams, ) -> Result { let normalization = &index.normalization; let n_refs = index.references.len(); @@ -43,7 +45,7 @@ pub fn run_minimizer_search( // require at least 30% of the maximal hits and at least 10 hits let max_normalized_hit = normalized_hits.iter().copied().fold(0.0, f64::max); let total_hits: u64 = hit_counts.iter().sum(); - if max_normalized_hit < 0.3 || total_hits < 10 { + if max_normalized_hit < params.min_normalized_hit || total_hits < params.min_total_hits { Ok(MinimizerSearchResult { dataset: None, hit_counts, @@ -79,7 +81,7 @@ const fn invertible_hash(x: u64) -> u64 { x } -fn get_hash(kmer: &[u8], params: &MinimizerParams) -> u64 { +fn get_hash(kmer: &[u8], params: &MinimizerIndexParams) -> u64 { let cutoff = params.cutoff as u64; let mut x = 0; @@ -111,7 +113,7 @@ fn get_hash(kmer: &[u8], params: &MinimizerParams) -> u64 { invertible_hash(x) } -pub fn get_ref_search_minimizers(seq: &FastaRecord, params: &MinimizerParams) -> Vec { +pub fn get_ref_search_minimizers(seq: &FastaRecord, params: &MinimizerIndexParams) -> Vec { let k = params.k as usize; let cutoff = params.cutoff as u64; diff --git a/packages_rs/nextclade/src/sort/mod.rs b/packages_rs/nextclade/src/sort/mod.rs index 8c3a0809b..c8b4c34db 100644 --- a/packages_rs/nextclade/src/sort/mod.rs +++ b/packages_rs/nextclade/src/sort/mod.rs @@ -1,2 +1,3 @@ pub mod minimizer_index; pub mod minimizer_search; +pub mod params; diff --git a/packages_rs/nextclade/src/sort/params.rs b/packages_rs/nextclade/src/sort/params.rs new file mode 100644 index 000000000..999f9c428 --- /dev/null +++ b/packages_rs/nextclade/src/sort/params.rs @@ -0,0 +1,28 @@ +use clap::Parser; +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; + +#[allow(clippy::struct_excessive_bools)] +#[derive(Parser, Debug, Clone, Serialize, Deserialize, JsonSchema)] +#[serde(rename_all = "camelCase")] +pub struct NextcladeSeqSortParams { + /// Minimum value of the normalized index hit being considered for assignment + #[clap(long)] + #[clap(default_value_t = NextcladeSeqSortParams::default().min_normalized_hit)] + pub min_normalized_hit: f64, + + /// Minimum number of the index hits required for assignment + #[clap(long)] + #[clap(default_value_t = NextcladeSeqSortParams::default().min_total_hits)] + pub min_total_hits: u64, +} + +#[allow(clippy::derivable_impls)] +impl Default for NextcladeSeqSortParams { + fn default() -> Self { + Self { + min_normalized_hit: 0.3, + min_total_hits: 10, + } + } +}