Skip to content

Commit

Permalink
feat(cli): add cli params for minimizer search algo
Browse files Browse the repository at this point in the history
  • Loading branch information
ivan-aksamentov committed Sep 1, 2023
1 parent d327c77 commit 8940bbb
Show file tree
Hide file tree
Showing 6 changed files with 49 additions and 13 deletions.
11 changes: 7 additions & 4 deletions packages_rs/nextclade-cli/src/cli/nextclade_cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ use itertools::Itertools;
use lazy_static::lazy_static;
use nextclade::io::fs::add_extension;
use nextclade::run::params::NextcladeInputParamsOptional;
use nextclade::sort::params::NextcladeSeqSortParams;
use nextclade::utils::global_init::setup_logger;
use nextclade::{getenv, make_error};
use std::fmt::Debug;
Expand Down Expand Up @@ -671,6 +672,12 @@ pub struct NextcladeSeqSortArgs {
#[clap(hide_long_help = true, hide_short_help = true)]
pub output_dir: Option<PathBuf>,

#[clap(flatten, next_help_heading = " Algorithm")]
pub search_params: NextcladeSeqSortParams,

#[clap(flatten, next_help_heading = " Other")]
pub other_params: NextcladeRunOtherParams,

/// Use custom dataset server.
///
/// You can host your own dataset server, with one or more datasets, grouped into dataset collections, and use this server to provide datasets to users of Nextclade CLI and Nextclade Web. Refer to Nextclade dataset documentation for more details.
Expand All @@ -681,10 +688,6 @@ pub struct NextcladeSeqSortArgs {

#[clap(flatten)]
pub proxy_config: ProxyConfig,

/// Number of processing jobs. If not specified, all available CPU threads will be used.
#[clap(global = false, long, short = 'j', default_value_t = num_cpus::get())]
pub jobs: usize,
}

fn generate_completions(shell: &str) -> Result<(), Report> {
Expand Down
8 changes: 5 additions & 3 deletions packages_rs/nextclade-cli/src/cli/nextclade_seq_sort.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use crate::cli::nextclade_cli::NextcladeSeqSortArgs;
use crate::cli::nextclade_cli::{NextcladeRunOtherParams, NextcladeSeqSortArgs};
use crate::dataset::dataset_download::download_datasets_index_json;
use crate::io::http_client::HttpClient;
use eyre::{Report, WrapErr};
Expand All @@ -8,6 +8,7 @@ use nextclade::io::fasta::{FastaReader, FastaRecord};
use nextclade::make_error;
use nextclade::sort::minimizer_index::{MinimizerIndexJson, MINIMIZER_INDEX_ALGO_VERSION};
use nextclade::sort::minimizer_search::{run_minimizer_search, MinimizerSearchResult};
use nextclade::sort::params::NextcladeSeqSortParams;
use nextclade::utils::string::truncate;

#[derive(Debug, Clone)]
Expand Down Expand Up @@ -65,7 +66,8 @@ pub fn run(args: &NextcladeSeqSortArgs, minimizer_index: &MinimizerIndexJson) ->
let NextcladeSeqSortArgs {
input_fastas,
output_dir,
jobs,
search_params,
other_params: NextcladeRunOtherParams { jobs },
..
} = args;

Expand Down Expand Up @@ -100,7 +102,7 @@ pub fn run(args: &NextcladeSeqSortArgs, minimizer_index: &MinimizerIndexJson) ->
for fasta_record in &fasta_receiver {
info!("Processing sequence '{}'", fasta_record.seq_name);

let result = run_minimizer_search(&fasta_record, minimizer_index)
let result = run_minimizer_search(&fasta_record, minimizer_index, search_params)
.wrap_err_with(|| {
format!(
"When processing sequence #{} '{}'",
Expand Down
4 changes: 2 additions & 2 deletions packages_rs/nextclade/src/sort/minimizer_index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ pub struct MinimizerIndexJson {

pub version: String,

pub params: MinimizerParams,
pub params: MinimizerIndexParams,

#[schemars(with = "BTreeMap<String, String>")]
#[serde(serialize_with = "serde_serialize_minimizers")]
Expand Down Expand Up @@ -67,7 +67,7 @@ pub fn serde_deserialize_minimizers<'de, D: Deserializer<'de>>(deserializer: D)

#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
#[serde(rename_all = "camelCase")]
pub struct MinimizerParams {
pub struct MinimizerIndexParams {
pub k: i64,

pub cutoff: i64,
Expand Down
10 changes: 6 additions & 4 deletions packages_rs/nextclade/src/sort/minimizer_search.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
use crate::io::fasta::FastaRecord;
use crate::sort::minimizer_index::{MinimizerIndexJson, MinimizerParams};
use crate::sort::minimizer_index::{MinimizerIndexJson, MinimizerIndexParams};
use crate::sort::params::NextcladeSeqSortParams;
use eyre::Report;
use itertools::Itertools;
use schemars::JsonSchema;
Expand All @@ -20,6 +21,7 @@ pub struct MinimizerSearchResult {
pub fn run_minimizer_search(
fasta_record: &FastaRecord,
index: &MinimizerIndexJson,
params: &NextcladeSeqSortParams,
) -> Result<MinimizerSearchResult, Report> {
let normalization = &index.normalization;
let n_refs = index.references.len();
Expand All @@ -43,7 +45,7 @@ pub fn run_minimizer_search(
// require the best normalized hit and the total number of hits to reach the configured minimums (defaults: 0.3 and 10 hits)
let max_normalized_hit = normalized_hits.iter().copied().fold(0.0, f64::max);
let total_hits: u64 = hit_counts.iter().sum();
if max_normalized_hit < 0.3 || total_hits < 10 {
if max_normalized_hit < params.min_normalized_hit || total_hits < params.min_total_hits {
Ok(MinimizerSearchResult {
dataset: None,
hit_counts,
Expand Down Expand Up @@ -79,7 +81,7 @@ const fn invertible_hash(x: u64) -> u64 {
x
}

fn get_hash(kmer: &[u8], params: &MinimizerParams) -> u64 {
fn get_hash(kmer: &[u8], params: &MinimizerIndexParams) -> u64 {
let cutoff = params.cutoff as u64;

let mut x = 0;
Expand Down Expand Up @@ -111,7 +113,7 @@ fn get_hash(kmer: &[u8], params: &MinimizerParams) -> u64 {
invertible_hash(x)
}

pub fn get_ref_search_minimizers(seq: &FastaRecord, params: &MinimizerParams) -> Vec<u64> {
pub fn get_ref_search_minimizers(seq: &FastaRecord, params: &MinimizerIndexParams) -> Vec<u64> {
let k = params.k as usize;
let cutoff = params.cutoff as u64;

Expand Down
1 change: 1 addition & 0 deletions packages_rs/nextclade/src/sort/mod.rs
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
pub mod minimizer_index;
pub mod minimizer_search;
pub mod params;
28 changes: 28 additions & 0 deletions packages_rs/nextclade/src/sort/params.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
use clap::Parser;
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};

// Tunable thresholds of the minimizer search used for sequence sorting.
//
// The same struct serves two purposes:
//  - it is a group of command-line flags (`clap`), flattened into the CLI arguments of the
//    sequence sorting command;
//  - it is a serializable parameter set (`serde`/`schemars`), using camelCase keys.
//
// NOTE: explanatory notes here are plain `//` comments on purpose. `///` doc comments on
// this struct and its fields are consumed by the derive macros (CLI help text, JSON schema
// descriptions), so rewording them would change user-visible output.
#[derive(Parser, Debug, Clone, Serialize, Deserialize, JsonSchema)]
#[serde(rename_all = "camelCase")]
pub struct NextcladeSeqSortParams {
  /// Minimum value of the normalized index hit being considered for assignment
  // `run_minimizer_search` assigns no dataset when the largest normalized hit is below this
  // value. The CLI default is taken from the `Default` impl so both stay in sync.
  #[clap(long, default_value_t = NextcladeSeqSortParams::default().min_normalized_hit)]
  pub min_normalized_hit: f64,

  /// Minimum number of the index hits required for assignment
  // `run_minimizer_search` assigns no dataset when the sum of all hit counts is below this
  // value. The CLI default is taken from the `Default` impl so both stay in sync.
  #[clap(long, default_value_t = NextcladeSeqSortParams::default().min_total_hits)]
  pub min_total_hits: u64,
}

// Hand-written on purpose: a derived `Default` would produce `0.0` and `0`, which are not
// the intended thresholds. The `default_value_t` attributes on the struct fields read their
// CLI defaults from this impl, so this is the single source of truth for default values.
impl Default for NextcladeSeqSortParams {
  /// Returns the default search thresholds: a minimum normalized hit of `0.3` and a minimum
  /// of `10` total hits.
  fn default() -> Self {
    Self {
      // Same values that the minimizer search used as hard-coded constants before these
      // thresholds became configurable.
      min_normalized_hit: 0.3,
      min_total_hits: 10,
    }
  }
}

0 comments on commit 8940bbb

Please sign in to comment.