Skip to content

Commit

Permalink
MRG: choose default scaled for multiple collections based on first collection (#488)
Browse files Browse the repository at this point in the history

* initial fix

* cleanup

* add tests

* add rocksdb index test

* set scaled from max_scaled in collection

* take ownership of selection; fix mistake

* remove clone

* remove debug print

* revert to released sourmash

* cargo fmt

* apply black
  • Loading branch information
ctb authored Nov 1, 2024
1 parent 31d3056 commit 0c92e38
Show file tree
Hide file tree
Showing 16 changed files with 246 additions and 57 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "sourmash_plugin_branchwater"
version = "0.9.8"
version = "0.9.9-dev"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
Expand Down
8 changes: 4 additions & 4 deletions src/fastgather.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,14 @@ pub fn fastgather(
against_filepath: String,
threshold_bp: usize,
scaled: usize,
selection: &Selection,
selection: Selection,
gather_output: Option<String>,
prefetch_output: Option<String>,
allow_failed_sigpaths: bool,
) -> Result<()> {
let query_collection = load_collection(
&query_filepath,
selection,
&selection,
ReportType::Query,
allow_failed_sigpaths,
)?;
Expand All @@ -40,7 +40,7 @@ pub fn fastgather(
let query_md5 = query_sig.md5sum();

// clone here is necessary b/c we use full query_sig in consume_query_by_gather
let query_sig_ds = query_sig.select(selection)?; // downsample
let query_sig_ds = query_sig.select(&selection)?; // downsample
let query_mh = match query_sig_ds.try_into() {
Ok(query_mh) => query_mh,
Err(_) => {
Expand All @@ -50,7 +50,7 @@ pub fn fastgather(
// load collection to match against.
let against_collection = load_collection(
&against_filepath,
selection,
&selection,
ReportType::Against,
allow_failed_sigpaths,
)?;
Expand Down
25 changes: 20 additions & 5 deletions src/fastmultigather.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@ pub fn fastmultigather(
query_filepath: String,
against_filepath: String,
threshold_bp: usize,
scaled: usize,
selection: &Selection,
scaled: Option<usize>,
selection: Selection,
allow_failed_sigpaths: bool,
save_matches: bool,
create_empty_results: bool,
Expand All @@ -40,11 +40,26 @@ pub fn fastmultigather(
// load query collection
let query_collection = load_collection(
&query_filepath,
selection,
&selection,
ReportType::Query,
allow_failed_sigpaths,
)?;

let scaled = match scaled {
Some(s) => s,
None => {
let scaled = query_collection.max_scaled().expect("no records!?").clone() as usize;
eprintln!(
"Setting scaled={} based on max scaled in query collection",
scaled
);
scaled
}
};

let mut against_selection = selection;
against_selection.set_scaled(scaled as u32);

let threshold_hashes: u64 = {
let x = threshold_bp / scaled;
if x > 0 {
Expand All @@ -60,12 +75,12 @@ pub fn fastmultigather(
// load against collection
let against_collection = load_collection(
&against_filepath,
selection,
&against_selection,
ReportType::Against,
allow_failed_sigpaths,
)?;
// load against sketches into memory, downsampling on the way
let against = against_collection.load_sketches(selection)?;
let against = against_collection.load_sketches(&against_selection)?;

// Iterate over all queries => do prefetch and gather!
let processed_queries = AtomicUsize::new(0);
Expand Down
8 changes: 4 additions & 4 deletions src/index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ use sourmash::collection::{Collection, CollectionSet};

pub fn index<P: AsRef<Path>>(
siglist: String,
selection: &Selection,
selection: Selection,
output: P,
colors: bool,
allow_failed_sigpaths: bool,
Expand All @@ -18,7 +18,7 @@ pub fn index<P: AsRef<Path>>(

let multi = match load_collection(
&siglist,
selection,
&selection,
ReportType::General,
allow_failed_sigpaths,
) {
Expand All @@ -31,15 +31,15 @@ pub fn index<P: AsRef<Path>>(
let collection = match Collection::try_from(multi.clone()) {
// conversion worked!
Ok(c) => {
let cs: CollectionSet = c.select(selection)?.try_into()?;
let cs: CollectionSet = c.select(&selection)?.try_into()?;
Ok(cs)
}
// conversion failed; can we/should we load it into memory?
Err(_) => {
if use_internal_storage {
eprintln!("WARNING: loading all sketches into memory in order to index.");
eprintln!("See 'index' documentation for details.");
let c: Collection = multi.load_all_sigs(selection)?;
let c: Collection = multi.load_all_sigs(&selection)?;
let cs: CollectionSet = c.try_into()?;
Ok(cs)
} else {
Expand Down
28 changes: 14 additions & 14 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ fn do_manysearch(
ignore_abundance: Option<bool>,
) -> anyhow::Result<u8> {
let againstfile_path: PathBuf = siglist_path.clone().into();
let selection = build_selection(ksize, scaled, &moltype);
let selection = build_selection(ksize, Some(scaled), &moltype);
eprintln!("selection scaled: {:?}", selection.scaled());
let allow_failed_sigpaths = true;

Expand All @@ -51,7 +51,7 @@ fn do_manysearch(
match mastiff_manysearch::mastiff_manysearch(
querylist_path,
againstfile_path,
&selection,
selection,
threshold,
output_path,
allow_failed_sigpaths,
Expand All @@ -66,7 +66,7 @@ fn do_manysearch(
match manysearch::manysearch(
querylist_path,
siglist_path,
&selection,
selection,
threshold,
output_path,
allow_failed_sigpaths,
Expand Down Expand Up @@ -94,15 +94,15 @@ fn do_fastgather(
output_path_prefetch: Option<String>,
output_path_gather: Option<String>,
) -> anyhow::Result<u8> {
let selection = build_selection(ksize, scaled, &moltype);
let selection = build_selection(ksize, Some(scaled), &moltype);
let allow_failed_sigpaths = true;

match fastgather::fastgather(
query_filename,
siglist_path,
threshold_bp,
scaled,
&selection,
selection,
output_path_prefetch,
output_path_gather,
allow_failed_sigpaths,
Expand All @@ -123,7 +123,7 @@ fn do_fastmultigather(
siglist_path: String,
threshold_bp: usize,
ksize: u8,
scaled: usize,
scaled: Option<usize>,
moltype: String,
output_path: Option<String>,
save_matches: bool,
Expand All @@ -138,7 +138,7 @@ fn do_fastmultigather(
match mastiff_manygather::mastiff_manygather(
query_filenames,
againstfile_path,
&selection,
selection.clone(),
threshold_bp,
output_path,
allow_failed_sigpaths,
Expand All @@ -158,7 +158,7 @@ fn do_fastmultigather(
siglist_path,
threshold_bp,
scaled,
&selection,
selection,
allow_failed_sigpaths,
save_matches,
create_empty_results,
Expand Down Expand Up @@ -199,11 +199,11 @@ fn do_index(
colors: bool,
use_internal_storage: bool,
) -> anyhow::Result<u8> {
let selection = build_selection(ksize, scaled, &moltype);
let selection = build_selection(ksize, Some(scaled), &moltype);
let allow_failed_sigpaths = false;
match index::index(
siglist,
&selection,
selection,
output,
colors,
allow_failed_sigpaths,
Expand Down Expand Up @@ -237,7 +237,7 @@ fn do_multisearch(
siglist_path: String,
threshold: f64,
ksize: u8,
scaled: usize,
scaled: Option<usize>,
moltype: String,
estimate_ani: bool,
output_path: Option<String>,
Expand All @@ -251,7 +251,7 @@ fn do_multisearch(
querylist_path,
siglist_path,
threshold,
&selection,
selection,
allow_failed_sigpaths,
estimate_ani,
output_path,
Expand All @@ -277,12 +277,12 @@ fn do_pairwise(
write_all: bool,
output_path: Option<String>,
) -> anyhow::Result<u8> {
let selection = build_selection(ksize, scaled, &moltype);
let selection = build_selection(ksize, Some(scaled), &moltype);
let allow_failed_sigpaths = true;
match pairwise::pairwise(
siglist_path,
threshold,
&selection,
selection,
allow_failed_sigpaths,
estimate_ani,
write_all,
Expand Down
8 changes: 4 additions & 4 deletions src/manysearch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ use sourmash::sketch::minhash::KmerMinHash;
pub fn manysearch(
query_filepath: String,
against_filepath: String,
selection: &Selection,
selection: Selection,
threshold: f64,
output: Option<String>,
allow_failed_sigpaths: bool,
Expand All @@ -28,18 +28,18 @@ pub fn manysearch(
// Load query collection
let query_collection = load_collection(
&query_filepath,
selection,
&selection,
ReportType::Query,
allow_failed_sigpaths,
)?;

// load all query sketches into memory, downsampling on the way
let query_sketchlist = query_collection.load_sketches(selection)?;
let query_sketchlist = query_collection.load_sketches(&selection)?;

// Against: Load collection, potentially off disk & not into memory.
let against_collection = load_collection(
&against_filepath,
selection,
&selection,
ReportType::Against,
allow_failed_sigpaths,
)?;
Expand Down
4 changes: 2 additions & 2 deletions src/mastiff_manygather.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ use crate::utils::{
pub fn mastiff_manygather(
queries_file: String,
index: PathBuf,
selection: &Selection,
selection: Selection,
threshold_bp: usize,
output: Option<String>,
allow_failed_sigpaths: bool,
Expand All @@ -29,7 +29,7 @@ pub fn mastiff_manygather(

let query_collection = load_collection(
&queries_file,
selection,
&selection,
ReportType::Query,
allow_failed_sigpaths,
)?;
Expand Down
4 changes: 2 additions & 2 deletions src/mastiff_manysearch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ use crate::utils::{
pub fn mastiff_manysearch(
queries_path: String,
index: PathBuf,
selection: &Selection,
selection: Selection,
minimum_containment: f64,
output: Option<String>,
allow_failed_sigpaths: bool,
Expand All @@ -35,7 +35,7 @@ pub fn mastiff_manysearch(
// Load query paths
let query_collection = load_collection(
&queries_path,
selection,
&selection,
ReportType::Query,
allow_failed_sigpaths,
)?;
Expand Down
Loading

0 comments on commit 0c92e38

Please sign in to comment.