diff --git a/Cargo.lock b/Cargo.lock
index e1e983e0..40766010 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -808,10 +808,11 @@ dependencies = [
 
 [[package]]
 name = "js-sys"
-version = "0.3.72"
+version = "0.3.76"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6a88f1bda2bd75b0452a14784937d796722fdebfe50df998aeb3f0b7603019a9"
+checksum = "6717b6b5b077764fb5966237269cb3c64edddde4b14ce42647430a78ced9e7b7"
 dependencies = [
+ "once_cell",
  "wasm-bindgen",
 ]
 
@@ -1772,7 +1773,7 @@ checksum = "bceb57dc07c92cdae60f5b27b3fa92ecaaa42fe36c55e22dbfb0b44893e0b1f7"
 [[package]]
 name = "sourmash"
 version = "0.18.0"
-source = "git+https://github.com/sourmash-bio/sourmash/?branch=latest#150967b60d712b62cc70060a5d47e744887ce4dc"
+source = "git+https://github.com/sourmash-bio/sourmash.git?branch=latest#f4f5187e7dc9b9c177e099bbf7f3f42556867328"
 dependencies = [
  "az",
  "byteorder",
@@ -2115,9 +2116,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
 
 [[package]]
 name = "wasm-bindgen"
-version = "0.2.95"
+version = "0.2.99"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "128d1e363af62632b8eb57219c8fd7877144af57558fb2ef0368d0087bddeb2e"
+checksum = "a474f6281d1d70c17ae7aa6a613c87fce69a127e2624002df63dcb39d6cf6396"
 dependencies = [
  "cfg-if",
  "once_cell",
@@ -2126,13 +2127,12 @@ dependencies = [
 
 [[package]]
 name = "wasm-bindgen-backend"
-version = "0.2.95"
+version = "0.2.99"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cb6dd4d3ca0ddffd1dd1c9c04f94b868c37ff5fac97c30b97cff2d74fce3a358"
+checksum = "5f89bb38646b4f81674e8f5c3fb81b562be1fd936d84320f3264486418519c79"
 dependencies = [
  "bumpalo",
  "log",
- "once_cell",
  "proc-macro2",
  "quote",
  "syn 2.0.87",
@@ -2141,9 +2141,9 @@ dependencies = [
 [[package]]
 name = "wasm-bindgen-macro"
-version = "0.2.95"
+version = "0.2.99"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e79384be7f8f5a9dd5d7167216f022090cf1f9ec128e6e6a482a2cb5c5422c56"
+checksum = "2cc6181fd9a7492eef6fef1f33961e3695e4579b9872a6f7c83aee556666d4fe"
 dependencies = [
  "quote",
  "wasm-bindgen-macro-support",
 ]
@@ -2151,9 +2151,9 @@ dependencies = [
 
 [[package]]
 name = "wasm-bindgen-macro-support"
-version = "0.2.95"
+version = "0.2.99"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "26c6ab57572f7a24a4985830b120de1594465e5d500f24afe89e16b4e833ef68"
+checksum = "30d7a95b763d3c45903ed6c81f156801839e5ee968bb07e534c44df0fcd330c2"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -2164,15 +2164,15 @@ dependencies = [
 
 [[package]]
 name = "wasm-bindgen-shared"
-version = "0.2.95"
+version = "0.2.99"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "65fc09f10666a9f147042251e0dda9c18f166ff7de300607007e96bdebc1068d"
+checksum = "943aab3fdaaa029a6e0271b35ea10b72b943135afe9bffca82384098ad0e06a6"
 
 [[package]]
 name = "web-sys"
-version = "0.3.72"
+version = "0.3.76"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f6488b90108c040df0fe62fa815cbdee25124641df01814dd7282749234c6112"
+checksum = "04dd7223427d52553d3702c004d3b2fe07c148165faa56313cb00211e31c12bc"
 dependencies = [
  "js-sys",
  "wasm-bindgen",
diff --git a/Cargo.toml b/Cargo.toml
index 284824d6..6716dca2 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -9,18 +9,18 @@ name = "sourmash_plugin_branchwater"
 crate-type = ["cdylib"]
 
 [dependencies]
-pyo3 = { version = "0.23.2", features = ["extension-module", "anyhow"] }
+pyo3 = { version = "0.23.3", features = ["extension-module", "anyhow"] }
 rayon = "1.10.0"
 serde = { version = "1.0.216", features = ["derive"] }
 #sourmash = { version = "0.17.2", features = ["branchwater"] }
-sourmash = {git = "https://github.com/sourmash-bio/sourmash/", branch = "latest", features = ["branchwater"]}
+sourmash = { git = "https://github.com/sourmash-bio/sourmash.git", branch = "latest", features = ["branchwater"] }
 serde_json = "1.0.133"
 niffler = "2.4.0"
 log = "0.4.22"
 env_logger = { version = "0.11.5" }
 simple-error = "0.3.1"
 anyhow = "1.0.94"
-zip = { version = "=2.0", default-features = false }
+zip = { version = "2.0", default-features = false }
 tempfile = "3.14"
 needletail = "0.5.1"
 csv = "1.3.1"
diff --git a/doc/README.md b/doc/README.md
index 6da3b286..744348fb 100644
--- a/doc/README.md
+++ b/doc/README.md
@@ -29,13 +29,11 @@ be processed differently.
 The plugin commands are also a bit less user friendly, because (for
 now) we're more focused on speed than polish and user experience.
 
-**Note:** As of v0.9.5, the outputs of `fastgather` and `fastmultigather` almost completely match the output of `sourmash gather`; see below for details.
-
 ## Input file formats
 
 sourmash supports a variety of different storage formats for sketches (see [sourmash docs](https://sourmash.readthedocs.io/en/latest/command-line.html#choosing-signature-output-formats)), and the branchwater plugin works with some (but not all) of them. Branchwater _also_ supports an additional database type, a RocksDB-based inverted index, that is not (yet) supported natively by sourmash (through v4.8.11).
 
-**As of v0.9.8, we recommend using zip files or standalone manifest CSVs pointing to zip files whenever you need to provide multiple sketches.**
+**We recommend using zip files or standalone manifest CSVs pointing to zip files whenever you need to provide multiple sketches.**
 
 | command | command input | database format |
 | -------- | -------- | -------- |
@@ -58,8 +56,8 @@ When working with large collections of small sketches such as genomes, we sugges
 * in particular, _single_ sketches can be loaded on demand, supporting lower memory requirements for certain kinds of searches.
 
 For all these reasons, zip files are the most efficient and effective
-basic storage type for sketches in sourmash, and as of the branchwater
-plugin v0.9.0, they are fully supported!
+basic storage type for sketches in sourmash, and the branchwater
+plugin fully supports them!
 
 You can create zipfiles with sourmash like so:
 ```
@@ -152,7 +150,7 @@ at the start in order to generate a manifest.
 To avoid memory issues, the signatures are not kept in memory, but
 instead re-loaded as described below for each command (see: Notes on
 concurrency and efficiency). This makes using pathlists less efficient than `zip`
-files (as of v0.9.0) or manifests (as of v0.9.8).
+files.
 
 ## Running the commands
 
@@ -304,7 +302,7 @@ version of `sourmash gather`.
 sourmash scripts fastgather query.sig.gz database.zip -o results.csv --cores 4
 ```
 
-As of v0.9.5, `fastgather` outputs the same columns as `sourmash gather`, with only a few exception
+`fastgather` outputs the same columns as `sourmash gather`, with only a few exceptions:
 * `match_name` is output instead of `name`;
 * `match_md5` is output instead of `md5`;
 * `match_filename` is output instead of `filename`, and the value is different;
@@ -392,6 +390,11 @@ To report _any_ overlap between two sketches, set the threshold to 0.
 (This will produce many, many results when searching a collection
 of metagenomes!)
 
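+If you do search with a threshold of 0, one option is to filter the
+(potentially very large) output CSV afterwards. For example, a minimal
+sketch, assuming `pandas` is installed and the results were written to
+`manysearch.csv`:
+
+```
+import pandas as pd
+
+# load manysearch results and keep only rows with >= 10% containment
+df = pd.read_csv("manysearch.csv")
+strong = df[df["containment"] >= 0.10]
+strong.to_csv("manysearch.filtered.csv", index=False)
+```
+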
+Using `-A/--output-all-comparisons` will ignore the threshold parameter
+and output all comparisons done. Against a RocksDB database, only matches
+with some overlap will be reported; with collections of sketches, all
+pairs will be reported.
+
 By default, `manysearch` will display the contents of the CSV file in
 a human-readable format. This can be disabled with
 `-N/--no-pretty-print` when executing large searches.
@@ -452,14 +455,14 @@ pathlist format, and specify the desired output
 directory; we suggest using the `.rocksdb` extension for RocksDB
 databases, e.g. `-o gtdb-rs214-k31.rocksdb`.
 
-By default, as of v0.9.7, `index` will store a copy of the sketches
+By default, `index` will store a copy of the sketches
 along with the inverted index. This will substantially increase the
 disk space required for large databases. You can provide an optional
 `--no-internal-storage` to `index` to store them externally, which
 reduces the disk space needed for the index. Read below for technical
 details!
 
-As of v0.9.8, `index` can take any of the supported input types, but
+`index` can take any of the supported input types, but
 unless you are using a zip file or a pathlist of JSON files, it may
 need to load all the sketches into memory before indexing them.
 Moreover, you can only use external storage with a zip file. We
@@ -470,9 +473,6 @@ the sketches are being loaded into memory.
 
 #### Internal vs external storage of sketches in a RocksDB index
 
-(The below applies to v0.9.7 and later of the plugin; for v0.9.6 and
-before, only external storage was implemented.)
-
 RocksDB indexes support containment queries (a la the
 [branchwater application](https://github.com/sourmash-bio/branchwater)),
 as well as `gather`-style mixture decomposition (see
@@ -489,7 +489,7 @@ the original source sketches used to construct the database, wherever
 they reside on your disk.
 
 The sketches *are not used* by `manysearch`, but *are used* by
-`fastmultigather`: with v0.9.6 and later, you'll get an error if you
+`fastmultigather`: you'll get an error if you
 run `fastmultigather` against a RocksDB index where the sketches
 cannot be loaded.
 
@@ -521,6 +521,21 @@ in downstream software packages (this plugin, and
 [the branchwater application code](https://github.com/sourmash-bio/branchwater)).
 The above documentation applies to sourmash core v0.15.0.
 
+## Notes on versioning and semantic versioning guarantees
+
+Unlike sourmash,
+[which provides guarantees that command-line options and outputs will not change within minor versions](https://sourmash.readthedocs.io/en/latest/support.html#versioning-and-stability-of-features-and-apis),
+we make no guarantees of stability within the branchwater plugin. This
+is because the branchwater plugin is intended to move fast and
+occasionally break things.
+
+Eventually we expect to provide all of the branchwater plugin's functionality within the sourmash package, at which time the sourmash guarantees will apply!
+
+However, we do not expect command line options and output file formats
+to change quickly.
+
+We will also endeavor to avoid changing column names in CSV output, although we may change the _order_ of column names on occasion. Please use the column headers (column names) to select specific columns.
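+
+For example, a minimal sketch of selecting columns by name rather than
+by position, assuming `pandas` and a results file named `results.csv`:
+
+```
+import pandas as pd
+
+# select columns by header name, so a change in column order doesn't matter
+df = pd.read_csv("results.csv")
+print(df[["query_name", "match_name", "containment", "jaccard"]].head())
+```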
+ ## Notes on concurrency and efficiency Each command does things somewhat differently, with implications for diff --git a/src/lib.rs b/src/lib.rs index afa5b857..8226a633 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -28,7 +28,7 @@ mod singlesketch; use camino::Utf8PathBuf as PathBuf; #[pyfunction] -#[pyo3(signature = (querylist_path, siglist_path, threshold, ksize, scaled, moltype, output_path=None, ignore_abundance=false))] +#[pyo3(signature = (querylist_path, siglist_path, threshold, ksize, scaled, moltype, output_path=None, ignore_abundance=false, output_all_comparisons=false))] #[allow(clippy::too_many_arguments)] fn do_manysearch( querylist_path: String, @@ -39,6 +39,7 @@ fn do_manysearch( moltype: String, output_path: Option, ignore_abundance: Option, + output_all_comparisons: Option, ) -> anyhow::Result { let againstfile_path: PathBuf = siglist_path.clone().into(); let selection = build_selection(ksize, scaled, &moltype); @@ -46,6 +47,7 @@ fn do_manysearch( let allow_failed_sigpaths = true; let ignore_abundance = ignore_abundance.unwrap_or(false); + let output_all_comparisons = output_all_comparisons.unwrap_or(false); // if siglist_path is revindex, run rocksdb manysearch; otherwise run manysearch if is_revindex_database(&againstfile_path) { @@ -57,6 +59,7 @@ fn do_manysearch( threshold, output_path, allow_failed_sigpaths, + output_all_comparisons, ) { Ok(_) => Ok(0), Err(e) => { @@ -73,6 +76,7 @@ fn do_manysearch( output_path, allow_failed_sigpaths, ignore_abundance, + output_all_comparisons, ) { Ok(_) => Ok(0), Err(e) => { @@ -232,7 +236,7 @@ fn do_check(index: String, quick: bool) -> anyhow::Result { } #[pyfunction] -#[pyo3(signature = (querylist_path, siglist_path, threshold, ksize, scaled, moltype, estimate_ani, estimate_prob_overlap, output_path=None))] +#[pyo3(signature = (querylist_path, siglist_path, threshold, ksize, scaled, moltype, estimate_ani, estimate_prob_overlap, output_all_comparisons, output_path=None))] #[allow(clippy::too_many_arguments)] fn do_multisearch( querylist_path: String, @@ -243,6 +247,7 @@ fn do_multisearch( moltype: String, estimate_ani: bool, estimate_prob_overlap: bool, + output_all_comparisons: bool, output_path: Option, ) -> anyhow::Result { let _ = env_logger::try_init(); @@ -258,6 +263,7 @@ fn do_multisearch( allow_failed_sigpaths, estimate_ani, estimate_prob_overlap, + output_all_comparisons, output_path, ) { Ok(_) => Ok(0), @@ -270,7 +276,7 @@ fn do_multisearch( #[pyfunction] #[allow(clippy::too_many_arguments)] -#[pyo3(signature = (siglist_path, threshold, ksize, scaled, moltype, estimate_ani, write_all, output_path=None))] +#[pyo3(signature = (siglist_path, threshold, ksize, scaled, moltype, estimate_ani, write_all, output_all_comparisons, output_path=None))] fn do_pairwise( siglist_path: String, threshold: f64, @@ -279,6 +285,7 @@ fn do_pairwise( moltype: String, estimate_ani: bool, write_all: bool, + output_all_comparisons: bool, output_path: Option, ) -> anyhow::Result { let selection = build_selection(ksize, scaled, &moltype); @@ -290,6 +297,7 @@ fn do_pairwise( allow_failed_sigpaths, estimate_ani, write_all, + output_all_comparisons, output_path, ) { Ok(_) => Ok(0), diff --git a/src/manysearch.rs b/src/manysearch.rs index 968f1548..d5d629ca 100644 --- a/src/manysearch.rs +++ b/src/manysearch.rs @@ -9,7 +9,9 @@ use stats::{median, stddev}; use std::sync::atomic; use std::sync::atomic::AtomicUsize; -use crate::utils::{csvwriter_thread, load_collection, ReportType, SearchResult}; +use crate::utils::{ + csvwriter_thread, load_collection, 
ManySearchResult, ReportType, SmallSignature, +}; use sourmash::ani_utils::ani_from_containment; use sourmash::errors::SourmashError; use sourmash::selection::Selection; @@ -25,6 +27,7 @@ pub fn manysearch( output: Option, allow_failed_sigpaths: bool, ignore_abundance: bool, + output_all_comparisons: bool, ) -> Result<()> { // Load query collection let query_collection = load_collection( @@ -61,7 +64,8 @@ pub fn manysearch( )?; // set up a multi-producer, single-consumer channel. - let (send, recv) = std::sync::mpsc::sync_channel::(rayon::current_num_threads()); + let (send, recv) = + std::sync::mpsc::sync_channel::(rayon::current_num_threads()); // & spawn a thread that is dedicated to printing to a buffered output let thrd = csvwriter_thread(recv, output); @@ -95,80 +99,18 @@ pub fn manysearch( >::try_into(against_sig) { for query in query_sketchlist.iter() { - // be paranoid and confirm scaled match. - if query.minhash.scaled() != common_scaled { - panic!("different query scaled"); - } - if against_mh.scaled() != common_scaled { - panic!("different against scaled"); - } - - let overlap = query - .minhash - .count_common(&against_mh, false) - .expect("incompatible sketches") - as f64; - - let query_size = query.minhash.size() as f64; - let containment_query_in_target = overlap / query_size; - // only calculate results if we have shared hashes - if containment_query_in_target > threshold { - let target_size = against_mh.size() as f64; - let containment_target_in_query = overlap / target_size; - - let max_containment = - containment_query_in_target.max(containment_target_in_query); - let jaccard = overlap / (target_size + query_size - overlap); - - let qani = ani_from_containment( - containment_query_in_target, - against_mh.ksize() as f64, - ); - let mani = ani_from_containment( - containment_target_in_query, - against_mh.ksize() as f64, - ); - let query_containment_ani = Some(qani); - let match_containment_ani = Some(mani); - let average_containment_ani = Some((qani + mani) / 2.); - let max_containment_ani = Some(f64::max(qani, mani)); - - let calc_abund_stats = - against_mh.track_abundance() && !ignore_abundance; - let ( - total_weighted_hashes, - n_weighted_found, - average_abund, - median_abund, - std_abund, - ) = if calc_abund_stats { - inflate_abundances(&query.minhash, &against_mh).ok()? - } else { - (None, None, None, None, None) - }; - - results.push(SearchResult { - query_name: query.name.clone(), - query_md5: query.md5sum.clone(), - match_name: against_name.clone(), - containment: containment_query_in_target, - intersect_hashes: overlap as u64, - ksize: query.minhash.ksize() as u16, - scaled: query.minhash.scaled(), - moltype: query.minhash.hash_function().to_string(), - match_md5: Some(against_md5.clone()), - jaccard: Some(jaccard), - max_containment: Some(max_containment), - average_abund, - median_abund, - std_abund, - query_containment_ani, - match_containment_ani, - average_containment_ani, - max_containment_ani, - n_weighted_found, - total_weighted_hashes, - }); + let sr = calculate_manysearch_result( + query, + &against_mh, + &against_name, + &against_md5, + threshold, + common_scaled, + ignore_abundance, + output_all_comparisons, + ); + if let Some(sr) = sr { + results.push(sr); } } } else { @@ -226,6 +168,9 @@ pub fn manysearch( Ok(()) } +// inflate_abundances: "borrow" the abundances from 'against' onto the +// intersection with 'query'. 
+ fn inflate_abundances( query: &KmerMinHash, against: &KmerMinHash, @@ -241,10 +186,9 @@ fn inflate_abundances( > { let abunds: Vec; let sum_weighted: u64; - let sum_all_abunds: u64; + let sum_all_abunds: u64 = against.sum_abunds(); (abunds, sum_weighted) = query.inflated_abundances(against)?; - sum_all_abunds = against.sum_abunds(); let average_abund = sum_weighted as f64 / abunds.len() as f64; let median_abund = median(abunds.iter().cloned()).expect("error"); @@ -258,3 +202,81 @@ fn inflate_abundances( Some(std_abund), )) } + +// calculate_manysearch_result: calculate all the things + +fn calculate_manysearch_result( + query: &SmallSignature, + against_mh: &KmerMinHash, + against_name: &str, + against_md5: &str, + threshold: f64, + common_scaled: u32, + ignore_abundance: bool, + output_all_comparisons: bool, +) -> Option { + // be paranoid and confirm scaled match. + if query.minhash.scaled() != common_scaled { + panic!("different query scaled"); + } + if against_mh.scaled() != common_scaled { + panic!("different against scaled"); + } + + let overlap = query + .minhash + .count_common(against_mh, false) + .expect("incompatible sketches") as f64; + + let query_size = query.minhash.size() as f64; + let containment_query_in_target = overlap / query_size; + + // only calculate results if we have shared hashes + if containment_query_in_target > threshold || output_all_comparisons { + let target_size = against_mh.size() as f64; + let containment_target_in_query = overlap / target_size; + + let max_containment = containment_query_in_target.max(containment_target_in_query); + let jaccard = overlap / (target_size + query_size - overlap); + + let qani = ani_from_containment(containment_query_in_target, against_mh.ksize() as f64); + let mani = ani_from_containment(containment_target_in_query, against_mh.ksize() as f64); + let query_containment_ani = Some(qani); + let match_containment_ani = Some(mani); + let average_containment_ani = Some((qani + mani) / 2.); + let max_containment_ani = Some(f64::max(qani, mani)); + + let calc_abund_stats = against_mh.track_abundance() && !ignore_abundance; + let (total_weighted_hashes, n_weighted_found, average_abund, median_abund, std_abund) = + if calc_abund_stats { + inflate_abundances(&query.minhash, against_mh).ok()? 
+ } else { + (None, None, None, None, None) + }; + + let sr = ManySearchResult { + query_name: query.name.clone(), + query_md5: query.md5sum.clone(), + match_name: against_name.to_string(), + containment: containment_query_in_target, + intersect_hashes: overlap as u64, + ksize: query.minhash.ksize() as u16, + scaled: query.minhash.scaled(), + moltype: query.minhash.hash_function().to_string(), + match_md5: Some(against_md5.to_string()), + jaccard: Some(jaccard), + max_containment: Some(max_containment), + average_abund, + median_abund, + std_abund, + query_containment_ani, + match_containment_ani, + average_containment_ani, + max_containment_ani, + n_weighted_found, + total_weighted_hashes, + }; + return Some(sr); + } + None +} diff --git a/src/manysearch_rocksdb.rs b/src/manysearch_rocksdb.rs index 30f75dd5..85fd8542 100644 --- a/src/manysearch_rocksdb.rs +++ b/src/manysearch_rocksdb.rs @@ -14,7 +14,7 @@ use sourmash::sketch::minhash::KmerMinHash; use sourmash::storage::SigStore; use crate::utils::{ - csvwriter_thread, is_revindex_database, load_collection, ReportType, SearchResult, + csvwriter_thread, is_revindex_database, load_collection, ManySearchResult, ReportType, }; pub fn manysearch_rocksdb( @@ -24,6 +24,7 @@ pub fn manysearch_rocksdb( minimum_containment: f64, output: Option, allow_failed_sigpaths: bool, + output_all_comparisons: bool, ) -> Result<(), Box> { if !is_revindex_database(&index) { bail!("'{}' is not a valid RevIndex database", index); @@ -68,7 +69,8 @@ pub fn manysearch_rocksdb( )?; // set up a multi-producer, single-consumer channel. - let (send, recv) = std::sync::mpsc::sync_channel::(rayon::current_num_threads()); + let (send, recv) = + std::sync::mpsc::sync_channel::(rayon::current_num_threads()); // & spawn a thread that is dedicated to printing to a buffered output let thrd = csvwriter_thread(recv, output); @@ -107,13 +109,13 @@ pub fn manysearch_rocksdb( // filter the matches for containment for (path, overlap) in matches { let containment = overlap as f64 / query_size as f64; - if containment >= minimum_containment { + if containment >= minimum_containment || output_all_comparisons { let query_containment_ani = Some(ani_from_containment( containment, query_mh.ksize() as f64, )); - results.push(SearchResult { + results.push(ManySearchResult { query_name: query_name.clone(), query_md5: query_md5.clone(), match_name: path.clone(), diff --git a/src/multisearch.rs b/src/multisearch.rs index 0aabba36..fb030a2b 100644 --- a/src/multisearch.rs +++ b/src/multisearch.rs @@ -143,6 +143,7 @@ pub fn multisearch( allow_failed_sigpaths: bool, estimate_ani: bool, estimate_prob_overlap: bool, + output_all_comparisons: bool, output: Option, ) -> Result<(), Box> { // Load all queries into memory at once. @@ -249,7 +250,7 @@ pub fn multisearch( let containment_query_in_target = overlap / query_size; - if containment_query_in_target > threshold { + if containment_query_in_target > threshold || output_all_comparisons { let containment_target_in_query = overlap / target_size; let max_containment = containment_query_in_target.max(containment_target_in_query); diff --git a/src/pairwise.rs b/src/pairwise.rs index b9e1a3b3..f5d2d362 100644 --- a/src/pairwise.rs +++ b/src/pairwise.rs @@ -20,6 +20,7 @@ pub fn pairwise( allow_failed_sigpaths: bool, estimate_ani: bool, write_all: bool, + output_all_comparisons: bool, output: Option, ) -> Result<(), Box> { // Load all sigs into memory at once. 
@@ -86,7 +87,10 @@ pub fn pairwise( let containment_adjusted_log10 = None; let tf_idf_score = None; - if containment_q1_in_q2 > threshold || containment_q2_in_q1 > threshold { + if containment_q1_in_q2 > threshold + || containment_q2_in_q1 > threshold + || output_all_comparisons + { let max_containment = containment_q1_in_q2.max(containment_q2_in_q1); let jaccard = overlap / (query1_size + query2_size - overlap); let mut query_containment_ani = None; @@ -133,7 +137,7 @@ pub fn pairwise( eprintln!("Processed {} comparisons", i); } } - if write_all { + if write_all || output_all_comparisons { let mut query_containment_ani = None; let mut match_containment_ani = None; let mut average_containment_ani = None; diff --git a/src/python/sourmash_plugin_branchwater/__init__.py b/src/python/sourmash_plugin_branchwater/__init__.py index 97dc1880..7e3c2d46 100755 --- a/src/python/sourmash_plugin_branchwater/__init__.py +++ b/src/python/sourmash_plugin_branchwater/__init__.py @@ -107,6 +107,12 @@ def __init__(self, p): action="store_true", help="do not do expensive abundance calculations", ) + p.add_argument( + "-A", + "--output-all-comparisons", + action="store_true", + help="ignore threshold and output all comparisons; against a RocksDB database, this will only output comparisons with some overlap", + ) def main(self, args): print_version() @@ -129,6 +135,7 @@ def main(self, args): args.moltype, args.output, args.ignore_abundance, + args.output_all_comparisons, ) if status == 0: notify(f"...manysearch is done! results in '{args.output}'") @@ -462,6 +469,12 @@ def __init__(self, p): action="store_true", help="estimate probability of overlap for significance ranking of search results, of the specific query and match, given all queries and possible matches", ) + p.add_argument( + "-A", + "--output-all-comparisons", + action="store_true", + help="ignore threshold and output all comparisons", + ) def main(self, args): print_version() @@ -488,6 +501,7 @@ def main(self, args): args.moltype, args.ani, args.prob_significant_overlap, + args.output_all_comparisons, args.output, ) if status == 0: @@ -545,9 +559,16 @@ def __init__(self, p): ) p.add_argument( "--write-all", + "--write-self-comparisons", action="store_true", help="write self comparisons for all sketches", ) + p.add_argument( + "-A", + "--output-all-comparisons", + action="store_true", + help="ignore threshold and output all comparisons", + ) def main(self, args): print_version() @@ -570,6 +591,7 @@ def main(self, args): args.moltype, args.ani, args.write_all, + args.output_all_comparisons, args.output, ) if status == 0: diff --git a/src/python/tests/test_fastgather.py b/src/python/tests/test_fastgather.py index 7442a645..c3be996c 100644 --- a/src/python/tests/test_fastgather.py +++ b/src/python/tests/test_fastgather.py @@ -427,40 +427,6 @@ def test_bad_against_2(runtmp, capfd): ) -def test_bad_against_3(runtmp, capfd): - # test with a bad against (a .sig.gz file renamed as zip file) - query = get_test_data("SRR606249.sig.gz") - - sig2 = get_test_data("2.fa.sig.gz") - against_zip = runtmp.output("against.zip") - # cp sig2 into against_zip - with open(against_zip, "wb") as fp: - with open(sig2, "rb") as fp2: - fp.write(fp2.read()) - - g_output = runtmp.output("gather.csv") - p_output = runtmp.output("prefetch.csv") - - with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash( - "scripts", - "fastgather", - query, - against_zip, - "-o", - g_output, - "--output-prefetch", - p_output, - "-s", - "100000", - ) - - captured = capfd.readouterr() - 
print(captured.err) - - assert "InvalidArchive" in captured.err - - @pytest.mark.xfail(reason="should work, bug") def test_against_multisigfile(runtmp, zip_against): # test against a sigfile that contains multiple sketches diff --git a/src/python/tests/test_fastmultigather.py b/src/python/tests/test_fastmultigather.py index 9d9c17e7..d47f62ad 100644 --- a/src/python/tests/test_fastmultigather.py +++ b/src/python/tests/test_fastmultigather.py @@ -561,34 +561,6 @@ def test_sig_query(runtmp, capfd, indexed): }.issubset(keys) -def test_bad_query(runtmp, capfd, indexed): - # test with a bad query (a .sig.gz file renamed as zip file) - against_list = runtmp.output("against.txt") - - sig2 = get_test_data("2.fa.sig.gz") - sig47 = get_test_data("47.fa.sig.gz") - sig63 = get_test_data("63.fa.sig.gz") - - query_zip = runtmp.output("query.zip") - # cp sig2 into query_zip - with open(query_zip, "wb") as fp: - with open(sig2, "rb") as fp2: - fp.write(fp2.read()) - - make_file_list(against_list, [sig2, sig47, sig63]) - - if indexed: - against_list = index_siglist(runtmp, against_list, runtmp.output("db")) - - with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash("scripts", "fastmultigather", query_zip, against_list) - - captured = capfd.readouterr() - print(captured.err) - - assert "InvalidArchive" in captured.err - - def test_missing_query(runtmp, capfd, indexed): # test missing query query_list = runtmp.output("query.txt") @@ -736,33 +708,6 @@ def test_bad_against(runtmp, capfd): ) -def test_bad_against_2(runtmp, capfd, zip_query): - # test with a bad against (a .sig.gz file renamed as zip file) - query = get_test_data("SRR606249.sig.gz") - query_list = runtmp.output("query.txt") - make_file_list(query_list, [query]) - - sig2 = get_test_data("2.fa.sig.gz") - against_zip = runtmp.output("against.zip") - # cp sig2 into query_zip - with open(against_zip, "wb") as fp: - with open(sig2, "rb") as fp2: - fp.write(fp2.read()) - - if zip_query: - query_list = zip_siglist(runtmp, query_list, runtmp.output("query.zip")) - - with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash( - "scripts", "fastmultigather", query_list, against_zip, "-s", "100000" - ) - - captured = capfd.readouterr() - print(captured.err) - - assert "InvalidArchive" in captured.err - - def test_empty_against(runtmp, capfd): # test bad 'against' file - in this case, an empty one query = get_test_data("SRR606249.sig.gz") diff --git a/src/python/tests/test_index.py b/src/python/tests/test_index.py index baefe060..ff0305b2 100644 --- a/src/python/tests/test_index.py +++ b/src/python/tests/test_index.py @@ -429,27 +429,6 @@ def test_index_zipfile_multiparam(runtmp, capfd, toggle_internal_storage): runtmp.sourmash("scripts", "index", zipf, "-o", output, toggle_internal_storage) -def test_index_zipfile_bad(runtmp, capfd): - # test with a bad input zipfile (a .sig.gz file renamed as zip file) - sig2 = get_test_data("2.fa.sig.gz") - - query_zip = runtmp.output("query.zip") - # cp sig2 into query_zip - with open(query_zip, "wb") as fp: - with open(sig2, "rb") as fp2: - fp.write(fp2.read()) - - output = runtmp.output("out.csv") - - with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash("scripts", "index", query_zip, "-o", output) - - captured = capfd.readouterr() - print(captured.err) - - assert "Couldn't find End Of Central Directory Record" in captured.err - - def test_index_check(runtmp, toggle_internal_storage): # test check index siglist = runtmp.output("db-sigs.txt") diff --git 
a/src/python/tests/test_manysearch.py b/src/python/tests/test_manysearch.py index 6275b0cf..c41070f3 100644 --- a/src/python/tests/test_manysearch.py +++ b/src/python/tests/test_manysearch.py @@ -113,6 +113,116 @@ def test_simple(runtmp, zip_query, zip_against): assert max_ani == 0.9772 +def test_simple_output_all(runtmp, zip_query, zip_against): + # test basic execution! + query_list = runtmp.output("query.txt") + against_list = runtmp.output("against.txt") + + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") + + make_file_list(query_list, [sig2, sig47, sig63]) + make_file_list(against_list, [sig2, sig47, sig63]) + + output = runtmp.output("out.csv") + + if zip_query: + query_list = zip_siglist(runtmp, query_list, runtmp.output("query.zip")) + if zip_against: + against_list = zip_siglist(runtmp, against_list, runtmp.output("against.zip")) + + runtmp.sourmash( + "scripts", + "manysearch", + query_list, + against_list, + "-o", + output, + "-t", + "0.01", + "-A", + ) + assert os.path.exists(output) + + df = pandas.read_csv(output) + assert len(df) == 9 + + dd = df.to_dict(orient="index") + print(dd) + + for idx, row in dd.items(): + # identical? + if row["match_name"] == row["query_name"]: + assert row["query_md5"] == row["match_md5"], row + assert float(row["containment"] == 1.0) + assert float(row["jaccard"] == 1.0) + assert float(row["max_containment"] == 1.0) + assert float(row["query_containment_ani"] == 1.0) + assert float(row["match_containment_ani"] == 1.0) + assert float(row["average_containment_ani"] == 1.0) + assert float(row["max_containment_ani"] == 1.0) + + else: + # confirm hand-checked numbers + q = row["query_name"].split()[0] + m = row["match_name"].split()[0] + cont = float(row["containment"]) + jaccard = float(row["jaccard"]) + maxcont = float(row["max_containment"]) + intersect_hashes = int(row["intersect_hashes"]) + query_ani = float(row["query_containment_ani"]) + match_ani = float(row["match_containment_ani"]) + average_ani = float(row["average_containment_ani"]) + max_ani = float(row["max_containment_ani"]) + jaccard = round(jaccard, 4) + cont = round(cont, 4) + maxcont = round(maxcont, 4) + query_ani = round(query_ani, 4) + match_ani = round(match_ani, 4) + average_ani = round(average_ani, 4) + max_ani = round(max_ani, 4) + print( + q, + m, + f"{jaccard:.04}", + f"{cont:.04}", + f"{maxcont:.04}", + f"{query_ani:.04}", + f"{match_ani:.04}", + f"{average_ani:.04}", + f"{max_ani:.04}", + ) + + if q == "NC_011665.1" and m == "NC_009661.1": + assert jaccard == 0.3207 + assert cont == 0.4828 + assert maxcont == 0.4885 + assert intersect_hashes == 2529 + assert query_ani == 0.9768 + assert match_ani == 0.9772 + assert average_ani == 0.977 + assert max_ani == 0.9772 + elif q == "NC_009661.1" and m == "NC_011665.1": + assert jaccard == 0.3207 + assert cont == 0.4885 + assert maxcont == 0.4885 + assert intersect_hashes == 2529 + assert query_ani == 0.9772 + assert match_ani == 0.9768 + assert average_ani == 0.977 + assert max_ani == 0.9772 + else: + assert jaccard == 0 + assert cont == 0 + assert maxcont == 0 + assert intersect_hashes == 0 + assert query_ani == 0 + assert match_ani == 0 + assert average_ani == 0 + assert max_ani == 0 + + def test_simple_abund(runtmp): # test with abund sig sig2 = get_test_data("2.fa.sig.gz") @@ -556,33 +666,6 @@ def test_bad_query_2(runtmp, capfd, indexed): ) -def test_bad_query_3(runtmp, capfd): - # test with a bad query (a .sig.gz file renamed as zip file) - against_list = 
runtmp.output("against.txt") - - sig2 = get_test_data("2.fa.sig.gz") - sig47 = get_test_data("47.fa.sig.gz") - sig63 = get_test_data("63.fa.sig.gz") - - query_zip = runtmp.output("query.zip") - # cp sig2 into query_zip - with open(query_zip, "wb") as fp: - with open(sig2, "rb") as fp2: - fp.write(fp2.read()) - - make_file_list(against_list, [sig2, sig47, sig63]) - - output = runtmp.output("out.csv") - - with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash("scripts", "multisearch", query_zip, against_list, "-o", output) - - captured = capfd.readouterr() - print(captured.err) - - assert "InvalidArchive" in captured.err - - def test_missing_against(runtmp, capfd, indexed): # test with a missing against list query_list = runtmp.output("query.txt") diff --git a/src/python/tests/test_multisearch.py b/src/python/tests/test_multisearch.py index dfc65ee2..5b78223d 100644 --- a/src/python/tests/test_multisearch.py +++ b/src/python/tests/test_multisearch.py @@ -91,6 +91,80 @@ def test_simple_no_ani(runtmp, zip_query, zip_db): assert intersect_hashes == 2529 +def test_simple_no_ani_output_all(runtmp, zip_query, zip_db): + # test basic execution! + query_list = runtmp.output("query.txt") + against_list = runtmp.output("against.txt") + + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") + + make_file_list(query_list, [sig2, sig47, sig63]) + make_file_list(against_list, [sig2, sig47, sig63]) + + output = runtmp.output("out.csv") + + if zip_db: + against_list = zip_siglist(runtmp, against_list, runtmp.output("db.zip")) + if zip_query: + query_list = zip_siglist(runtmp, query_list, runtmp.output("query.zip")) + + runtmp.sourmash( + "scripts", "multisearch", query_list, against_list, "-o", output, "-A" + ) + assert os.path.exists(output) + + df = pandas.read_csv(output) + assert len(df) == 9 + + dd = df.to_dict(orient="index") + print(dd) + + for idx, row in dd.items(): + assert not ("prob_overlap" in row) + + # identical? + if row["match_name"] == row["query_name"]: + assert row["query_md5"] == row["match_md5"], row + assert float(row["containment"] == 1.0) + assert float(row["jaccard"] == 1.0) + assert float(row["max_containment"] == 1.0) + assert "query_containment_ani" not in row + assert "match_containment_ani" not in row + assert "average_containment_ani" not in row + assert "max_containment_ani" not in row + + else: + # confirm hand-checked numbers + q = row["query_name"].split()[0] + m = row["match_name"].split()[0] + cont = float_round(row["containment"], 4) + jaccard = float_round(row["jaccard"], 4) + maxcont = float_round(row["max_containment"], 4) + + intersect_hashes = int(row["intersect_hashes"]) + + print(q, m, f"{jaccard:.04}", f"{cont:.04}", f"{maxcont:.04}") + + if q == "NC_011665.1" and m == "NC_009661.1": + assert jaccard == 0.3207 + assert cont == 0.4828 + assert maxcont == 0.4885 + assert intersect_hashes == 2529 + + elif q == "NC_009661.1" and m == "NC_011665.1": + assert jaccard == 0.3207 + assert cont == 0.4885 + assert maxcont == 0.4885 + assert intersect_hashes == 2529 + else: + assert jaccard == 0 + assert cont == 0 + assert maxcont == 0 + assert intersect_hashes == 0 + + def test_simple_prob_overlap(runtmp, zip_query, zip_db, indexed_query, indexed_against): # test basic execution! 
query_list = runtmp.output("query.txt") @@ -628,33 +702,6 @@ def test_bad_query(runtmp, capfd): ) -def test_bad_query_3(runtmp, capfd): - # test with a bad query (a .sig.gz file renamed as zip file) - against_list = runtmp.output("against.txt") - - sig2 = get_test_data("2.fa.sig.gz") - sig47 = get_test_data("47.fa.sig.gz") - sig63 = get_test_data("63.fa.sig.gz") - - query_zip = runtmp.output("query.zip") - # cp sig2 into query_zip - with open(query_zip, "wb") as fp: - with open(sig2, "rb") as fp2: - fp.write(fp2.read()) - - make_file_list(against_list, [sig2, sig47, sig63]) - - output = runtmp.output("out.csv") - - with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash("scripts", "multisearch", query_zip, against_list, "-o", output) - - captured = capfd.readouterr() - print(captured.err) - - assert "InvalidArchive" in captured.err - - def test_missing_against(runtmp, capfd, zip_db): # test with a missing against list query_list = runtmp.output("query.txt") diff --git a/src/python/tests/test_pairwise.py b/src/python/tests/test_pairwise.py index 1a940043..fd6f7a86 100644 --- a/src/python/tests/test_pairwise.py +++ b/src/python/tests/test_pairwise.py @@ -20,6 +20,17 @@ def test_installed(runtmp): assert "usage: pairwise" in runtmp.last_result.err +def test_on_dir(runtmp, capfd): + with pytest.raises(utils.SourmashCommandFailed): + runtmp.sourmash( + "scripts", "pairwise", runtmp.output(""), "-o", runtmp.output("xxx.csv") + ) + + captured = capfd.readouterr() + print(captured.err) + assert "arbitrary directories are not supported" in captured.err + + def test_simple_no_ani(runtmp, capfd, zip_query, indexed): # test basic execution! query_list = runtmp.output("query.txt") @@ -80,6 +91,31 @@ def test_simple_no_ani(runtmp, capfd, zip_query, indexed): ) +def test_simple_no_ani_output_all(runtmp, capfd, zip_query, indexed): + # test basic execution! + query_list = runtmp.output("query.txt") + + sig2 = get_test_data("2.fa.sig.gz") + sig47 = get_test_data("47.fa.sig.gz") + sig63 = get_test_data("63.fa.sig.gz") + + make_file_list(query_list, [sig2, sig47, sig63]) + + output = runtmp.output("out.csv") + + if zip_query: + query_list = zip_siglist(runtmp, query_list, runtmp.output("query.zip")) + + if indexed: + query_list = index_siglist(runtmp, query_list, runtmp.output("db")) + + runtmp.sourmash("scripts", "pairwise", query_list, "-o", output, "-t", "-1", "-A") + assert os.path.exists(output) + + df = pandas.read_csv(output) + assert len(df) == 6 + + def test_simple_ani(runtmp, zip_query): # test basic execution! 
query_list = runtmp.output("query.txt") @@ -147,6 +183,26 @@ def test_simple_ani(runtmp, zip_query): assert q2_ani == 0.9772 assert avg_ani == 0.977 assert max_ani == 0.9772 + elif m == "NC_011665.1" and q == "NC_009661.1": + assert jaccard == 0.3207 + assert cont == 0.4885 + assert maxcont == 0.4885 + assert intersect_hashes == 2529 + assert q2_ani == 0.9768 + assert q1_ani == 0.9772 + assert avg_ani == 0.977 + assert max_ani == 0.9772 + elif q == m: + assert jaccard == 1 + else: + assert jaccard == 0 + assert cont == 0 + assert maxcont == 0 + assert intersect_hashes == 0 + assert q1_ani == 0 + assert q2_ani == 0 + assert avg_ani == 0 + assert max_ani == 0 def test_simple_threshold(runtmp, zip_query): @@ -230,29 +286,6 @@ def test_bad_query(runtmp, capfd): ) -def test_bad_query_2(runtmp, capfd): - # test with a bad query (a .sig.gz file renamed as zip file) - sig2 = get_test_data("2.fa.sig.gz") - sig47 = get_test_data("47.fa.sig.gz") - sig63 = get_test_data("63.fa.sig.gz") - - query_zip = runtmp.output("query.zip") - # cp sig2 into query_zip - with open(query_zip, "wb") as fp: - with open(sig2, "rb") as fp2: - fp.write(fp2.read()) - - output = runtmp.output("out.csv") - - with pytest.raises(utils.SourmashCommandFailed): - runtmp.sourmash("scripts", "pairwise", query_zip, "-o", output) - - captured = capfd.readouterr() - print(captured.err) - - assert "InvalidArchive" in captured.err - - def test_missing_query(runtmp, capfd, zip_db): # test with a missing query list query_list = runtmp.output("query.txt") diff --git a/src/utils/mod.rs b/src/utils/mod.rs index b8335b83..3c3e82f8 100644 --- a/src/utils/mod.rs +++ b/src/utils/mod.rs @@ -14,7 +14,7 @@ use serde::{Deserialize, Serialize}; // use rust_decimal::{MathematicalOps, Decimal}; use std::cmp::{Ordering, PartialOrd}; use std::collections::BinaryHeap; -use std::fs::{create_dir_all, File}; +use std::fs::{create_dir_all, metadata, File}; use std::io::{BufWriter, Write}; use std::panic; use std::sync::atomic; @@ -33,7 +33,7 @@ use std::collections::{HashMap, HashSet}; use std::hash::{Hash, Hasher}; pub mod multicollection; -use multicollection::MultiCollection; +pub use multicollection::{MultiCollection, SmallSignature}; pub mod buildutils; use buildutils::{BuildCollection, BuildManifest}; @@ -555,6 +555,14 @@ pub fn load_collection( } }); + // we support RocksDB directory paths, but nothing else, unlike sourmash. + if collection.is_none() { + let path_metadata = metadata(sigpath.clone()).expect("getting path metadata failed"); + if path_metadata.is_dir() { + bail!("arbitrary directories are not supported as input"); + } + } + let collection = collection.or_else( || match MultiCollection::from_standalone_manifest(&sigpath) { @@ -617,7 +625,7 @@ pub fn load_collection( /// /// # Arguments /// -/// * `sketchlist` - A slice of loaded `SmallSignature` sketches. +/// * `collection` - A MultiCollection. /// * `skipped_paths` - # paths that contained no compatible sketches. /// * `failed_paths` - # paths that failed to load. /// * `report_type` - ReportType Enum (Query or Against). Used to specify @@ -1001,7 +1009,7 @@ pub fn is_revindex_database(path: &camino::Utf8PathBuf) -> bool { } #[derive(Serialize)] -pub struct SearchResult { +pub struct ManySearchResult { pub query_name: String, pub query_md5: String, pub match_name: String,