From 2cc44e026bc147a292a1dcac3ab2bcd046be09fa Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Tue, 5 Nov 2024 11:36:26 -0800 Subject: [PATCH] MRG: standardize on u32 for scaled, and introduce `ScaledType` (#3364) Makes `scaled` a u32, and change a few others as well. This PR started because we were mixing u32 and u64 in places, but I think a switch away from usize (architecture specific & mostly returned from collection lengths, and so on) to explicit u32/u64 seems good. Fixes https://github.com/sourmash-bio/sourmash/issues/3363 --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Luiz Irber --- Cargo.lock | 2 +- include/sourmash.h | 10 +- src/core/Cargo.toml | 2 +- src/core/src/ani_utils.rs | 4 +- src/core/src/cmd.rs | 4 +- src/core/src/ffi/cmd/compute.rs | 5 +- src/core/src/ffi/index/revindex.rs | 7 +- src/core/src/ffi/minhash.rs | 2 +- src/core/src/index/linear.rs | 9 +- src/core/src/index/mod.rs | 29 +-- src/core/src/index/revindex/disk_revindex.rs | 4 +- src/core/src/lib.rs | 1 + src/core/src/manifest.rs | 6 +- src/core/src/selection.rs | 8 +- src/core/src/signature.rs | 6 +- src/core/src/sketch/minhash.rs | 22 +-- src/core/src/wasm.rs | 9 +- src/core/tests/minhash.rs | 3 +- tests/test_minhash.py | 197 +++++++++---------- tests/test_signature.py | 8 +- 20 files changed, 169 insertions(+), 169 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 58856f4c4e..0d35a5a679 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1657,7 +1657,7 @@ checksum = "9f1341053f34bb13b5e9590afb7d94b48b48d4b87467ec28e3c238693bb553de" [[package]] name = "sourmash" -version = "0.16.0" +version = "0.17.0" dependencies = [ "az", "byteorder", diff --git a/include/sourmash.h b/include/sourmash.h index 65585519dd..beb64ea455 100644 --- a/include/sourmash.h +++ b/include/sourmash.h @@ -68,6 +68,8 @@ typedef struct SourmashSignature SourmashSignature; typedef struct SourmashZipStorage SourmashZipStorage; +typedef uint32_t ScaledType; + /** * Represents a string. */ @@ -104,7 +106,7 @@ uint32_t computeparams_num_hashes(const SourmashComputeParameters *ptr); bool computeparams_protein(const SourmashComputeParameters *ptr); -uint64_t computeparams_scaled(const SourmashComputeParameters *ptr); +ScaledType computeparams_scaled(const SourmashComputeParameters *ptr); uint64_t computeparams_seed(const SourmashComputeParameters *ptr); @@ -122,7 +124,7 @@ void computeparams_set_num_hashes(SourmashComputeParameters *ptr, uint32_t num); void computeparams_set_protein(SourmashComputeParameters *ptr, bool v); -void computeparams_set_scaled(SourmashComputeParameters *ptr, uint64_t scaled); +void computeparams_set_scaled(SourmashComputeParameters *ptr, uint32_t scaled); void computeparams_set_seed(SourmashComputeParameters *ptr, uint64_t new_seed); @@ -230,7 +232,7 @@ SourmashStr kmerminhash_md5sum(const SourmashKmerMinHash *ptr); void kmerminhash_merge(SourmashKmerMinHash *ptr, const SourmashKmerMinHash *other); -SourmashKmerMinHash *kmerminhash_new(uint64_t scaled, +SourmashKmerMinHash *kmerminhash_new(uint32_t scaled, uint32_t k, HashFunctions hash_function, uint64_t seed, @@ -342,7 +344,7 @@ SourmashRevIndex *revindex_new_with_sigs(const SourmashSignature *const *search_ const SourmashKmerMinHash *const *queries_ptr, uintptr_t inqueries); -uint64_t revindex_scaled(const SourmashRevIndex *ptr); +ScaledType revindex_scaled(const SourmashRevIndex *ptr); const SourmashSearchResult *const *revindex_search(const SourmashRevIndex *ptr, const SourmashSignature *sig_ptr, diff --git a/src/core/Cargo.toml b/src/core/Cargo.toml index 8775b2af88..ad588cb22b 100644 --- a/src/core/Cargo.toml +++ b/src/core/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "sourmash" -version = "0.16.0" +version = "0.17.0" authors = ["Luiz Irber ", "N. Tessa Pierce-Ward "] description = "tools for comparing biological sequences with k-mer sketches" repository = "https://github.com/sourmash-bio/sourmash" diff --git a/src/core/src/ani_utils.rs b/src/core/src/ani_utils.rs index 073cf6050b..6886c481ed 100644 --- a/src/core/src/ani_utils.rs +++ b/src/core/src/ani_utils.rs @@ -5,7 +5,7 @@ use roots::{find_root_brent, SimpleConvergency}; use statrs::distribution::{ContinuousCDF, Normal}; -use crate::Error; +use crate::{Error, ScaledType}; fn exp_n_mutated(l: f64, k: f64, r1: f64) -> f64 { let q = r1_to_q(k, r1); @@ -93,7 +93,7 @@ pub fn ani_from_containment(containment: f64, ksize: f64) -> f64 { pub fn ani_ci_from_containment( containment: f64, ksize: f64, - scaled: u64, + scaled: ScaledType, n_unique_kmers: u64, confidence: Option, ) -> Result<(f64, f64), Error> { diff --git a/src/core/src/cmd.rs b/src/core/src/cmd.rs index a760e0f79d..b59e99b912 100644 --- a/src/core/src/cmd.rs +++ b/src/core/src/cmd.rs @@ -47,8 +47,8 @@ pub struct ComputeParameters { singleton: bool, #[getset(get_copy = "pub", set = "pub")] - #[builder(default = 0u64)] - scaled: u64, + #[builder(default = 0u32)] + scaled: u32, #[getset(get_copy = "pub", set = "pub")] #[builder(default = false)] diff --git a/src/core/src/ffi/cmd/compute.rs b/src/core/src/ffi/cmd/compute.rs index a3aa294234..4cecd456e2 100644 --- a/src/core/src/ffi/cmd/compute.rs +++ b/src/core/src/ffi/cmd/compute.rs @@ -1,6 +1,7 @@ use std::slice; use crate::cmd::ComputeParameters; +use crate::ScaledType; use crate::ffi::utils::ForeignObject; @@ -155,7 +156,7 @@ pub unsafe extern "C" fn computeparams_set_num_hashes( } #[no_mangle] -pub unsafe extern "C" fn computeparams_scaled(ptr: *const SourmashComputeParameters) -> u64 { +pub unsafe extern "C" fn computeparams_scaled(ptr: *const SourmashComputeParameters) -> ScaledType { let cp = SourmashComputeParameters::as_rust(ptr); cp.scaled() } @@ -163,7 +164,7 @@ pub unsafe extern "C" fn computeparams_scaled(ptr: *const SourmashComputeParamet #[no_mangle] pub unsafe extern "C" fn computeparams_set_scaled( ptr: *mut SourmashComputeParameters, - scaled: u64, + scaled: u32, ) { let cp = SourmashComputeParameters::as_rust_mut(ptr); cp.set_scaled(scaled); diff --git a/src/core/src/ffi/index/revindex.rs b/src/core/src/ffi/index/revindex.rs index e38bdef7fb..ef0e328139 100644 --- a/src/core/src/ffi/index/revindex.rs +++ b/src/core/src/ffi/index/revindex.rs @@ -12,6 +12,7 @@ use crate::prelude::*; use crate::signature::{Signature, SigsTrait}; use crate::sketch::minhash::KmerMinHash; use crate::sketch::Sketch; +use crate::ScaledType; pub struct SourmashRevIndex; @@ -22,8 +23,8 @@ impl ForeignObject for SourmashRevIndex { // TODO: remove this when it is possible to pass Selection thru the FFI fn from_template(template: &Sketch) -> Selection { let (num, scaled) = match template { - Sketch::MinHash(mh) => (mh.num(), mh.scaled() as u32), - Sketch::LargeMinHash(mh) => (mh.num(), mh.scaled() as u32), + Sketch::MinHash(mh) => (mh.num(), mh.scaled()), + Sketch::LargeMinHash(mh) => (mh.num(), mh.scaled()), _ => unimplemented!(), }; @@ -233,7 +234,7 @@ unsafe fn revindex_gather( } #[no_mangle] -pub unsafe extern "C" fn revindex_scaled(ptr: *const SourmashRevIndex) -> u64 { +pub unsafe extern "C" fn revindex_scaled(ptr: *const SourmashRevIndex) -> ScaledType { let revindex = SourmashRevIndex::as_rust(ptr); if let Sketch::MinHash(mh) = revindex.template() { mh.scaled() diff --git a/src/core/src/ffi/minhash.rs b/src/core/src/ffi/minhash.rs index 7d7c050fb0..55879dd060 100644 --- a/src/core/src/ffi/minhash.rs +++ b/src/core/src/ffi/minhash.rs @@ -17,7 +17,7 @@ impl ForeignObject for SourmashKmerMinHash { #[no_mangle] pub unsafe extern "C" fn kmerminhash_new( - scaled: u64, + scaled: u32, k: u32, hash_function: HashFunctions, seed: u64, diff --git a/src/core/src/index/linear.rs b/src/core/src/index/linear.rs index f0690539b9..489e69c86e 100644 --- a/src/core/src/index/linear.rs +++ b/src/core/src/index/linear.rs @@ -151,7 +151,8 @@ impl LinearIndex { .internal_location() .into(); let match_sig = self.collection.sig_for_dataset(dataset_id)?; - let result = self.stats_for_match(match_sig, query, match_size, match_path, round)?; + let result = + self.stats_for_match(match_sig, query, match_size, match_path, round as u32)?; Ok(result) } @@ -161,7 +162,7 @@ impl LinearIndex { query: &KmerMinHash, match_size: usize, match_path: PathBuf, - gather_result_rank: usize, + gather_result_rank: u32, ) -> Result { let template = self.template(); @@ -176,10 +177,10 @@ impl LinearIndex { let f_match = match_size as f64 / match_mh.size() as f64; let filename = match_path.into_string(); let name = match_sig.name(); - let unique_intersect_bp = match_mh.scaled() as usize * match_size; + let unique_intersect_bp = (match_mh.scaled() as usize * match_size) as u64; let (intersect_orig, _) = match_mh.intersection_size(query)?; - let intersect_bp = (match_mh.scaled() * intersect_orig) as usize; + let intersect_bp: u64 = match_mh.scaled() as u64 * intersect_orig; let f_unique_to_query = intersect_orig as f64 / query.size() as f64; let match_ = match_sig; diff --git a/src/core/src/index/mod.rs b/src/core/src/index/mod.rs index 58330235ee..0bd9d9fec8 100644 --- a/src/core/src/index/mod.rs +++ b/src/core/src/index/mod.rs @@ -33,7 +33,7 @@ use crate::Result; #[derive(TypedBuilder, CopyGetters, Getters, Setters, Serialize, Deserialize, Debug, PartialEq)] pub struct GatherResult { #[getset(get_copy = "pub")] - intersect_bp: usize, + intersect_bp: u64, #[getset(get_copy = "pub")] f_orig_query: f64, @@ -72,22 +72,22 @@ pub struct GatherResult { f_match_orig: f64, #[getset(get_copy = "pub")] - unique_intersect_bp: usize, + unique_intersect_bp: u64, #[getset(get_copy = "pub")] - gather_result_rank: usize, + gather_result_rank: u32, #[getset(get_copy = "pub")] - remaining_bp: usize, + remaining_bp: u64, #[getset(get_copy = "pub")] - n_unique_weighted_found: usize, + n_unique_weighted_found: u64, #[getset(get_copy = "pub")] - total_weighted_hashes: usize, + total_weighted_hashes: u64, #[getset(get_copy = "pub")] - sum_weighted_found: usize, + sum_weighted_found: u64, #[getset(get_copy = "pub")] query_containment_ani: f64, @@ -212,9 +212,9 @@ pub fn calculate_gather_stats( remaining_query: KmerMinHash, match_sig: SigStore, match_size: usize, - gather_result_rank: usize, - sum_weighted_found: usize, - total_weighted_hashes: usize, + gather_result_rank: u32, + sum_weighted_found: u64, + total_weighted_hashes: u64, calc_abund_stats: bool, calc_ani_ci: bool, confidence: Option, @@ -242,17 +242,18 @@ pub fn calculate_gather_stats( trace!("query.size: {}", remaining_query.size()); //bp remaining in subtracted query - let remaining_bp = (remaining_query.size() - isect_size) * remaining_query.scaled() as usize; + let remaining_bp = + (remaining_query.size() - isect_size) as u64 * remaining_query.scaled() as u64; // stats for this match vs original query let (intersect_orig, _) = match_mh.intersection_size(orig_query).unwrap(); - let intersect_bp = (match_mh.scaled() * intersect_orig) as usize; + let intersect_bp = match_mh.scaled() as u64 * intersect_orig; let f_orig_query = intersect_orig as f64 / orig_query.size() as f64; let f_match_orig = intersect_orig as f64 / match_mh.size() as f64; // stats for this match vs current (subtracted) query let f_match = match_size as f64 / match_mh.size() as f64; - let unique_intersect_bp = match_mh.scaled() as usize * isect_size; + let unique_intersect_bp = match_mh.scaled() as u64 * isect_size as u64; let f_unique_to_query = isect_size as f64 / orig_query.size() as f64; // // get ANI values @@ -309,7 +310,7 @@ pub fn calculate_gather_stats( } }; - n_unique_weighted_found = unique_weighted_found as usize; + n_unique_weighted_found = unique_weighted_found; sum_total_weighted_found = sum_weighted_found + n_unique_weighted_found; f_unique_weighted = n_unique_weighted_found as f64 / total_weighted_hashes as f64; diff --git a/src/core/src/index/revindex/disk_revindex.rs b/src/core/src/index/revindex/disk_revindex.rs index 0d575310aa..46552c2c67 100644 --- a/src/core/src/index/revindex/disk_revindex.rs +++ b/src/core/src/index/revindex/disk_revindex.rs @@ -411,7 +411,7 @@ impl RevIndexOps for RevIndex { let query_mh = KmerMinHash::from(query.clone()); // just calculate essentials here - let gather_result_rank = matches.len(); + let gather_result_rank = matches.len() as u32; // grab the specific intersection: // Calculate stats @@ -422,7 +422,7 @@ impl RevIndexOps for RevIndex { match_size, gather_result_rank, sum_weighted_found, - total_weighted_hashes.try_into().unwrap(), + total_weighted_hashes, calc_abund_stats, calc_ani_ci, ani_confidence_interval_fraction, diff --git a/src/core/src/lib.rs b/src/core/src/lib.rs index 8676f683dc..d8f994f87c 100644 --- a/src/core/src/lib.rs +++ b/src/core/src/lib.rs @@ -53,6 +53,7 @@ cfg_if! { } type HashIntoType = u64; +pub type ScaledType = u32; pub fn _hash_murmur(kmer: &[u8], seed: u64) -> u64 { murmurhash3_x64_128(kmer, seed).0 diff --git a/src/core/src/manifest.rs b/src/core/src/manifest.rs index 2f1eca7e0f..a23776dfdc 100644 --- a/src/core/src/manifest.rs +++ b/src/core/src/manifest.rs @@ -15,7 +15,7 @@ use crate::encodings::HashFunctions; use crate::prelude::*; use crate::signature::SigsTrait; use crate::sketch::Sketch; -use crate::Result; +use crate::{Result, ScaledType}; /// Individual manifest record, containing information about sketches. @@ -38,7 +38,7 @@ pub struct Record { num: u32, #[getset(get = "pub")] - scaled: u64, + scaled: ScaledType, #[getset(get = "pub")] n_hashes: usize, @@ -279,7 +279,7 @@ impl Select for Manifest { }; valid = if let Some(scaled) = selection.scaled() { // num sigs have row.scaled = 0, don't include them - valid && row.scaled != 0 && row.scaled <= scaled as u64 + valid && row.scaled != 0 && row.scaled <= scaled } else { valid }; diff --git a/src/core/src/selection.rs b/src/core/src/selection.rs index 536981fa97..3c94c7f0c7 100644 --- a/src/core/src/selection.rs +++ b/src/core/src/selection.rs @@ -3,7 +3,7 @@ use typed_builder::TypedBuilder; use crate::encodings::HashFunctions; use crate::manifest::Record; -use crate::Result; +use crate::{Result, ScaledType}; #[derive(Default, Debug, TypedBuilder, Clone)] pub struct Selection { @@ -17,7 +17,7 @@ pub struct Selection { num: Option, #[builder(default, setter(strip_option))] - scaled: Option, + scaled: Option, #[builder(default, setter(strip_option))] containment: Option, @@ -87,11 +87,11 @@ impl Selection { self.num = Some(num); } - pub fn scaled(&self) -> Option { + pub fn scaled(&self) -> Option { self.scaled } - pub fn set_scaled(&mut self, scaled: u32) { + pub fn set_scaled(&mut self, scaled: ScaledType) { self.scaled = Some(scaled); } diff --git a/src/core/src/signature.rs b/src/core/src/signature.rs index 4544897054..31fc86f4af 100644 --- a/src/core/src/signature.rs +++ b/src/core/src/signature.rs @@ -812,7 +812,7 @@ impl Select for Signature { // keep compatible scaled if applicable valid = if let Some(sel_scaled) = selection.scaled() { match s { - Sketch::MinHash(mh) => valid && mh.scaled() <= sel_scaled as u64, + Sketch::MinHash(mh) => valid && mh.scaled() <= sel_scaled, // TODO: test LargeMinHash // Sketch::LargeMinHash(lmh) => valid && lmh.scaled() <= sel_scaled as u64, _ => valid, // other sketch types or invalid cases @@ -841,8 +841,8 @@ impl Select for Signature { for sketch in self.signatures.iter_mut() { // TODO: also account for LargeMinHash if let Sketch::MinHash(mh) = sketch { - if (mh.scaled() as u32) < sel_scaled { - *sketch = Sketch::MinHash(mh.clone().downsample_scaled(sel_scaled as u64)?); + if mh.scaled() < sel_scaled { + *sketch = Sketch::MinHash(mh.clone().downsample_scaled(sel_scaled)?); } } } diff --git a/src/core/src/sketch/minhash.rs b/src/core/src/sketch/minhash.rs index 7341031673..60b1c97c73 100644 --- a/src/core/src/sketch/minhash.rs +++ b/src/core/src/sketch/minhash.rs @@ -12,13 +12,13 @@ use serde::ser::{SerializeStruct, Serializer}; use serde::{Deserialize, Serialize}; use typed_builder::TypedBuilder; -use crate::_hash_murmur; use crate::encodings::HashFunctions; use crate::signature::SigsTrait; use crate::sketch::hyperloglog::HyperLogLog; use crate::Error; +use crate::{ScaledType, _hash_murmur}; -pub fn max_hash_for_scaled(scaled: u64) -> u64 { +pub fn max_hash_for_scaled(scaled: ScaledType) -> u64 { match scaled { 0 => 0, // scaled == 0 indicates this is a num minhash 1 => u64::MAX, @@ -26,10 +26,10 @@ pub fn max_hash_for_scaled(scaled: u64) -> u64 { } } -pub fn scaled_for_max_hash(max_hash: u64) -> u64 { +pub fn scaled_for_max_hash(max_hash: u64) -> ScaledType { match max_hash { 0 => 0, // scaled == 0 indicates this is a num minhash - _ => (u64::MAX as f64 / max_hash as f64) as u64, + _ => (u64::MAX as f64 / max_hash as f64) as ScaledType, } } @@ -185,7 +185,7 @@ impl<'de> Deserialize<'de> for KmerMinHash { impl KmerMinHash { pub fn new( - scaled: u64, + scaled: ScaledType, ksize: u32, hash_function: HashFunctions, seed: u64, @@ -230,7 +230,7 @@ impl KmerMinHash { self.max_hash } - pub fn scaled(&self) -> u64 { + pub fn scaled(&self) -> ScaledType { scaled_for_max_hash(self.max_hash) } @@ -770,11 +770,11 @@ impl KmerMinHash { // this could be improved by generating an HLL estimate while sketching instead // (for scaled minhashes) pub fn n_unique_kmers(&self) -> u64 { - self.size() as u64 * self.scaled() // + (self.ksize - 1) for bp estimation + self.size() as u64 * self.scaled() as u64 // + (self.ksize - 1) for bp estimation } // create a downsampled copy of self - pub fn downsample_scaled(self, scaled: u64) -> Result { + pub fn downsample_scaled(self, scaled: ScaledType) -> Result { if self.scaled() == scaled || self.scaled() == 0 { Ok(self) } else if self.scaled() > scaled { @@ -1115,7 +1115,7 @@ impl<'de> Deserialize<'de> for KmerMinHashBTree { impl KmerMinHashBTree { pub fn new( - scaled: u64, + scaled: ScaledType, ksize: u32, hash_function: HashFunctions, seed: u64, @@ -1157,7 +1157,7 @@ impl KmerMinHashBTree { self.max_hash } - pub fn scaled(&self) -> u64 { + pub fn scaled(&self) -> ScaledType { scaled_for_max_hash(self.max_hash) } @@ -1552,7 +1552,7 @@ impl KmerMinHashBTree { } // create a downsampled copy of self - pub fn downsample_scaled(self, scaled: u64) -> Result { + pub fn downsample_scaled(self, scaled: ScaledType) -> Result { if self.scaled() == scaled || self.scaled() == 0 { Ok(self) } else if self.scaled() > scaled { diff --git a/src/core/src/wasm.rs b/src/core/src/wasm.rs index cd9efec091..c10eda4e6e 100644 --- a/src/core/src/wasm.rs +++ b/src/core/src/wasm.rs @@ -12,6 +12,7 @@ use crate::encodings::HashFunctions; use crate::signature::Signature as _Signature; use crate::signature::SigsTrait; use crate::sketch::minhash::KmerMinHash as _KmerMinHash; +use crate::ScaledType; #[wasm_bindgen] pub struct KmerMinHash(_KmerMinHash); @@ -32,7 +33,7 @@ impl KmerMinHash { dayhoff: bool, hp: bool, seed: u32, - scaled: u32, + scaled: ScaledType, track_abundance: bool, ) -> KmerMinHash { // TODO: at most one of (prot, dayhoff, hp) should be true @@ -48,7 +49,7 @@ impl KmerMinHash { }; KmerMinHash(_KmerMinHash::new( - scaled as u64, + scaled, ksize, hash_function, seed as u64, @@ -84,8 +85,8 @@ impl ComputeParameters { } #[wasm_bindgen] - pub fn set_scaled(&mut self, scaled: u32) { - self.0.set_scaled(scaled as u64); + pub fn set_scaled(&mut self, scaled: ScaledType) { + self.0.set_scaled(scaled); } #[wasm_bindgen] diff --git a/src/core/tests/minhash.rs b/src/core/tests/minhash.rs index 040f3506b6..bdbba0cc20 100644 --- a/src/core/tests/minhash.rs +++ b/src/core/tests/minhash.rs @@ -12,6 +12,7 @@ use sourmash::sketch::minhash::{ max_hash_for_scaled, scaled_for_max_hash, KmerMinHash, KmerMinHashBTree, }; use sourmash::sketch::Sketch; +use sourmash::ScaledType; // TODO: use f64::EPSILON when we bump MSRV const EPSILON: f64 = 0.01; @@ -328,7 +329,7 @@ fn oracle_mins_scaled(hashes in vec(u64::ANY, 1..10000)) { proptest! { #[test] fn prop_merge(seq1 in "[ACGT]{6,100}", seq2 in "[ACGT]{6,200}") { - let scaled: u64 = 10; + let scaled: ScaledType = 10; let mut a = KmerMinHash::new(scaled, 6, HashFunctions::Murmur64Dna, 42, true, 0); let mut b = KmerMinHashBTree::new(scaled, 6, HashFunctions::Murmur64Dna, 42, true, 0); diff --git a/tests/test_minhash.py b/tests/test_minhash.py index f181381485..654c4ea828 100644 --- a/tests/test_minhash.py +++ b/tests/test_minhash.py @@ -63,9 +63,12 @@ # * nan on empty minhash # * define equals -scaled50 = _get_scaled_for_max_hash(50) -scaled100 = _get_scaled_for_max_hash(100) -scaled5000 = _get_scaled_for_max_hash(5000) +scaled2 = _get_scaled_for_max_hash(2**63) +assert scaled2 == 2 +scaled4 = _get_scaled_for_max_hash(2**62) +assert scaled4 == 4 +scaled8 = _get_scaled_for_max_hash(2**61) +assert scaled8 == 8 def _kmers_from_all_coding_frames(sequence, ksize): @@ -473,20 +476,20 @@ def test_size_limit(track_abundance): def test_scaled(track_abundance): - # test behavior with scaled - scaled = _get_scaled_for_max_hash(35) + # test behavior with scaled: cannot add hashes above max hash. + scaled = 2**3 print("XX", scaled, _get_max_hash_for_scaled(scaled)) mh = MinHash(0, 4, track_abundance=track_abundance, scaled=scaled) - assert mh._max_hash == 35 + assert mh._max_hash == 2**61 # 2**64 / 2**3 mh.add_hash(10) mh.add_hash(20) mh.add_hash(30) assert list(sorted(mh.hashes)) == [10, 20, 30] - mh.add_hash(40) + mh.add_hash(2**62) assert list(sorted(mh.hashes)) == [10, 20, 30] - mh.add_hash(36) + mh.add_hash(2**63) assert list(sorted(mh.hashes)) == [10, 20, 30] @@ -541,8 +544,8 @@ def test_scaled_num_both(track_abundance): def test_mh_jaccard_similarity(): # check actual Jaccard value for a non-trivial case - a = MinHash(0, 20, scaled=scaled50, track_abundance=False) - b = MinHash(0, 20, scaled=scaled50, track_abundance=False) + a = MinHash(0, 20, scaled=scaled2, track_abundance=False) + b = MinHash(0, 20, scaled=scaled2, track_abundance=False) a.add_many([1, 3, 5, 8]) b.add_many([1, 3, 5, 6, 8, 10]) @@ -553,12 +556,12 @@ def test_mh_similarity_downsample_jaccard_value(): # check jaccard value after downsampling # max_hash = 50 - a = MinHash(0, 20, scaled=scaled50, track_abundance=False) + a = MinHash(0, 20, scaled=scaled2, track_abundance=False) # max_hash = 100 - b = MinHash(0, 20, scaled=scaled100, track_abundance=False) + b = MinHash(0, 20, scaled=scaled8, track_abundance=False) - a.add_many([1, 3, 5, 8, 70]) - b.add_many([1, 3, 5, 6, 8, 10, 70]) + a.add_many([1, 3, 5, 8, 2**62]) + b.add_many([1, 3, 5, 6, 8, 10, 2**62]) # the hash=70 will be truncated by downsampling assert a.similarity(b, downsample=True) == 4.0 / 6.0 @@ -569,8 +572,8 @@ def test_mh_angular_similarity(): # https://www.sciencedirect.com/topics/computer-science/cosine-similarity # note: angular similarity is 1 - 2*(acos(sim) / pi), when elements # are always positive (https://en.wikipedia.org/wiki/Cosine_similarity) - a = MinHash(0, 20, scaled=scaled50, track_abundance=True) - b = MinHash(0, 20, scaled=scaled50, track_abundance=True) + a = MinHash(0, 20, scaled=scaled2, track_abundance=True) + b = MinHash(0, 20, scaled=scaled2, track_abundance=True) a.set_abundances({1: 5, 3: 3, 5: 2, 8: 2}) b.set_abundances({1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1}) @@ -583,8 +586,8 @@ def test_mh_angular_similarity(): def test_mh_angular_similarity_2(): # check actual angular similarity for a second non-trivial case - a = MinHash(0, 20, scaled=scaled100, track_abundance=True) - b = MinHash(0, 20, scaled=scaled100, track_abundance=True) + a = MinHash(0, 20, scaled=scaled2, track_abundance=True) + b = MinHash(0, 20, scaled=scaled2, track_abundance=True) a.set_abundances({1: 5, 3: 3, 5: 2, 8: 2, 70: 70}) b.set_abundances({1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1, 70: 70}) @@ -598,12 +601,12 @@ def test_mh_similarity_downsample_angular_value(): # test downsample=True argument to MinHash.similarity # max_hash = 50 - a = MinHash(0, 20, scaled=scaled50, track_abundance=True) + a = MinHash(0, 20, scaled=scaled2, track_abundance=True) # max_hash = 100 - b = MinHash(0, 20, scaled=scaled100, track_abundance=True) + b = MinHash(0, 20, scaled=scaled8, track_abundance=True) - a.set_abundances({1: 5, 3: 3, 5: 2, 8: 2, 70: 70}) - b.set_abundances({1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1, 70: 70}) + a.set_abundances({1: 5, 3: 3, 5: 2, 8: 2, 2**62: 70}) + b.set_abundances({1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1, 2**62: 70}) # the hash=70 will be truncated by downsampling sim = a.similarity(b, downsample=True) @@ -617,8 +620,8 @@ def test_mh_similarity_downsample_angular_value(): def test_mh_angular_similarity_fail(): # raise TypeError if calling angular_similarity directly and # one or both sketches do not have abundance info - a = MinHash(0, 20, scaled=scaled50, track_abundance=True) - b = MinHash(0, 20, scaled=scaled50, track_abundance=False) + a = MinHash(0, 20, scaled=scaled2, track_abundance=True) + b = MinHash(0, 20, scaled=scaled2, track_abundance=False) a_values = {1: 5, 3: 3, 5: 2, 8: 2} b_values = {1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1} @@ -634,7 +637,7 @@ def test_mh_angular_similarity_fail(): in str(exc) ) # both sketches lack track abundance - a = MinHash(0, 20, scaled=scaled50, track_abundance=False) + a = MinHash(0, 20, scaled=scaled2, track_abundance=False) a.add_many(a_values.keys()) with pytest.raises(TypeError) as exc: a.angular_similarity(b) @@ -649,9 +652,9 @@ def test_mh_similarity_downsample_true(track_abundance): # verify sim(a, b) == sim(b, a), with and without ignore_abundance # max_hash = 50 - a = MinHash(0, 20, scaled=scaled50, track_abundance=track_abundance) + a = MinHash(0, 20, scaled=scaled2, track_abundance=track_abundance) # max_hash = 100 - b = MinHash(0, 20, scaled=scaled100, track_abundance=track_abundance) + b = MinHash(0, 20, scaled=scaled8, track_abundance=track_abundance) a_values = {1: 5, 3: 3, 5: 2, 8: 2} b_values = {1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1} @@ -677,9 +680,9 @@ def test_mh_similarity_downsample_errors(track_abundance): # test downsample=False (default) argument to MinHash.similarity # max_hash = 50 - a = MinHash(0, 20, scaled=scaled50, track_abundance=track_abundance) + a = MinHash(0, 20, scaled=scaled2, track_abundance=track_abundance) # max_hash = 100 - b = MinHash(0, 20, scaled=scaled100, track_abundance=track_abundance) + b = MinHash(0, 20, scaled=scaled8, track_abundance=track_abundance) a_values = {1: 5, 3: 3, 5: 2, 8: 2} b_values = {1: 3, 3: 2, 5: 1, 6: 1, 8: 1, 10: 1} @@ -869,14 +872,14 @@ def test_mh_count_common_diff_maxhash(track_abundance): 5, is_protein=False, track_abundance=track_abundance, - scaled=_get_scaled_for_max_hash(1), + scaled=scaled8, ) b = MinHash( 0, 5, is_protein=True, track_abundance=track_abundance, - scaled=_get_scaled_for_max_hash(2), + scaled=scaled4, ) with pytest.raises(ValueError): @@ -1181,13 +1184,9 @@ def test_mh_similarity_diff_seed(track_abundance): def test_mh_compare_diff_max_hash(track_abundance): - a = MinHash( - 0, 5, track_abundance=track_abundance, scaled=_get_max_hash_for_scaled(5) - ) + a = MinHash(0, 5, track_abundance=track_abundance, scaled=scaled2) - b = MinHash( - 0, 5, track_abundance=track_abundance, scaled=_get_max_hash_for_scaled(10) - ) + b = MinHash(0, 5, track_abundance=track_abundance, scaled=scaled4) with pytest.raises(ValueError): a.similarity(b) @@ -1210,12 +1209,8 @@ def test_mh_concat_diff_ksize(track_abundance): def test_mh_concat_diff_max_hash(track_abundance): - a = MinHash( - 0, 5, track_abundance=track_abundance, scaled=_get_max_hash_for_scaled(5) - ) - b = MinHash( - 0, 5, track_abundance=track_abundance, scaled=_get_max_hash_for_scaled(10) - ) + a = MinHash(0, 5, track_abundance=track_abundance, scaled=scaled8) + b = MinHash(0, 5, track_abundance=track_abundance, scaled=scaled4) with pytest.raises(ValueError): a += b @@ -1514,16 +1509,14 @@ def test_mh_copy_and_clear(track_abundance): def test_mh_copy_and_clear_with_max_hash(track_abundance): # test basic creation of new, empty MinHash w/max_hash param set - a = MinHash( - 0, 10, track_abundance=track_abundance, scaled=_get_scaled_for_max_hash(20) - ) - for i in range(0, 40, 2): - a.add_hash(i) + a = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled8) + for i in range(44, 64): + a.add_hash(2**i) b = a.copy_and_clear() assert a.ksize == b.ksize assert b.num == a.num - assert b._max_hash == 20 + assert b._max_hash == 2**61 assert not b.is_protein assert b.track_abundance == track_abundance assert b.seed == a.seed @@ -1545,21 +1538,21 @@ def test_pickle_protein(track_abundance): 10, track_abundance=track_abundance, is_protein=True, - scaled=_get_scaled_for_max_hash(20), + scaled=scaled8, ) - for i in range(0, 40, 2): - a.add_hash(i) + for i in range(44, 64): + a.add_hash(2**i) b = pickle.loads(pickle.dumps(a)) assert a.ksize == b.ksize assert b.num == a.num assert b._max_hash == a._max_hash - assert b._max_hash == 20 + assert b._max_hash == 2**61 assert b.is_protein assert b.track_abundance == track_abundance assert b.seed == a.seed assert len(b.hashes) == len(a.hashes) - assert len(b.hashes) == 11 + assert len(b.hashes) == 18 assert a.scaled == b.scaled assert b.scaled != 0 @@ -1571,21 +1564,21 @@ def test_pickle_dayhoff(track_abundance): 10, track_abundance=track_abundance, dayhoff=True, - scaled=_get_scaled_for_max_hash(20), + scaled=scaled8, ) - for i in range(0, 40, 2): - a.add_hash(i) + for i in range(44, 64): + a.add_hash(2**i) b = pickle.loads(pickle.dumps(a)) assert a.ksize == b.ksize assert b.num == a.num assert b._max_hash == a._max_hash - assert b._max_hash == 20 + assert b._max_hash == 2**61 assert b.dayhoff assert b.track_abundance == track_abundance assert b.seed == a.seed assert len(b.hashes) == len(a.hashes) - assert len(b.hashes) == 11 + assert len(b.hashes) == 18 assert a.scaled == b.scaled assert b.scaled != 0 @@ -1597,61 +1590,59 @@ def test_pickle_hp(track_abundance): 10, track_abundance=track_abundance, hp=True, - scaled=_get_scaled_for_max_hash(20), + scaled=scaled8, ) - for i in range(0, 40, 2): - a.add_hash(i) + for i in range(44, 64): + a.add_hash(2**i) b = pickle.loads(pickle.dumps(a)) assert a.ksize == b.ksize assert b.num == a.num assert b._max_hash == a._max_hash - assert b._max_hash == 20 + assert b._max_hash == 2**61 assert b.hp assert b.track_abundance == track_abundance assert b.seed == a.seed assert len(b.hashes) == len(a.hashes) - assert len(b.hashes) == 11 + assert len(b.hashes) == 18 assert a.scaled == b.scaled assert b.scaled != 0 def test_pickle_max_hash(track_abundance): - a = MinHash( - 0, 10, track_abundance=track_abundance, scaled=_get_scaled_for_max_hash(20) - ) - for i in range(0, 40, 2): - a.add_hash(i) + a = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled8) + for i in range(44, 64): + a.add_hash(2**i) b = pickle.loads(pickle.dumps(a)) assert a.ksize == b.ksize assert b.num == a.num assert b._max_hash == a._max_hash - assert b._max_hash == 20 + assert b._max_hash == 2**61 assert not b.is_protein assert b.track_abundance == track_abundance assert b.seed == a.seed assert len(b.hashes) == len(a.hashes) - assert len(b.hashes) == 11 + assert len(b.hashes) == 18 assert a.scaled == b.scaled assert b.scaled != 0 def test_pickle_scaled(track_abundance): - a = MinHash(0, 10, track_abundance=track_abundance, scaled=922337203685477632) - for i in range(0, 40, 2): - a.add_hash(i) + a = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled8) + for i in range(44, 64): + a.add_hash(2**i) b = pickle.loads(pickle.dumps(a)) assert a.ksize == b.ksize assert b.num == a.num assert b._max_hash == a._max_hash - assert b._max_hash == 20 + assert b._max_hash == 2**61 assert not b.is_protein assert b.track_abundance == track_abundance assert b.seed == a.seed assert len(b.hashes) == len(a.hashes) - assert len(b.hashes) == 11 + assert len(b.hashes) == 18 assert a.scaled == b.scaled assert b.scaled != 0 @@ -1661,7 +1652,7 @@ def test_minhash_abund_add(): # std::vector iterators upon vector resizing - in this case, there # was also a bug in inserting into the middle of mins when scaled was set. - a = MinHash(0, 10, track_abundance=True, scaled=scaled5000) + a = MinHash(0, 10, track_abundance=True, scaled=scaled8) n = 0 for i in range(10, 0, -1): @@ -1677,7 +1668,7 @@ def test_minhash_abund_capacity_increase(): # this should set capacity to 1000 - see KmerMinHash constructor call # to 'reserve' when n > 0 for specific parameter. - a = MinHash(0, 10, track_abundance=True, scaled=scaled5000) + a = MinHash(0, 10, track_abundance=True, scaled=scaled8) # 1001 is dependent on the value passed to reserve (currently 1000). for i in range(1001, 0, -1): @@ -1689,8 +1680,8 @@ def test_minhash_abund_merge_flat(): # of a signature with abundance and a signature without abundance. # the correct behavior for now is to calculate simple Jaccard, # i.e. 'flatten' both of them. - a = MinHash(0, 10, track_abundance=True, scaled=scaled5000) - b = MinHash(0, 10, scaled=scaled5000) + a = MinHash(0, 10, track_abundance=True, scaled=scaled8) + b = MinHash(0, 10, scaled=scaled8) for i in range(0, 10, 2): a.add_hash(i) @@ -1707,8 +1698,8 @@ def test_minhash_abund_merge_flat_2(): # this targets a segfault caused by trying to merge # a signature with abundance and a signature without abundance. - a = MinHash(0, 10, track_abundance=True, scaled=scaled5000) - b = MinHash(0, 10, scaled=scaled5000) + a = MinHash(0, 10, track_abundance=True, scaled=scaled2) + b = MinHash(0, 10, scaled=scaled2) for i in range(0, 10, 2): a.add_hash(i) @@ -1746,7 +1737,7 @@ def test_distance_matrix(track_abundance): def test_remove_many(track_abundance): - a = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled5000) + a = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled2) a.add_many(list(range(0, 100, 2))) @@ -1766,13 +1757,13 @@ def test_remove_many(track_abundance): def test_remove_minhash(track_abundance): - original_mh = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled5000) - added_mh = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled5000) - tested_mh = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled5000) + original_mh = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled8) + added_mh = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled8) + tested_mh = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled8) - original_mh.add_many(list(range(101))) - added_mh.add_many(list(range(101, 201))) # contains original in it - tested_mh.add_many(list(range(201))) # original + added + original_mh.add_many([2**i for i in range(62)]) + added_mh.add_many([2**63]) # contains original in it + tested_mh.add_many([2**i for i in range(63)]) # original + added # Now we should expect tested_minhash == original_minhash # Note we are passing a MinHash object instead of an iterable object @@ -1789,8 +1780,8 @@ def test_remove_minhash(track_abundance): def test_add_many(track_abundance): - a = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled5000) - b = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled5000) + a = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled8) + b = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled8) a.add_many(list(range(0, 100, 2))) a.add_many(list(range(0, 100, 2))) # => abundance = 2 @@ -1807,18 +1798,18 @@ def test_add_many(track_abundance): def test_set_abundances_huge(): - max_hash = 4000000 + max_hash = _get_max_hash_for_scaled(2**31) a = MinHash(0, 10, track_abundance=True, scaled=_get_scaled_for_max_hash(max_hash)) - hashes = list(range(max_hash)) + hashes = [2**i for i in range(64)] abundances = itertools.repeat(2) a.set_abundances(dict(zip(hashes, abundances))) def test_try_change_hashes(track_abundance): - a = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled5000) - MinHash(0, 10, track_abundance=track_abundance, scaled=scaled5000) + a = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled8) + MinHash(0, 10, track_abundance=track_abundance, scaled=scaled8) a.add_many(list(range(0, 100, 2))) @@ -1829,9 +1820,9 @@ def test_try_change_hashes(track_abundance): def test_flatten(): # test behavior with scaled - scaled = _get_scaled_for_max_hash(35) + scaled = scaled8 mh = MinHash(0, 4, track_abundance=True, scaled=scaled) - assert mh._max_hash == 35 + assert mh._max_hash == 2**61 mh.add_hash(10) mh.add_hash(10) @@ -1856,10 +1847,10 @@ def test_flatten(): def test_inflate(): # test behavior of inflate function - scaled = _get_scaled_for_max_hash(35) + scaled = scaled2 mh = MinHash(0, 4, track_abundance=False, scaled=scaled) mh2 = MinHash(0, 4, track_abundance=True, scaled=scaled) - assert mh._max_hash == 35 + assert mh._max_hash == 2**63 mh.add_hash(10) mh.add_hash(20) @@ -1891,10 +1882,10 @@ def test_inflate(): def test_inflate_error(): # test behavior of inflate function with 'self' as an abund sketch - scaled = _get_scaled_for_max_hash(35) + scaled = scaled2 mh = MinHash(0, 4, track_abundance=True, scaled=scaled) mh2 = MinHash(0, 4, track_abundance=True, scaled=scaled) - assert mh._max_hash == 35 + assert mh._max_hash == 2**63 mh.add_hash(10) mh.add_hash(20) @@ -1928,10 +1919,10 @@ def test_inflate_error(): def test_inflate_not_a_subset(): # test behavior of inflate function when 'from_mh' is not a subset. - scaled = _get_scaled_for_max_hash(35) + scaled = scaled2 mh = MinHash(0, 4, track_abundance=False, scaled=scaled) mh2 = MinHash(0, 4, track_abundance=True, scaled=scaled) - assert mh._max_hash == 35 + assert mh._max_hash == 2**63 mh.add_hash(10) mh.add_hash(20) diff --git a/tests/test_signature.py b/tests/test_signature.py index 6365b58856..0dd412c73e 100644 --- a/tests/test_signature.py +++ b/tests/test_signature.py @@ -191,7 +191,7 @@ def test_roundtrip_empty(track_abundance): def test_roundtrip_scaled(track_abundance): - e = MinHash(n=0, ksize=20, track_abundance=track_abundance, max_hash=10) + e = MinHash(n=0, ksize=20, track_abundance=track_abundance, max_hash=2**61) e.add_hash(5) sig = SourmashSignature(e) s = save_signatures_to_json([sig]) @@ -222,14 +222,14 @@ def test_roundtrip_seed(track_abundance): def test_similarity_downsample(track_abundance): e = MinHash(n=0, ksize=20, track_abundance=track_abundance, max_hash=2**63) - f = MinHash(n=0, ksize=20, track_abundance=track_abundance, max_hash=2**2) + f = MinHash(n=0, ksize=20, track_abundance=track_abundance, max_hash=2**61) e.add_hash(1) - e.add_hash(5) + e.add_hash(2**62) assert len(e.hashes) == 2 f.add_hash(1) - f.add_hash(5) # should be discarded due to max_hash + f.add_hash(2**62) # should be discarded due to max_hash assert len(f.hashes) == 1 ee = SourmashSignature(e)