From 4841d43126054e8d179e2a9de988f7e734c600e6 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Sun, 13 Oct 2024 20:49:53 -0700 Subject: [PATCH 1/2] remove match_ --- src/core/src/index/linear.rs | 2 -- src/core/src/index/mod.rs | 21 +++++++-------------- 2 files changed, 7 insertions(+), 16 deletions(-) diff --git a/src/core/src/index/linear.rs b/src/core/src/index/linear.rs index 489e69c86e..6f833b7fca 100644 --- a/src/core/src/index/linear.rs +++ b/src/core/src/index/linear.rs @@ -183,7 +183,6 @@ impl LinearIndex { let intersect_bp: u64 = match_mh.scaled() as u64 * intersect_orig; let f_unique_to_query = intersect_orig as f64 / query.size() as f64; - let match_ = match_sig; // TODO: all of these let f_unique_weighted = 0.; @@ -217,7 +216,6 @@ impl LinearIndex { filename, name, md5, - match_, f_match_orig, unique_intersect_bp, gather_result_rank, diff --git a/src/core/src/index/mod.rs b/src/core/src/index/mod.rs index 0bd9d9fec8..c894d1c02b 100644 --- a/src/core/src/index/mod.rs +++ b/src/core/src/index/mod.rs @@ -65,9 +65,6 @@ pub struct GatherResult { #[getset(get = "pub")] md5: String, - #[serde(skip)] - match_: SigStore, - #[getset(get_copy = "pub")] f_match_orig: f64, @@ -118,12 +115,6 @@ pub struct GatherResult { max_containment_ani: f64, } -impl GatherResult { - pub fn get_match(&self) -> Signature { - self.match_.clone().into() - } -} - type SigCounter = counter::Counter<Idx>; pub trait Index<'a> { @@ -219,8 +210,11 @@ pub fn calculate_gather_stats( calc_ani_ci: bool, confidence: Option<f64>, ) -> Result<(GatherResult, (Vec<u64>, u64))> { + let match_filename = match_sig.filename(); + let match_name = match_sig.name(); + let match_md5 = match_sig.md5sum(); // get match_mh - let match_mh = match_sig.minhash().expect("cannot retrieve sketch"); + let match_mh: KmerMinHash = match_sig.try_into()?; // it's ok to downsample match, but query is often big and repeated, // so we do not allow downsampling of query in this function. 
@@ -330,10 +324,9 @@ pub fn calculate_gather_stats( .average_abund(average_abund) .median_abund(median_abund) .std_abund(std_abund) - .filename(match_sig.filename()) - .name(match_sig.name()) - .md5(match_sig.md5sum()) - .match_(match_sig) + .filename(match_filename) + .name(match_name) + .md5(match_md5) .f_match_orig(f_match_orig) .unique_intersect_bp(unique_intersect_bp) .gather_result_rank(gather_result_rank) From 06867ac3cfabbabd393b4c9d3c1efa04d92b94da Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Mon, 11 Nov 2024 17:25:00 -0800 Subject: [PATCH 2/2] remove match_ from mem revindex gather, still working on FFI --- src/core/src/ffi/index/revindex.rs | 5 +--- src/core/src/index/revindex/mem_revindex.rs | 31 +++++++++++---------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/src/core/src/ffi/index/revindex.rs b/src/core/src/ffi/index/revindex.rs index ef0e328139..db16e1d40b 100644 --- a/src/core/src/ffi/index/revindex.rs +++ b/src/core/src/ffi/index/revindex.rs @@ -207,16 +207,13 @@ unsafe fn revindex_gather( let threshold: usize = (threshold * (mh.size() as f64)) as _; let counter = revindex.counter_for_query(mh); - dbg!(&counter); let results: Vec<(f64, Signature, String)> = revindex .gather(counter, threshold, mh) .unwrap() // TODO: proper error handling .into_iter() .map(|r| { - let filename = r.filename().to_owned(); - let sig = r.get_match(); - (r.f_match(), sig, filename) + todo!() }) .collect(); diff --git a/src/core/src/index/revindex/mem_revindex.rs b/src/core/src/index/revindex/mem_revindex.rs index 08b7bc56ac..1bd1ba9db7 100644 --- a/src/core/src/index/revindex/mem_revindex.rs +++ b/src/core/src/index/revindex/mem_revindex.rs @@ -208,26 +208,29 @@ impl RevIndex { let mut matches = vec![]; while match_size > threshold && !counter.is_empty() { - let (dataset_id, size) = counter.most_common()[0]; + let (dataset_id, size) = counter.k_most_common_ordered(1)[0]; match_size = if size >= threshold { size } else { break }; let result = self 
.linear .gather_round(dataset_id, match_size, query, matches.len())?; - if let Some(Sketch::MinHash(match_mh)) = - result.match_.select_sketch(self.linear.template()) - { - // Prepare counter for finding the next match by decrementing - // all hashes found in the current match in other datasets - for hash in match_mh.iter_mins() { - if let Some(color) = self.hash_to_color.get(hash) { - counter.subtract(self.colors.indices(color).cloned()); - } + + // handle special case where threshold was set to 0 + if match_size == 0 { + break; + } + + let match_sig = self.linear.collection().sig_for_dataset(dataset_id)?; + let match_mh = match_sig.minhash().unwrap().clone(); + + // Prepare counter for finding the next match by decrementing + // all hashes found in the current match in other datasets + for hash in match_mh.iter_mins() { + if let Some(color) = self.hash_to_color.get(hash) { + counter.subtract(self.colors.indices(color).cloned()); } - counter.remove(&dataset_id); - matches.push(result); - } else { - unimplemented!() } + counter.remove(&dataset_id); + matches.push(result); } Ok(matches) }