From 97e7808fbdc1b9f993beb66a1b141c68ddafed32 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Wed, 13 Nov 2024 10:22:19 -0800 Subject: [PATCH] MRG: change `sig_from_record` to use scaled from `Record` to downsample (#3387) Fixes https://github.com/sourmash-bio/sourmash/issues/3384. This PR changes `Manifest::select` so that if `scaled` is set in the selection, all matching `Record`s have their scaled value updated. It also updates `Selection::from_record` to set `scaled` to match the `Record` scaled value. In turn, this allows `Collection::sig_from_record` to respect the specified `scaled` value when loading `Signature`. --------- Co-authored-by: Luiz Irber --- src/core/Cargo.toml | 2 +- src/core/src/collection.rs | 11 +++---- src/core/src/manifest.rs | 63 ++++++++++++++++---------------------- src/core/src/selection.rs | 2 +- 4 files changed, 33 insertions(+), 45 deletions(-) diff --git a/src/core/Cargo.toml b/src/core/Cargo.toml index bb309900c7..fc00b075ab 100644 --- a/src/core/Cargo.toml +++ b/src/core/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "sourmash" version = "0.17.1" -authors = ["Luiz Irber ", "N. Tessa Pierce-Ward "] +authors = ["Luiz Irber ", "N. Tessa Pierce-Ward ", "C. Titus Brown "] description = "tools for comparing biological sequences with k-mer sketches" repository = "https://github.com/sourmash-bio/sourmash" keywords = ["minhash", "bioinformatics"] diff --git a/src/core/src/collection.rs b/src/core/src/collection.rs index 28659df05f..fcfbabbebb 100644 --- a/src/core/src/collection.rs +++ b/src/core/src/collection.rs @@ -409,8 +409,8 @@ mod test { // no sigs should remain assert_eq!(cl.len(), 6); for (_idx, rec) in cl.iter() { - // need to pass select again here so we actually downsample - let this_sig = cl.sig_from_record(rec).unwrap().select(&selection).unwrap(); + dbg!("record scaled is: {}", rec.scaled()); + let this_sig = cl.sig_from_record(rec).unwrap(); let this_mh = this_sig.minhash().unwrap(); assert_eq!(this_mh.scaled(), 2000); } @@ -440,7 +440,7 @@ mod test { filename.push("../../tests/test-data/prot/hp.zip"); // create Selection object let mut selection = Selection::default(); - selection.set_scaled(100); + selection.set_scaled(200); selection.set_moltype(HashFunctions::Murmur64Hp); // load sigs into collection + select compatible signatures let cl = Collection::from_zipfile(&filename) @@ -450,10 +450,9 @@ mod test { // count collection length assert_eq!(cl.len(), 2); for (idx, _rec) in cl.iter() { - // need to pass select again here so we actually downsample - let this_sig = cl.sig_for_dataset(idx).unwrap().select(&selection).unwrap(); + let this_sig = cl.sig_for_dataset(idx).unwrap(); let this_mh = this_sig.minhash().unwrap(); - assert_eq!(this_mh.scaled(), 100); + assert_eq!(this_mh.scaled(), 200); } } diff --git a/src/core/src/manifest.rs b/src/core/src/manifest.rs index a23776dfdc..21f8ecbc5d 100644 --- a/src/core/src/manifest.rs +++ b/src/core/src/manifest.rs @@ -259,8 +259,13 @@ impl Manifest { } impl Select for Manifest { + // select only records that satisfy selection conditions; also update + // scaled value to match. fn select(self, selection: &Selection) -> Result { - let rows = self.records.iter().filter(|row| { + let Manifest { mut records } = self; + + // TODO: with num as well? + records.retain_mut(|row| { let mut valid = true; valid = if let Some(ksize) = selection.ksize() { row.ksize == ksize @@ -279,7 +284,12 @@ impl Select for Manifest { }; valid = if let Some(scaled) = selection.scaled() { // num sigs have row.scaled = 0, don't include them - valid && row.scaled != 0 && row.scaled <= scaled + let v = valid && row.scaled != 0 && row.scaled <= scaled; + // if scaled is set, update! + if v { + row.scaled = scaled + }; + v } else { valid }; @@ -291,41 +301,7 @@ impl Select for Manifest { valid }); - Ok(Manifest { - records: rows.cloned().collect(), - }) - - /* - matching_rows = self.rows - if ksize: - matching_rows = ( row for row in matching_rows - if row['ksize'] == ksize ) - if moltype: - matching_rows = ( row for row in matching_rows - if row['moltype'] == moltype ) - if scaled or containment: - if containment and not scaled: - raise ValueError("'containment' requires 'scaled' in Index.select'") - - matching_rows = ( row for row in matching_rows - if row['scaled'] and not row['num'] ) - if num: - matching_rows = ( row for row in matching_rows - if row['num'] and not row['scaled'] ) - - if abund: - # only need to concern ourselves if abundance is _required_ - matching_rows = ( row for row in matching_rows - if row['with_abundance'] ) - - if picklist: - matching_rows = ( row for row in matching_rows - if picklist.matches_manifest_row(row) ) - - # return only the internal filenames! - for row in matching_rows: - yield row - */ + Ok(Manifest { records }) } } @@ -567,6 +543,19 @@ mod test { selection.set_scaled(100); let scaled100 = manifest.select(&selection).unwrap(); assert_eq!(scaled100.len(), 6); + + // check that 'scaled' is updated + let manifest = collection.manifest().clone(); + selection = Selection::default(); + selection.set_scaled(400); + let scaled400 = manifest.select(&selection).unwrap(); + assert_eq!(scaled400.len(), 6); + let max_scaled = scaled400 + .iter() + .map(|r| r.scaled()) + .max() + .expect("no records?!"); + assert_eq!(*max_scaled, 400); } #[test] diff --git a/src/core/src/selection.rs b/src/core/src/selection.rs index 3c94c7f0c7..59f83c1146 100644 --- a/src/core/src/selection.rs +++ b/src/core/src/selection.rs @@ -125,7 +125,7 @@ impl Selection { abund: Some(row.with_abundance()), moltype: Some(row.moltype()), num: None, - scaled: None, + scaled: Some(*row.scaled()), containment: None, picklist: None, })