diff --git a/src/core/src/collection.rs b/src/core/src/collection.rs index bc03274c41..aa8e33e6e8 100644 --- a/src/core/src/collection.rs +++ b/src/core/src/collection.rs @@ -215,6 +215,10 @@ impl Collection { assert_eq!(sig.signatures.len(), 1); Ok(sig) } + + pub fn intersect_manifest(&mut self, mf: &Manifest) { + self.manifest = self.manifest.intersect_manifest(mf); + } } impl Select for Collection { @@ -233,6 +237,7 @@ mod test { use super::Collection; use crate::encodings::HashFunctions; + use crate::manifest::Manifest; use crate::prelude::Select; use crate::selection::Selection; use crate::signature::Signature; @@ -358,6 +363,32 @@ mod test { assert_eq!(cl.len(), 0); } + #[test] + fn collection_intersect_manifest() { + // load test sigs + let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + // four num=500 sigs + filename.push("../../tests/test-data/genome-s11.fa.gz.sig"); + let file = File::open(filename).unwrap(); + let reader = BufReader::new(file); + let sigs: Vec<Signature> = serde_json::from_reader(reader).expect("Loading error"); + assert_eq!(sigs.len(), 4); + // load sigs into collection + select compatible signatures + let mut cl = Collection::from_sigs(sigs).unwrap(); + // all sigs should remain + assert_eq!(cl.len(), 4); + + // grab first record + let manifest = cl.manifest(); + let record = manifest.iter().next().unwrap().clone(); + let vr = vec![record]; + + // now intersect: + let manifest2 = Manifest::from(vr); + cl.intersect_manifest(&manifest2); + assert_eq!(cl.len(), 1); + } + #[test] fn sigstore_sig_from_record() { // load test sigs diff --git a/src/core/src/manifest.rs b/src/core/src/manifest.rs index 0a84a3e495..2f1eca7e0f 100644 --- a/src/core/src/manifest.rs +++ b/src/core/src/manifest.rs @@ -1,4 +1,6 @@ +use std::collections::HashSet; use std::fs::File; +use std::hash::{Hash, Hasher}; use std::io::{BufRead, BufReader, Read, Write}; use std::ops::Deref; @@ -17,7 +19,7 @@ use crate::Result; /// Individual manifest record, containing 
information about sketches. -#[derive(Debug, Serialize, Deserialize, Clone, CopyGetters, Getters, Setters, PartialEq, Eq)] +#[derive(Debug, Serialize, Deserialize, Clone, CopyGetters, Getters, Setters)] pub struct Record { #[getset(get = "pub", set = "pub")] internal_location: PathBuf, @@ -176,6 +178,37 @@ impl Record { } } +impl PartialEq for Record { + // match everything but internal_location + fn eq(&self, other: &Self) -> bool { + self.md5 == other.md5 + && self.ksize == other.ksize + && self.moltype == other.moltype + && self.scaled == other.scaled + && self.num == other.num + && self.n_hashes == other.n_hashes + && self.with_abundance == other.with_abundance + && self.name == other.name + && self.filename == other.filename + } +} + +impl Eq for Record {} + +impl Hash for Record { + fn hash<H: Hasher>(&self, state: &mut H) { + self.md5.hash(state); + self.ksize.hash(state); + self.moltype.hash(state); + self.scaled.hash(state); + self.num.hash(state); + self.n_hashes.hash(state); + self.with_abundance.hash(state); + self.name.hash(state); + self.filename.hash(state); + } +} + impl Manifest { pub fn from_reader(rdr: R) -> Result { let mut records = vec![]; @@ -209,6 +242,20 @@ impl Manifest { pub fn iter(&self) -> impl Iterator { self.records.iter() } + + pub fn intersect_manifest(&self, other: &Manifest) -> Self { + // collect record references from other mf into a set for lookup: + let pairs: HashSet<_> = other.iter().collect(); + + let records = self + .records + .iter() + .filter(|row| pairs.contains(row)) + .cloned() + .collect(); + + Self { records } + } } impl Select for Manifest { @@ -521,4 +568,60 @@ mod test { let scaled100 = manifest.select(&selection).unwrap(); assert_eq!(scaled100.len(), 6); } + + #[test] + fn manifest_intersect() { + let temp_dir = TempDir::new().unwrap(); + let utf8_output = PathBuf::from_path_buf(temp_dir.path().to_path_buf()) + .expect("Path should be valid UTF-8"); + let filename = utf8_output.join("sig-pathlist.txt"); + // build sig filenames + let base_path = 
PathBuf::from(env!("CARGO_MANIFEST_DIR")); + let test_sigs = vec![ + "../../tests/test-data/47.fa.sig", + "../../tests/test-data/63.fa.sig", + ]; + + let full_paths: Vec<_> = test_sigs + .into_iter() + .map(|sig| base_path.join(sig)) + .collect(); + + // write a file in test directory with a filename on each line + let mut pathfile = File::create(&filename).unwrap(); + for sigfile in &full_paths { + writeln!(pathfile, "{}", sigfile).unwrap(); + } + + // load into manifest + let manifest = Manifest::from(&filename); + assert_eq!(manifest.len(), 2); + + // now do just one sketch - + let test_sigs2 = vec!["../../tests/test-data/63.fa.sig"]; + + let filename2 = utf8_output.join("sig-pathlist-single.txt"); + + let full_paths: Vec<_> = test_sigs2 + .into_iter() + .map(|sig| base_path.join(sig)) + .collect(); + + let mut pathfile2 = File::create(&filename2).unwrap(); + for sigfile in &full_paths { + writeln!(pathfile2, "{}", sigfile).unwrap(); + } + + // load into another manifest + let manifest2 = Manifest::from(&filename2); + assert_eq!(manifest2.len(), 1); + + // intersect single with full => single. + let new_mf = manifest2.intersect_manifest(&manifest); + assert_eq!(new_mf.len(), 1); + + // intersect full with single => single. + let new_mf = manifest.intersect_manifest(&manifest2); + assert_eq!(new_mf.len(), 1); + } }