From 25664b5751b347829fd0e9c3efa96903f62fd3ab Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Sun, 17 Sep 2023 14:24:15 -0700 Subject: [PATCH] Initial support for custom hash function --- .github/workflows/rust.yml | 2 +- src/core/src/encodings.rs | 5 ++++- src/core/src/ffi/mod.rs | 3 ++- src/core/src/from.rs | 6 +++--- src/core/src/selection.rs | 2 +- src/core/src/signature.rs | 10 +++++----- src/core/src/sketch/minhash.rs | 22 +++++++++++----------- src/core/src/wasm.rs | 8 ++++---- 8 files changed, 31 insertions(+), 27 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 855573a106..3b2535f033 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -109,7 +109,7 @@ jobs: run: cargo binstall --no-confirm cargo-tarpaulin - name: Coverage with tarpaulin - run: cargo tarpaulin --all --all-features --timeout 600 --out Xml -- --test-threads 1 + run: cargo tarpaulin --all --all-features --timeout 600 --out xml -- --test-threads 1 - name: Upload Rust coverage to codecov uses: codecov/codecov-action@v3 diff --git a/src/core/src/encodings.rs b/src/core/src/encodings.rs index 752b6d892f..ac69cd58eb 100644 --- a/src/core/src/encodings.rs +++ b/src/core/src/encodings.rs @@ -22,16 +22,18 @@ pub type Idx = u32; type IdxTracker = (vec_collections::VecSet<[Idx; 8]>, u64); type ColorToIdx = HashMap>; -#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[derive(Debug, Clone, PartialEq, Eq)] #[cfg_attr( feature = "rkyv", derive(rkyv::Serialize, rkyv::Deserialize, rkyv::Archive) )] +#[non_exhaustive] pub enum HashFunctions { Murmur64Dna, Murmur64Protein, Murmur64Dayhoff, Murmur64Hp, + Custom(String), } impl HashFunctions { @@ -62,6 +64,7 @@ impl std::fmt::Display for HashFunctions { HashFunctions::Murmur64Protein => "protein", HashFunctions::Murmur64Dayhoff => "dayhoff", HashFunctions::Murmur64Hp => "hp", + HashFunctions::Custom(v) => v, } ) } diff --git a/src/core/src/ffi/mod.rs b/src/core/src/ffi/mod.rs index 6e28c648cf..6f1dff78e4 100644 --- a/src/core/src/ffi/mod.rs +++ b/src/core/src/ffi/mod.rs @@ -1,6 +1,6 @@ //! # Foreign Function Interface for calling sourmash from a C API //! -//! Primary client for now is the Python version, using CFFI and milksnake. +//! Primary client for now is the Python version, using CFFI and maturin. #![allow(clippy::missing_safety_doc)] #[macro_use] @@ -62,6 +62,7 @@ impl From for HashFunctions { Murmur64Protein => HashFunctions::Murmur64Protein, Murmur64Dayhoff => HashFunctions::Murmur64Dayhoff, Murmur64Hp => HashFunctions::Murmur64Hp, + _ => todo!("Not supported, probably custom"), } } } diff --git a/src/core/src/from.rs b/src/core/src/from.rs index 7847714cfe..dbeeb58a2f 100644 --- a/src/core/src/from.rs +++ b/src/core/src/from.rs @@ -17,7 +17,7 @@ impl From for KmerMinHash { let mut new_mh = KmerMinHash::new( 0, values.get(0).unwrap().kmer.len() as u32, - HashFunctions::murmur64_DNA, + HashFunctions::Murmur64Dna, 42, true, values.len() as u32, @@ -51,7 +51,7 @@ mod test { #[test] fn finch_behavior() { - let mut a = KmerMinHash::new(0, 10, HashFunctions::murmur64_DNA, 42, true, 20); + let mut a = KmerMinHash::new(0, 10, HashFunctions::Murmur64Dna, 42, true, 20); let mut b = MashSketcher::new(20, 10, 42); let seq = b"TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA"; @@ -87,7 +87,7 @@ mod test { #[test] fn from_finch() { - let mut a = KmerMinHash::new(0, 10, HashFunctions::murmur64_DNA, 42, true, 20); + let mut a = KmerMinHash::new(0, 10, HashFunctions::Murmur64Dna, 42, true, 20); let mut b = MashSketcher::new(20, 10, 42); let seq = b"TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA"; diff --git a/src/core/src/selection.rs b/src/core/src/selection.rs index 86d42273e0..cfe871663f 100644 --- a/src/core/src/selection.rs +++ b/src/core/src/selection.rs @@ -104,7 +104,7 @@ impl Selection { } pub fn moltype(&self) -> Option { - self.moltype + self.moltype.clone() } pub fn set_moltype(&mut self, value: HashFunctions) { diff --git a/src/core/src/signature.rs b/src/core/src/signature.rs index b521191806..f5cb9a2b4e 100644 --- a/src/core/src/signature.rs +++ b/src/core/src/signature.rs @@ -371,7 +371,7 @@ impl Iterator for SeqToHashes { Some(Ok(hash)) } else { if !self.prot_configured { - self.aa_seq = match self.hash_function { + self.aa_seq = match &self.hash_function { HashFunctions::Murmur64Dayhoff => { self.sequence.iter().cloned().map(aa_to_dayhoff).collect() } @@ -584,9 +584,9 @@ impl Signature { } }; - match moltype { + match &moltype { Some(x) => { - if mh.hash_function() == x { + if mh.hash_function() == *x { return true; } } @@ -600,9 +600,9 @@ impl Signature { } }; - match moltype { + match &moltype { Some(x) => { - if mh.hash_function() == x { + if mh.hash_function() == *x { return true; } } diff --git a/src/core/src/sketch/minhash.rs b/src/core/src/sketch/minhash.rs index 22fe7159c3..36f11a589e 100644 --- a/src/core/src/sketch/minhash.rs +++ b/src/core/src/sketch/minhash.rs @@ -74,7 +74,7 @@ impl Clone for KmerMinHash { KmerMinHash { num: self.num, ksize: self.ksize, - hash_function: self.hash_function, + hash_function: self.hash_function.clone(), seed: self.seed, max_hash: self.max_hash, mins: self.mins.clone(), @@ -579,7 +579,7 @@ impl KmerMinHash { let mut combined_mh = KmerMinHash::new( self.scaled(), self.ksize, - self.hash_function, + self.hash_function.clone(), self.seed, self.abunds.is_some(), self.num, @@ -612,7 +612,7 @@ impl KmerMinHash { let mut combined_mh = KmerMinHash::new( self.scaled(), self.ksize, - self.hash_function, + self.hash_function.clone(), self.seed, self.abunds.is_some(), self.num, @@ -741,7 +741,7 @@ impl KmerMinHash { let mut new_mh = KmerMinHash::new( scaled, self.ksize, - self.hash_function, + self.hash_function.clone(), self.seed, self.abunds.is_some(), self.num, @@ -805,7 +805,7 @@ impl SigsTrait for KmerMinHash { } fn hash_function(&self) -> HashFunctions { - self.hash_function + self.hash_function.clone() } fn add_hash(&mut self, hash: u64) { @@ -979,7 +979,7 @@ impl Clone for KmerMinHashBTree { KmerMinHashBTree { num: self.num, ksize: self.ksize, - hash_function: self.hash_function, + hash_function: self.hash_function.clone(), seed: self.seed, max_hash: self.max_hash, mins: self.mins.clone(), @@ -1372,7 +1372,7 @@ impl KmerMinHashBTree { let mut combined_mh = KmerMinHashBTree::new( self.scaled(), self.ksize, - self.hash_function, + self.hash_function.clone(), self.seed, self.abunds.is_some(), self.num, @@ -1404,7 +1404,7 @@ impl KmerMinHashBTree { let mut combined_mh = KmerMinHashBTree::new( self.scaled(), self.ksize, - self.hash_function, + self.hash_function.clone(), self.seed, self.abunds.is_some(), self.num, @@ -1500,7 +1500,7 @@ impl KmerMinHashBTree { } pub fn hash_function(&self) -> HashFunctions { - self.hash_function + self.hash_function.clone() } pub fn mins(&self) -> Vec { @@ -1524,7 +1524,7 @@ impl KmerMinHashBTree { let mut new_mh = KmerMinHashBTree::new( scaled, self.ksize, - self.hash_function, + self.hash_function.clone(), self.seed, self.abunds.is_some(), self.num, @@ -1574,7 +1574,7 @@ impl SigsTrait for KmerMinHashBTree { } fn hash_function(&self) -> HashFunctions { - self.hash_function + self.hash_function.clone() } fn add_hash(&mut self, hash: u64) { diff --git a/src/core/src/wasm.rs b/src/core/src/wasm.rs index ad656d9955..c2a0eb6c30 100644 --- a/src/core/src/wasm.rs +++ b/src/core/src/wasm.rs @@ -37,13 +37,13 @@ impl KmerMinHash { // TODO: at most one of (prot, dayhoff, hp) should be true let hash_function = if dayhoff { - HashFunctions::murmur64_dayhoff + HashFunctions::Murmur64Dayhoff } else if hp { - HashFunctions::murmur64_hp + HashFunctions::Murmur64Hp } else if is_protein { - HashFunctions::murmur64_protein + HashFunctions::Murmur64Protein } else { - HashFunctions::murmur64_DNA + HashFunctions::Murmur64Dna }; KmerMinHash(_KmerMinHash::new(