diff --git a/include/sourmash.h b/include/sourmash.h index 8e80d22834..d95dd07888 100644 --- a/include/sourmash.h +++ b/include/sourmash.h @@ -13,6 +13,8 @@ enum HashFunctions { HASH_FUNCTIONS_MURMUR64_PROTEIN = 2, HASH_FUNCTIONS_MURMUR64_DAYHOFF = 3, HASH_FUNCTIONS_MURMUR64_HP = 4, + HASH_FUNCTIONS_MURMUR64_SKIPM1N3 = 5, + HASH_FUNCTIONS_MURMUR64_SKIPM2N3 = 6, }; typedef uint32_t HashFunctions; @@ -274,6 +276,10 @@ double kmerminhash_similarity(const SourmashKmerMinHash *ptr, bool ignore_abundance, bool downsample); +bool kmerminhash_skipm1n3(const SourmashKmerMinHash *ptr); + +bool kmerminhash_skipm2n3(const SourmashKmerMinHash *ptr); + void kmerminhash_slice_free(uint64_t *ptr, uintptr_t insize); bool kmerminhash_track_abundance(const SourmashKmerMinHash *ptr); diff --git a/src/core/src/ffi/minhash.rs b/src/core/src/ffi/minhash.rs index baad446a0f..bb95242aa6 100644 --- a/src/core/src/ffi/minhash.rs +++ b/src/core/src/ffi/minhash.rs @@ -331,6 +331,18 @@ pub unsafe extern "C" fn kmerminhash_hp(ptr: *const SourmashKmerMinHash) -> bool mh.hp() } +#[no_mangle] +pub unsafe extern "C" fn kmerminhash_skipm1n3(ptr: *const SourmashKmerMinHash) -> bool { + let mh = SourmashKmerMinHash::as_rust(ptr); + mh.skipm1n3() +} + +#[no_mangle] +pub unsafe extern "C" fn kmerminhash_skipm2n3(ptr: *const SourmashKmerMinHash) -> bool { + let mh = SourmashKmerMinHash::as_rust(ptr); + mh.skipm2n3() +} + #[no_mangle] pub unsafe extern "C" fn kmerminhash_seed(ptr: *const SourmashKmerMinHash) -> u64 { let mh = SourmashKmerMinHash::as_rust(ptr); diff --git a/src/core/src/ffi/mod.rs b/src/core/src/ffi/mod.rs index 6f1dff78e4..dc8fb14c76 100644 --- a/src/core/src/ffi/mod.rs +++ b/src/core/src/ffi/mod.rs @@ -36,18 +36,23 @@ pub enum HashFunctions { Murmur64Protein = 2, Murmur64Dayhoff = 3, Murmur64Hp = 4, + Murmur64Skipm1n3 = 5, + Murmur64Skipm2n3 = 6, } impl From for crate::encodings::HashFunctions { fn from(v: HashFunctions) -> crate::encodings::HashFunctions { use crate::encodings::HashFunctions::{ - Murmur64Dayhoff, Murmur64Dna, Murmur64Hp, Murmur64Protein, + Murmur64Dayhoff, Murmur64Dna, Murmur64Hp, Murmur64Protein, Murmur64Skipm1n3, + Murmur64Skipm2n3, }; match v { HashFunctions::Murmur64Dna => Murmur64Dna, HashFunctions::Murmur64Protein => Murmur64Protein, HashFunctions::Murmur64Dayhoff => Murmur64Dayhoff, HashFunctions::Murmur64Hp => Murmur64Hp, + HashFunctions::Murmur64Skipm1n3 => Murmur64Skipm1n3, + HashFunctions::Murmur64Skipm2n3 => Murmur64Skipm2n3, } } } @@ -55,13 +60,16 @@ impl From for crate::encodings::HashFunctions { impl From for HashFunctions { fn from(v: crate::encodings::HashFunctions) -> HashFunctions { use crate::encodings::HashFunctions::{ - Murmur64Dayhoff, Murmur64Dna, Murmur64Hp, Murmur64Protein, + Murmur64Dayhoff, Murmur64Dna, Murmur64Hp, Murmur64Protein, Murmur64Skipm1n3, + Murmur64Skipm2n3, }; match v { Murmur64Dna => HashFunctions::Murmur64Dna, Murmur64Protein => HashFunctions::Murmur64Protein, Murmur64Dayhoff => HashFunctions::Murmur64Dayhoff, Murmur64Hp => HashFunctions::Murmur64Hp, + Murmur64Skipm1n3 => HashFunctions::Murmur64Skipm1n3, + Murmur64Skipm2n3 => HashFunctions::Murmur64Skipm2n3, _ => todo!("Not supported, probably custom"), } } diff --git a/src/core/src/sketch/minhash.rs b/src/core/src/sketch/minhash.rs index 43f8b602ea..d64efc63dc 100644 --- a/src/core/src/sketch/minhash.rs +++ b/src/core/src/sketch/minhash.rs @@ -723,6 +723,14 @@ impl KmerMinHash { self.hash_function == HashFunctions::Murmur64Hp } + pub fn skipm1n3(&self) -> bool { + self.hash_function == HashFunctions::Murmur64Skipm1n3 + } + + pub fn skipm2n3(&self) -> bool { + self.hash_function == HashFunctions::Murmur64Skipm2n3 + } + pub fn mins(&self) -> Vec { self.mins.clone() } diff --git a/src/sourmash/cli/utils.py b/src/sourmash/cli/utils.py index 26da5ead5f..467ed307fb 100644 --- a/src/sourmash/cli/utils.py +++ b/src/sourmash/cli/utils.py @@ -50,6 +50,39 @@ def add_moltype_args(parser): ) parser.set_defaults(hp=False) + parser.add_argument( + "--skipm1n3", + "--skipmer-m1n3", + dest="skipm1n3", + action="store_true", + help="choose skipmer (m1n3) signatures", + ) + + parser.add_argument( + "--no-skipm1n3", + "--no-skipmer-m1n3", + dest="skipm1n3", + action="store_false", + help="do not choose skipmer (m1n3) signatures", + ) + parser.set_defaults(skipm1n3=False) + + parser.add_argument( + "--skipm2n3", + "--skipmer-m2n3", + dest="skipm2n3", + action="store_true", + help="choose skipmer (m2n3) signatures", + ) + parser.add_argument( + "--no-skipm2n3", + "--no-skipmer-m2n3", + dest="skipm2n3", + action="store_false", + help="do not choose skipmer (m2n3) signatures", + ) + parser.set_defaults(skipm2n3=False) + parser.add_argument( "--dna", "--rna", diff --git a/src/sourmash/index/__init__.py b/src/sourmash/index/__init__.py index a8a638653d..b04952d873 100644 --- a/src/sourmash/index/__init__.py +++ b/src/sourmash/index/__init__.py @@ -1242,7 +1242,7 @@ def _check_select_parameters(**kw): moltype = kw.get("moltype") if moltype is not None: - if moltype not in ["DNA", "protein", "dayhoff", "hp"]: + if moltype not in ["DNA", "protein", "dayhoff", "hp", "skipm1n3", "skipm2n3"]: raise ValueError(f"unknown moltype: {moltype}") scaled = kw.get("scaled") diff --git a/src/sourmash/minhash.py b/src/sourmash/minhash.py index 90d8244e2d..c036fa6c3d 100644 --- a/src/sourmash/minhash.py +++ b/src/sourmash/minhash.py @@ -196,6 +196,8 @@ def __init__( is_protein=False, dayhoff=False, hp=False, + skipm1n3=False, + skipm2n3=False, track_abundance=False, seed=MINHASH_DEFAULT_SEED, max_hash=0, @@ -215,6 +217,8 @@ def __init__( * is_protein (default False) - aa k-mers * dayhoff (default False) - dayhoff encoding * hp (default False) - hydrophilic/hydrophobic aa + * skipm1n3 (default False) - skipmer (m1n3) + * skipm2n3 (default False) - skipmer (m2n3) * track_abundance (default False) - track hash multiplicity * mins (default None) - list of hashvals, or (hashval, abund) pairs * seed (default 42) - murmurhash seed @@ -243,6 +247,10 @@ def __init__( elif is_protein: hash_function = lib.HASH_FUNCTIONS_MURMUR64_PROTEIN ksize = ksize * 3 + elif skipm1n3: + hash_function = lib.HASH_FUNCTIONS_MURMUR64_SKIPM1N3 + elif skipm2n3: + hash_function = lib.HASH_FUNCTIONS_MURMUR64_SKIPM2N3 else: hash_function = lib.HASH_FUNCTIONS_MURMUR64_DNA @@ -264,6 +272,8 @@ def __copy__(self): is_protein=self.is_protein, dayhoff=self.dayhoff, hp=self.hp, + skipm1n3=self.skipm1n3, + skipm2n3=self.skipm2n3, track_abundance=self.track_abundance, seed=self.seed, max_hash=self._max_hash, @@ -279,12 +289,20 @@ def __getstate__(self): # note: we multiple ksize by 3 here so that # pickle protocols that bypass __setstate__ # get a ksize that makes sense to the Rust layer. See #2262. + # CTB/NTP note: if you add things below, you might want to put + # them at the end, because we use internal indexes in a few places. + # see especially `_set_num_scaled()` in sig/__main__.my. + # My apologies. return ( self.num, - self.ksize if self.is_dna else self.ksize * 3, + self.ksize + if self.is_dna or self.skipm1n3 or self.skipm2n3 + else self.ksize * 3, self.is_protein, self.dayhoff, self.hp, + self.skipm1n3, + self.skipm2n3, self.hashes, None, self.track_abundance, @@ -300,6 +318,8 @@ def __setstate__(self, tup): is_protein, dayhoff, hp, + skipm1n3, + skipm2n3, mins, _, track_abundance, @@ -316,6 +336,10 @@ def __setstate__(self, tup): if hp else lib.HASH_FUNCTIONS_MURMUR64_PROTEIN if is_protein + else lib.HASH_FUNCTIONS_MURMUR64_SKIPM1N3 + if skipm1n3 + else lib.HASH_FUNCTIONS_MURMUR64_SKIPM2N3 + if skipm2n3 else lib.HASH_FUNCTIONS_MURMUR64_DNA ) @@ -340,6 +364,8 @@ def copy_and_clear(self): is_protein=self.is_protein, dayhoff=self.dayhoff, hp=self.hp, + skipm1n3=self.skipm1n3, + skipm2n3=self.skipm2n3, track_abundance=self.track_abundance, seed=self.seed, max_hash=self._max_hash, @@ -461,7 +487,7 @@ def kmers_and_hashes(self, sequence, *, force=False, is_protein=False): def add_kmer(self, kmer): "Add a kmer into the sketch." - if self.is_dna: + if self.is_dna or self.skipm1n3 or self.skipm2n3: if len(kmer) != self.ksize: raise ValueError(f"kmer to add is not {self.ksize} in length") else: @@ -561,7 +587,9 @@ def scaled(self): @property def is_dna(self): - return not (self.is_protein or self.dayhoff or self.hp) + return not ( + self.is_protein or self.dayhoff or self.hp or self.skipm1n3 or self.skipm2n3 + ) @property def is_protein(self): @@ -575,10 +603,18 @@ def dayhoff(self): def hp(self): return self._methodcall(lib.kmerminhash_hp) + @property + def skipm1n3(self): + return self._methodcall(lib.kmerminhash_skipm1n3) + + @property + def skipm2n3(self): + return self._methodcall(lib.kmerminhash_skipm2n3) + @property def ksize(self): k = self._methodcall(lib.kmerminhash_ksize) - if not self.is_dna: + if not self.is_dna and not self.skipm1n3 and not self.skipm2n3: assert k % 3 == 0 k = int(k / 3) return k @@ -1051,6 +1087,10 @@ def moltype(self): # TODO: test in minhash tests return "dayhoff" elif self.hp: return "hp" + elif self.skipm1n3: + return "skipm1n3" + elif self.skipm2n3: + return "skipm2n3" else: return "DNA" @@ -1224,6 +1264,8 @@ def __setstate__(self, tup): is_protein, dayhoff, hp, + skipm1n3, + skipm2n3, mins, _, track_abundance, @@ -1240,6 +1282,10 @@ def __setstate__(self, tup): if hp else lib.HASH_FUNCTIONS_MURMUR64_PROTEIN if is_protein + else lib.HASH_FUNCTIONS_MURMUR64_SKIPM1N3 + if skipm1n3 + else lib.HASH_FUNCTIONS_MURMUR64_SKIPM2N3 + if skipm2n3 else lib.HASH_FUNCTIONS_MURMUR64_DNA ) diff --git a/src/sourmash/sig/__main__.py b/src/sourmash/sig/__main__.py index 0b97c84347..5809270284 100644 --- a/src/sourmash/sig/__main__.py +++ b/src/sourmash/sig/__main__.py @@ -102,7 +102,7 @@ def _set_num_scaled(mh, num, scaled): # Number of hashes is 0th parameter mh_params[0] = num # Scale is 8th parameter - mh_params[8] = _get_max_hash_for_scaled(scaled) + mh_params[10] = _get_max_hash_for_scaled(scaled) mh.__setstate__(mh_params) assert mh.num == num assert mh.scaled == scaled diff --git a/src/sourmash/sourmash_args.py b/src/sourmash/sourmash_args.py index 9f2068160a..20028fa921 100644 --- a/src/sourmash/sourmash_args.py +++ b/src/sourmash/sourmash_args.py @@ -84,7 +84,7 @@ def check_num_bounds(arg): def get_moltype(sig, require=False): mh = sig.minhash - if mh.moltype in ("DNA", "dayhoff", "hp", "protein"): + if mh.moltype in ("DNA", "dayhoff", "hp", "protein", "skipm1n3", "skipm2n3"): moltype = mh.moltype else: raise ValueError(f"unknown molecule type for sig {sig}") @@ -109,9 +109,15 @@ def calculate_moltype(args, default=None): moltype = "protein" n += 1 + if args.skipm1n3: + moltype = "skipm1n3" + + if args.skipm2n3: + moltype = "skipm2n3" + if n > 1: error( - "cannot specify more than one of --dna/--rna/--nucleotide/--protein/--hp/--dayhoff" + "cannot specify more than one of --dna/--rna/--nucleotide/--protein/--hp/--dayhoff/--skipm1n3/--skipm2n3" ) sys.exit(-1) @@ -185,11 +191,13 @@ def load_include_exclude_db_patterns(args): def search_pattern(vals): return any(pattern.search(val) for val in vals) + elif args.exclude_db_pattern: pattern = re.compile(args.exclude_db_pattern, re.IGNORECASE) def search_pattern(vals): return all(not pattern.search(val) for val in vals) + else: search_pattern = None