Skip to content

Commit

Permalink
MRG: add skipmer capacity to sourmash python layer via ffi (#3446)
Browse files Browse the repository at this point in the history
This PR updates the FFI and python layer to allow the skipmer moltypes
(`skipm1n3`, `skipm2n3`). We will keep this as an undocumented
experimental feature for now. There are no guarantees at the moment that
skipmers will work with all sourmash commands, as there are no explicit
tests in place. There *are* tests for using skipmers with branchwater,
so all skipmer searching is best done in the plugin for now. This PR
enables critical handy utilities, though: `sig cat`, `sig summarize`,
`sig describe`, etc.

Documentation and additional tests should be added prior to
release(#3449).

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: C. Titus Brown <[email protected]>
  • Loading branch information
3 people authored Dec 20, 2024
1 parent 419eb73 commit 150967b
Show file tree
Hide file tree
Showing 9 changed files with 131 additions and 10 deletions.
6 changes: 6 additions & 0 deletions include/sourmash.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ enum HashFunctions {
HASH_FUNCTIONS_MURMUR64_PROTEIN = 2,
HASH_FUNCTIONS_MURMUR64_DAYHOFF = 3,
HASH_FUNCTIONS_MURMUR64_HP = 4,
HASH_FUNCTIONS_MURMUR64_SKIPM1N3 = 5,
HASH_FUNCTIONS_MURMUR64_SKIPM2N3 = 6,
};
typedef uint32_t HashFunctions;

Expand Down Expand Up @@ -274,6 +276,10 @@ double kmerminhash_similarity(const SourmashKmerMinHash *ptr,
bool ignore_abundance,
bool downsample);

bool kmerminhash_skipm1n3(const SourmashKmerMinHash *ptr);

bool kmerminhash_skipm2n3(const SourmashKmerMinHash *ptr);

void kmerminhash_slice_free(uint64_t *ptr, uintptr_t insize);

bool kmerminhash_track_abundance(const SourmashKmerMinHash *ptr);
Expand Down
12 changes: 12 additions & 0 deletions src/core/src/ffi/minhash.rs
Original file line number Diff line number Diff line change
Expand Up @@ -331,6 +331,18 @@ pub unsafe extern "C" fn kmerminhash_hp(ptr: *const SourmashKmerMinHash) -> bool
mh.hp()
}

#[no_mangle]
pub unsafe extern "C" fn kmerminhash_skipm1n3(ptr: *const SourmashKmerMinHash) -> bool {
let mh = SourmashKmerMinHash::as_rust(ptr);
mh.skipm1n3()
}

#[no_mangle]
pub unsafe extern "C" fn kmerminhash_skipm2n3(ptr: *const SourmashKmerMinHash) -> bool {
let mh = SourmashKmerMinHash::as_rust(ptr);
mh.skipm2n3()
}

#[no_mangle]
pub unsafe extern "C" fn kmerminhash_seed(ptr: *const SourmashKmerMinHash) -> u64 {
let mh = SourmashKmerMinHash::as_rust(ptr);
Expand Down
12 changes: 10 additions & 2 deletions src/core/src/ffi/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,32 +36,40 @@ pub enum HashFunctions {
Murmur64Protein = 2,
Murmur64Dayhoff = 3,
Murmur64Hp = 4,
Murmur64Skipm1n3 = 5,
Murmur64Skipm2n3 = 6,
}

impl From<HashFunctions> for crate::encodings::HashFunctions {
fn from(v: HashFunctions) -> crate::encodings::HashFunctions {
use crate::encodings::HashFunctions::{
Murmur64Dayhoff, Murmur64Dna, Murmur64Hp, Murmur64Protein,
Murmur64Dayhoff, Murmur64Dna, Murmur64Hp, Murmur64Protein, Murmur64Skipm1n3,
Murmur64Skipm2n3,
};
match v {
HashFunctions::Murmur64Dna => Murmur64Dna,
HashFunctions::Murmur64Protein => Murmur64Protein,
HashFunctions::Murmur64Dayhoff => Murmur64Dayhoff,
HashFunctions::Murmur64Hp => Murmur64Hp,
HashFunctions::Murmur64Skipm1n3 => Murmur64Skipm1n3,
HashFunctions::Murmur64Skipm2n3 => Murmur64Skipm2n3,
}
}
}

impl From<crate::encodings::HashFunctions> for HashFunctions {
fn from(v: crate::encodings::HashFunctions) -> HashFunctions {
use crate::encodings::HashFunctions::{
Murmur64Dayhoff, Murmur64Dna, Murmur64Hp, Murmur64Protein,
Murmur64Dayhoff, Murmur64Dna, Murmur64Hp, Murmur64Protein, Murmur64Skipm1n3,
Murmur64Skipm2n3,
};
match v {
Murmur64Dna => HashFunctions::Murmur64Dna,
Murmur64Protein => HashFunctions::Murmur64Protein,
Murmur64Dayhoff => HashFunctions::Murmur64Dayhoff,
Murmur64Hp => HashFunctions::Murmur64Hp,
Murmur64Skipm1n3 => HashFunctions::Murmur64Skipm1n3,
Murmur64Skipm2n3 => HashFunctions::Murmur64Skipm2n3,
_ => todo!("Not supported, probably custom"),
}
}
Expand Down
8 changes: 8 additions & 0 deletions src/core/src/sketch/minhash.rs
Original file line number Diff line number Diff line change
Expand Up @@ -723,6 +723,14 @@ impl KmerMinHash {
self.hash_function == HashFunctions::Murmur64Hp
}

pub fn skipm1n3(&self) -> bool {
self.hash_function == HashFunctions::Murmur64Skipm1n3
}

pub fn skipm2n3(&self) -> bool {
self.hash_function == HashFunctions::Murmur64Skipm2n3
}

pub fn mins(&self) -> Vec<u64> {
self.mins.clone()
}
Expand Down
33 changes: 33 additions & 0 deletions src/sourmash/cli/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,39 @@ def add_moltype_args(parser):
)
parser.set_defaults(hp=False)

parser.add_argument(
"--skipm1n3",
"--skipmer-m1n3",
dest="skipm1n3",
action="store_true",
help="choose skipmer (m1n3) signatures",
)

parser.add_argument(
"--no-skipm1n3",
"--no-skipmer-m1n3",
dest="skipm1n3",
action="store_false",
help="do not choose skipmer (m1n3) signatures",
)
parser.set_defaults(skipm1n3=False)

parser.add_argument(
"--skipm2n3",
"--skipmer-m2n3",
dest="skipm2n3",
action="store_true",
help="choose skipmer (m2n3) signatures",
)
parser.add_argument(
"--no-skipm2n3",
"--no-skipmer-m2n3",
dest="skipm2n3",
action="store_false",
help="do not choose skipmer (m2n3) signatures",
)
parser.set_defaults(skipm2n3=False)

parser.add_argument(
"--dna",
"--rna",
Expand Down
2 changes: 1 addition & 1 deletion src/sourmash/index/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -1242,7 +1242,7 @@ def _check_select_parameters(**kw):

moltype = kw.get("moltype")
if moltype is not None:
if moltype not in ["DNA", "protein", "dayhoff", "hp"]:
if moltype not in ["DNA", "protein", "dayhoff", "hp", "skipm1n3", "skipm2n3"]:
raise ValueError(f"unknown moltype: {moltype}")

scaled = kw.get("scaled")
Expand Down
54 changes: 50 additions & 4 deletions src/sourmash/minhash.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,8 @@ def __init__(
is_protein=False,
dayhoff=False,
hp=False,
skipm1n3=False,
skipm2n3=False,
track_abundance=False,
seed=MINHASH_DEFAULT_SEED,
max_hash=0,
Expand All @@ -215,6 +217,8 @@ def __init__(
* is_protein (default False) - aa k-mers
* dayhoff (default False) - dayhoff encoding
* hp (default False) - hydrophilic/hydrophobic aa
* skipm1n3 (default False) - skipmer (m1n3)
* skipm2n3 (default False) - skipmer (m2n3)
* track_abundance (default False) - track hash multiplicity
* mins (default None) - list of hashvals, or (hashval, abund) pairs
* seed (default 42) - murmurhash seed
Expand Down Expand Up @@ -243,6 +247,10 @@ def __init__(
elif is_protein:
hash_function = lib.HASH_FUNCTIONS_MURMUR64_PROTEIN
ksize = ksize * 3
elif skipm1n3:
hash_function = lib.HASH_FUNCTIONS_MURMUR64_SKIPM1N3
elif skipm2n3:
hash_function = lib.HASH_FUNCTIONS_MURMUR64_SKIPM2N3
else:
hash_function = lib.HASH_FUNCTIONS_MURMUR64_DNA

Expand All @@ -264,6 +272,8 @@ def __copy__(self):
is_protein=self.is_protein,
dayhoff=self.dayhoff,
hp=self.hp,
skipm1n3=self.skipm1n3,
skipm2n3=self.skipm2n3,
track_abundance=self.track_abundance,
seed=self.seed,
max_hash=self._max_hash,
Expand All @@ -279,12 +289,20 @@ def __getstate__(self):
# note: we multiple ksize by 3 here so that
# pickle protocols that bypass __setstate__ <coff numpy coff>
# get a ksize that makes sense to the Rust layer. See #2262.
# CTB/NTP note: if you add things below, you might want to put
# them at the end, because we use internal indexes in a few places.
# see especially `_set_num_scaled()` in sig/__main__.my.
# My apologies.
return (
self.num,
self.ksize if self.is_dna else self.ksize * 3,
self.ksize
if self.is_dna or self.skipm1n3 or self.skipm2n3
else self.ksize * 3,
self.is_protein,
self.dayhoff,
self.hp,
self.skipm1n3,
self.skipm2n3,
self.hashes,
None,
self.track_abundance,
Expand All @@ -300,6 +318,8 @@ def __setstate__(self, tup):
is_protein,
dayhoff,
hp,
skipm1n3,
skipm2n3,
mins,
_,
track_abundance,
Expand All @@ -316,6 +336,10 @@ def __setstate__(self, tup):
if hp
else lib.HASH_FUNCTIONS_MURMUR64_PROTEIN
if is_protein
else lib.HASH_FUNCTIONS_MURMUR64_SKIPM1N3
if skipm1n3
else lib.HASH_FUNCTIONS_MURMUR64_SKIPM2N3
if skipm2n3
else lib.HASH_FUNCTIONS_MURMUR64_DNA
)

Expand All @@ -340,6 +364,8 @@ def copy_and_clear(self):
is_protein=self.is_protein,
dayhoff=self.dayhoff,
hp=self.hp,
skipm1n3=self.skipm1n3,
skipm2n3=self.skipm2n3,
track_abundance=self.track_abundance,
seed=self.seed,
max_hash=self._max_hash,
Expand Down Expand Up @@ -461,7 +487,7 @@ def kmers_and_hashes(self, sequence, *, force=False, is_protein=False):

def add_kmer(self, kmer):
"Add a kmer into the sketch."
if self.is_dna:
if self.is_dna or self.skipm1n3 or self.skipm2n3:
if len(kmer) != self.ksize:
raise ValueError(f"kmer to add is not {self.ksize} in length")
else:
Expand Down Expand Up @@ -561,7 +587,9 @@ def scaled(self):

@property
def is_dna(self):
return not (self.is_protein or self.dayhoff or self.hp)
return not (
self.is_protein or self.dayhoff or self.hp or self.skipm1n3 or self.skipm2n3
)

@property
def is_protein(self):
Expand All @@ -575,10 +603,18 @@ def dayhoff(self):
def hp(self):
return self._methodcall(lib.kmerminhash_hp)

@property
def skipm1n3(self):
return self._methodcall(lib.kmerminhash_skipm1n3)

@property
def skipm2n3(self):
return self._methodcall(lib.kmerminhash_skipm2n3)

@property
def ksize(self):
k = self._methodcall(lib.kmerminhash_ksize)
if not self.is_dna:
if not self.is_dna and not self.skipm1n3 and not self.skipm2n3:
assert k % 3 == 0
k = int(k / 3)
return k
Expand Down Expand Up @@ -1051,6 +1087,10 @@ def moltype(self): # TODO: test in minhash tests
return "dayhoff"
elif self.hp:
return "hp"
elif self.skipm1n3:
return "skipm1n3"
elif self.skipm2n3:
return "skipm2n3"
else:
return "DNA"

Expand Down Expand Up @@ -1224,6 +1264,8 @@ def __setstate__(self, tup):
is_protein,
dayhoff,
hp,
skipm1n3,
skipm2n3,
mins,
_,
track_abundance,
Expand All @@ -1240,6 +1282,10 @@ def __setstate__(self, tup):
if hp
else lib.HASH_FUNCTIONS_MURMUR64_PROTEIN
if is_protein
else lib.HASH_FUNCTIONS_MURMUR64_SKIPM1N3
if skipm1n3
else lib.HASH_FUNCTIONS_MURMUR64_SKIPM2N3
if skipm2n3
else lib.HASH_FUNCTIONS_MURMUR64_DNA
)

Expand Down
2 changes: 1 addition & 1 deletion src/sourmash/sig/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ def _set_num_scaled(mh, num, scaled):
# Number of hashes is 0th parameter
mh_params[0] = num
# Scale is 8th parameter
mh_params[8] = _get_max_hash_for_scaled(scaled)
mh_params[10] = _get_max_hash_for_scaled(scaled)
mh.__setstate__(mh_params)
assert mh.num == num
assert mh.scaled == scaled
Expand Down
12 changes: 10 additions & 2 deletions src/sourmash/sourmash_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ def check_num_bounds(arg):

def get_moltype(sig, require=False):
mh = sig.minhash
if mh.moltype in ("DNA", "dayhoff", "hp", "protein"):
if mh.moltype in ("DNA", "dayhoff", "hp", "protein", "skipm1n3", "skipm2n3"):
moltype = mh.moltype
else:
raise ValueError(f"unknown molecule type for sig {sig}")
Expand All @@ -109,9 +109,15 @@ def calculate_moltype(args, default=None):
moltype = "protein"
n += 1

if args.skipm1n3:
moltype = "skipm1n3"

if args.skipm2n3:
moltype = "skipm2n3"

if n > 1:
error(
"cannot specify more than one of --dna/--rna/--nucleotide/--protein/--hp/--dayhoff"
"cannot specify more than one of --dna/--rna/--nucleotide/--protein/--hp/--dayhoff/--skipm1n3/--skipm2n3"
)
sys.exit(-1)

Expand Down Expand Up @@ -185,11 +191,13 @@ def load_include_exclude_db_patterns(args):

def search_pattern(vals):
return any(pattern.search(val) for val in vals)

elif args.exclude_db_pattern:
pattern = re.compile(args.exclude_db_pattern, re.IGNORECASE)

def search_pattern(vals):
return all(not pattern.search(val) for val in vals)

else:
search_pattern = None

Expand Down

0 comments on commit 150967b

Please sign in to comment.