Skip to content

Commit

Permalink
WIP: skipmer improvements (#3415)
Browse files Browse the repository at this point in the history
Make skipmers robust, but keep #3395 functional in the meantime.

This PR:
- enables a second skipmer type, so we have m1n3 in addition to m2n3
- switches to a reading-frame approach for both translation and skipmers,
which means we first build the reading frame and then k-merize it, rather
than building k-mers and translating/skipping on the fly
- avoids the "extended length" that was needed for skipping on the fly

Because this changes the `SeqToHashes` strategy slightly, there is one Python
test where we now see a different error.

Future thoughts:
- With the new structure, it would be straightforward to add validation
to exclude protein k-mers with invalid amino acids (`X`). I'm not
entirely sure what happens to those at the moment.
  • Loading branch information
bluegenes authored Dec 12, 2024
1 parent d7f59cf commit 96aea47
Show file tree
Hide file tree
Showing 5 changed files with 707 additions and 272 deletions.
27 changes: 24 additions & 3 deletions src/core/src/cmd.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,11 @@ pub struct ComputeParameters {

#[getset(get_copy = "pub", set = "pub")]
#[builder(default = false)]
skipmer: bool,
skipm1n3: bool,

#[getset(get_copy = "pub", set = "pub")]
#[builder(default = false)]
skipm2n3: bool,

#[getset(get_copy = "pub", set = "pub")]
#[builder(default = false)]
Expand Down Expand Up @@ -169,12 +173,29 @@ pub fn build_template(params: &ComputeParameters) -> Vec<Sketch> {
));
}

if params.skipmer {
if params.skipm1n3 {
ksigs.push(Sketch::LargeMinHash(
KmerMinHashBTree::builder()
.num(params.num_hashes)
.ksize(*k)
.hash_function(HashFunctions::Murmur64Skipm1n3)
.max_hash(max_hash)
.seed(params.seed)
.abunds(if params.track_abundance {
Some(Default::default())
} else {
None
})
.build(),
));
}

if params.skipm2n3 {
ksigs.push(Sketch::LargeMinHash(
KmerMinHashBTree::builder()
.num(params.num_hashes)
.ksize(*k)
.hash_function(HashFunctions::Murmur64Skipmer)
.hash_function(HashFunctions::Murmur64Skipm2n3)
.max_hash(max_hash)
.seed(params.seed)
.abunds(if params.track_abundance {
Expand Down
18 changes: 13 additions & 5 deletions src/core/src/encodings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@ pub enum HashFunctions {
Murmur64Protein,
Murmur64Dayhoff,
Murmur64Hp,
Murmur64Skipmer,
Murmur64Skipm1n3,
Murmur64Skipm2n3,
Custom(String),
}

Expand All @@ -51,8 +52,13 @@ impl HashFunctions {
pub fn hp(&self) -> bool {
*self == HashFunctions::Murmur64Hp
}
pub fn skipmer(&self) -> bool {
*self == HashFunctions::Murmur64Skipmer

/// Returns `true` when this hash function is the
/// [`HashFunctions::Murmur64Skipm1n3`] variant.
pub fn skipm1n3(&self) -> bool {
self == &HashFunctions::Murmur64Skipm1n3
}

/// Returns `true` when this hash function is the
/// [`HashFunctions::Murmur64Skipm2n3`] variant.
pub fn skipm2n3(&self) -> bool {
self == &HashFunctions::Murmur64Skipm2n3
}
}

Expand All @@ -66,7 +72,8 @@ impl std::fmt::Display for HashFunctions {
HashFunctions::Murmur64Protein => "protein",
HashFunctions::Murmur64Dayhoff => "dayhoff",
HashFunctions::Murmur64Hp => "hp",
HashFunctions::Murmur64Skipmer => "skipmer",
HashFunctions::Murmur64Skipm1n3 => "skipm1n3",
HashFunctions::Murmur64Skipm2n3 => "skipm2n3",
HashFunctions::Custom(v) => v,
}
)
Expand All @@ -82,7 +89,8 @@ impl TryFrom<&str> for HashFunctions {
"dayhoff" => Ok(HashFunctions::Murmur64Dayhoff),
"hp" => Ok(HashFunctions::Murmur64Hp),
"protein" => Ok(HashFunctions::Murmur64Protein),
"skipmer" => Ok(HashFunctions::Murmur64Skipmer),
"skipm1n3" => Ok(HashFunctions::Murmur64Skipm1n3),
"skipm2n3" => Ok(HashFunctions::Murmur64Skipm2n3),
v => unimplemented!("{v}"),
}
}
Expand Down
Loading

0 comments on commit 96aea47

Please sign in to comment.