-
Notifications
You must be signed in to change notification settings - Fork 0
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
MRG: add kmers_and_hashes
method to get canonical kmers + hashes
#40
Changes from all commits
e779695
8770d9e
39af0c8
0f2c535
3e3125d
55d278b
bf83c04
49bb3b6
dabd3a5
a8a07be
95a5965
0833c1f
12ab55b
2472659
ec51e8b
91a1260
529a3a5
6ad6e99
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,6 +15,7 @@ use pyo3::prelude::*; | |
use pyo3::PyResult; | ||
use rayon::prelude::*; | ||
use serde::{Deserialize, Serialize}; | ||
use sourmash::encodings::revcomp; | ||
use sourmash::encodings::HashFunctions; | ||
use sourmash::signature::SeqToHashes; | ||
|
||
|
@@ -52,7 +53,7 @@ impl KmerCountTable { | |
// TODO: Add function to get canonical kmer using hash key | ||
|
||
/// Turn a k-mer into a hashval. | ||
fn hash_kmer(&self, kmer: String) -> Result<u64> { | ||
pub fn hash_kmer(&self, kmer: String) -> Result<u64> { | ||
if kmer.len() as u8 != self.ksize { | ||
Err(anyhow!("wrong ksize")) | ||
} else { | ||
|
@@ -503,6 +504,64 @@ impl KmerCountTable { | |
Ok(()) | ||
} | ||
|
||
pub fn kmers_and_hashes( | ||
&self, | ||
seq: String, | ||
skip_bad_kmers: bool, | ||
) -> PyResult<Vec<(String, u64)>> { | ||
// TODO: optimize RC calculation | ||
// TODO: confirm that there are no more hashes left? unreachable? | ||
let seq = seq.to_ascii_uppercase(); | ||
let seqb = seq.as_bytes(); | ||
|
||
let mut hasher = SeqToHashes::new( | ||
seqb, | ||
self.ksize.into(), | ||
skip_bad_kmers, | ||
false, | ||
HashFunctions::Murmur64Dna, | ||
42, | ||
); | ||
|
||
let ksize = self.ksize as usize; | ||
let end: usize = seq.len() - ksize + 1; | ||
|
||
let mut v: Vec<(String, u64)> = vec![]; | ||
for start in 0..end { | ||
let substr = &seq[start..start + ksize]; | ||
ctb marked this conversation as resolved.
Show resolved
Hide resolved
|
||
// CTB: this calculates RC each time, instead of doing so | ||
// using a sliding window. It's easy and works, so I'm | ||
// starting here :). | ||
let substr_b_rc = revcomp(&seqb[start..start + ksize]); | ||
let substr_rc = | ||
std::str::from_utf8(&substr_b_rc).expect("invalid utf-8 sequence for rev comp"); | ||
let hashval = hasher.next().expect("should not run out of hashes"); | ||
|
||
// Three options: | ||
// * good kmer, all is well, store canonical k-mer and hashval; | ||
// * bad k-mer allowed by skip_bad_kmers, and signaled by | ||
// hashval == 0): return empty string & 0; | ||
// * bad k-mer not allowed, raise error | ||
if let Ok(hashval) = hashval { | ||
if hashval > 0 { | ||
let canonical_kmer = if substr < substr_rc { | ||
substr | ||
} else { | ||
substr_rc | ||
}; | ||
v.push((canonical_kmer.to_string(), hashval)); | ||
Comment on lines
+535
to
+552
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does sourmash have a canonicalize function we can recycle? Else pop out as own function to make this cleaner? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I didn't see anything in |
||
} else { | ||
v.push(("".to_owned(), 0)); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It works, but I am uneasy about hoping that hasher output and seq slices stay in sync. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Agreed, but also, that's why we have tests :). With the way Rust behaves, and with reading the code, it's hard for me to find a situation where this misbehaves. Medium term => definitely want to expose a better API. If we get the nicer code working here (iterator, in particular) it is easier to transplant to sourmash. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. (and to be clear, this is something I've wanted for sourmash for a while - right now we extract the hashes and kmers in Python, which is unbearably slow! So definitely want this in the sourmash library. Just need it to be pretty general, e.g. including proteins; and that codebase is also more complex to modify and requires approval on the PR, which takes time.) |
||
} | ||
} else { | ||
let msg = format!("bad k-mer at position {}: {}", start, substr); | ||
return Err(PyValueError::new_err(msg)); | ||
} | ||
} | ||
|
||
Ok(v) | ||
ctb marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
|
||
/// Calculates the Jaccard Similarity Coefficient between two KmerCountTable objects. | ||
/// # Returns | ||
/// The Jaccard Similarity Coefficient between the two tables as a float value between 0 and 1. | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,107 @@ | ||
import pytest | ||
|
||
import oxli | ||
|
||
|
||
# Helper function, create tables. | ||
def create_sample_kmer_table(ksize, kmers): | ||
table = oxli.KmerCountTable(ksize) | ||
for kmer in kmers: | ||
table.count(kmer) | ||
return table | ||
|
||
|
||
def test_basic(): | ||
"string containing only forward canonical kmers." | ||
seq = "ATAAACC" # all forward k-mers | ||
cg = oxli.KmerCountTable(ksize=4) | ||
|
||
x = cg.kmers_and_hashes(seq, False) | ||
assert x == [ | ||
("ATAA", 179996601836427478), | ||
("TAAA", 15286642655859448092), | ||
("AAAC", 9097280691811734508), | ||
("AACC", 6779379503393060785), | ||
] | ||
|
||
|
||
def test_basic_rc(): | ||
"string containing only reverse canonical kmers." | ||
seq = "GGTTTAT" | ||
cg = oxli.KmerCountTable(ksize=4) | ||
|
||
x = cg.kmers_and_hashes(seq, False) | ||
print(x) | ||
assert x == [ | ||
("AACC", 6779379503393060785), | ||
("AAAC", 9097280691811734508), | ||
("TAAA", 15286642655859448092), | ||
("ATAA", 179996601836427478), | ||
] | ||
|
||
|
||
def test_basic_mixed(): | ||
"string containing forward and reverse canonical kmers." | ||
seq = "ACGTTG" | ||
cg = oxli.KmerCountTable(ksize=4) | ||
|
||
x = cg.kmers_and_hashes(seq, False) | ||
print(x) | ||
assert x == [ | ||
("ACGT", 2597925387403686983), | ||
("AACG", 7952982457453691616), | ||
("CAAC", 7315150081962684964), | ||
] | ||
|
||
for kmer, hashval_rs in x: | ||
print(kmer, hashval_rs, cg.hash_kmer(kmer)) | ||
assert cg.hash_kmer(kmer) == hashval_rs | ||
|
||
|
||
def test_basic_lower(): | ||
"Test that sequences are turned into uppercase appropriately." | ||
seq = "acgttg" | ||
cg = oxli.KmerCountTable(ksize=4) | ||
|
||
x = cg.kmers_and_hashes(seq, False) | ||
print(x) | ||
assert x == [ | ||
("ACGT", 2597925387403686983), | ||
("AACG", 7952982457453691616), | ||
("CAAC", 7315150081962684964), | ||
] | ||
|
||
|
||
def test_bad_kmers_raise_error(): | ||
"Test that bad k-mers raise a ValueError with info" | ||
seq = "acxttg" | ||
cg = oxli.KmerCountTable(ksize=4) | ||
|
||
with pytest.raises(ValueError, match="bad k-mer at position 0: ACXT"): | ||
x = cg.kmers_and_hashes(seq, False) | ||
|
||
|
||
def test_bad_kmers_raise_error_2(): | ||
"Test bad k-mers raise the right error even when not at beginning :)" | ||
seq = "aattxttgg" | ||
cg = oxli.KmerCountTable(ksize=4) | ||
|
||
with pytest.raises(ValueError, match="bad k-mer at position 1: ATTX"): | ||
x = cg.kmers_and_hashes(seq, False) | ||
|
||
|
||
def test_bad_kmers_allowed(): | ||
"Test that bad k-mers are allowed when skip_bad_kmers is True" | ||
seq = "aattxttgg" | ||
cg = oxli.KmerCountTable(ksize=4) | ||
|
||
x = cg.kmers_and_hashes(seq, True) | ||
print(x) | ||
assert x == [ | ||
("AATT", 382727017318141683), | ||
("", 0), | ||
("", 0), | ||
("", 0), | ||
("", 0), | ||
("CCAA", 1798905482136869687), | ||
] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is the full contig/chromosome in memory twice at this point?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
no, it's a view (via the reference,
&
, see as_bytes).If you calculate it, the memory consumption of the underlying string is minimal compared to catastrophic expansion of it into k-mers, even for human-size chromosomes.