oxli-bio · ctb · Sep 26, 2024 · Sep 18, 2024 · Sep 18, 2024 · Sep 23, 2024
diff --git a/src/lib.rs b/src/lib.rs
@@ -15,6 +15,7 @@ use pyo3::prelude::*;
 use pyo3::PyResult;
 use rayon::prelude::*;
 use serde::{Deserialize, Serialize};
+use sourmash::encodings::revcomp;
 use sourmash::encodings::HashFunctions;
 use sourmash::signature::SeqToHashes;
 
@@ -52,7 +53,7 @@ impl KmerCountTable {
     // TODO: Add function to get canonical kmer using hash key
 
     /// Turn a k-mer into a hashval.
-    fn hash_kmer(&self, kmer: String) -> Result<u64> {
+    pub fn hash_kmer(&self, kmer: String) -> Result<u64> {
         if kmer.len() as u8 != self.ksize {
             Err(anyhow!("wrong ksize"))
         } else {
@@ -503,6 +504,64 @@ impl KmerCountTable {
         Ok(())
     }
 
+    /// Return the canonical k-mer and hash value for each k-mer window of `seq`.
+    ///
+    /// `seq` is uppercased first. A window the hasher reports with hash 0 (a bad
+    /// k-mer tolerated via `skip_bad_kmers`) yields `("", 0)`; a sequence shorter
+    /// than `ksize` has no windows and yields an empty list.
+    ///
+    /// # Errors
+    /// Returns a `ValueError` naming the position and k-mer if the hasher rejects a window.
+    pub fn kmers_and_hashes(
+        &self,
+        seq: String,
+        skip_bad_kmers: bool,
+    ) -> PyResult<Vec<(String, u64)>> {
+        // TODO: optimize RC calculation; confirm that there are no more hashes left?
+        let seq = seq.to_ascii_uppercase();
+        let seqb = seq.as_bytes();
+        let ksize = usize::from(self.ksize);
+
+        // `seq.len() - ksize + 1` would underflow for sequences shorter than ksize.
+        if seqb.len() < ksize {
+            return Ok(Vec::new());
+        }
+        let end = seqb.len() - ksize + 1;
+
+        let mut hasher = SeqToHashes::new(
+            seqb,
+            ksize,
+            skip_bad_kmers,
+            false,
+            HashFunctions::Murmur64Dna,
+            42,
+        );
+
+        let mut v: Vec<(String, u64)> = Vec::with_capacity(end);
+        for start in 0..end {
+            // Slice bytes, not the `String`: a `str` slice panics inside a multi-byte char.
+            let window = &seqb[start..start + ksize];
+            match hasher.next().expect("should not run out of hashes") {
+                // Good k-mer: store the lexicographically smaller strand.
+                Ok(hashval) if hashval > 0 => {
+                    let rc = revcomp(window);
+                    let canonical = std::cmp::min(window, rc.as_slice());
+                    v.push((String::from_utf8_lossy(canonical).into_owned(), hashval));
+                }
+                // Bad k-mer allowed by skip_bad_kmers, signaled by hashval == 0.
+                Ok(_) => v.push((String::new(), 0)),
+                // Bad k-mer not allowed: raise an error naming it.
+                Err(_) => {
+                    let kmer = String::from_utf8_lossy(window);
+                    let msg = format!("bad k-mer at position {}: {}", start, kmer);
+                    return Err(PyValueError::new_err(msg));
+                }
+            }
+        }
+
+        Ok(v)
+    }
+
     /// Calculates the Jaccard Similarity Coefficient between two KmerCountTable objects.
     /// # Returns
     /// The Jaccard Similarity Coefficient between the two tables as a float value between 0 and 1.

diff --git a/src/python/tests/test_kmers_and_hashes.py b/src/python/tests/test_kmers_and_hashes.py
@@ -0,0 +1,107 @@
+import pytest
+
+import oxli
+
+
+# Helper function, create tables.
+def create_sample_kmer_table(ksize, kmers):
+    table = oxli.KmerCountTable(ksize)
+    for kmer in kmers:
+        table.count(kmer)
+    return table
+
+
+def test_basic():
+    "string containing only forward canonical kmers."
+    seq = "ATAAACC"  # all forward k-mers
+    cg = oxli.KmerCountTable(ksize=4)
+
+    x = cg.kmers_and_hashes(seq, False)
+    assert x == [
+        ("ATAA", 179996601836427478),
+        ("TAAA", 15286642655859448092),
+        ("AAAC", 9097280691811734508),
+        ("AACC", 6779379503393060785),
+    ]
+
+
+def test_basic_rc():
+    "string containing only reverse canonical kmers."
+    seq = "GGTTTAT"
+    cg = oxli.KmerCountTable(ksize=4)
+
+    x = cg.kmers_and_hashes(seq, False)
+    print(x)
+    assert x == [
+        ("AACC", 6779379503393060785),
+        ("AAAC", 9097280691811734508),
+        ("TAAA", 15286642655859448092),
+        ("ATAA", 179996601836427478),
+    ]
+
+
+def test_basic_mixed():
+    "string containing forward and reverse canonical kmers."
+    seq = "ACGTTG"
+    cg = oxli.KmerCountTable(ksize=4)
+
+    x = cg.kmers_and_hashes(seq, False)
+    print(x)
+    assert x == [
+        ("ACGT", 2597925387403686983),
+        ("AACG", 7952982457453691616),
+        ("CAAC", 7315150081962684964),
+    ]
+
+    for kmer, hashval_rs in x:
+        print(kmer, hashval_rs, cg.hash_kmer(kmer))
+        assert cg.hash_kmer(kmer) == hashval_rs
+
+
+def test_basic_lower():
+    "Test that sequences are turned into uppercase appropriately."
+    seq = "acgttg"
+    cg = oxli.KmerCountTable(ksize=4)
+
+    x = cg.kmers_and_hashes(seq, False)
+    print(x)
+    assert x == [
+        ("ACGT", 2597925387403686983),
+        ("AACG", 7952982457453691616),
+        ("CAAC", 7315150081962684964),
+    ]
+
+
+def test_bad_kmers_raise_error():
+    "Test that bad k-mers raise a ValueError with info"
+    seq = "acxttg"
+    cg = oxli.KmerCountTable(ksize=4)
+
+    with pytest.raises(ValueError, match="bad k-mer at position 0: ACXT"):
+        x = cg.kmers_and_hashes(seq, False)
+
+
+def test_bad_kmers_raise_error_2():
+    "Test bad k-mers raise the right error even when not at beginning :)"
+    seq = "aattxttgg"
+    cg = oxli.KmerCountTable(ksize=4)
+
+    with pytest.raises(ValueError, match="bad k-mer at position 1: ATTX"):
+        x = cg.kmers_and_hashes(seq, False)
+
+
+def test_bad_kmers_allowed():
+    "Test that bad k-mers are allowed when skip_bad_kmers is True"
+    seq = "aattxttgg"
+    cg = oxli.KmerCountTable(ksize=4)
+
+    x = cg.kmers_and_hashes(seq, True)
+    print(x)
+    assert x == [
+        ("AATT", 382727017318141683),
+        ("", 0),
+        ("", 0),
+        ("", 0),
+        ("", 0),
+        ("CCAA", 1798905482136869687),
+    ]