From e7796950581f5c344dbd300fb3199a60fae4c922 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Wed, 18 Sep 2024 08:10:36 -0700 Subject: [PATCH 01/11] fn for kmers_and_hashes --- src/lib.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/lib.rs b/src/lib.rs index 7dee715..2e1fa6f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -387,6 +387,10 @@ impl KmerCountTable { self.counts.insert(hashval, count); Ok(()) } + + pub fn kmers_and_hashes(&self, seq: String, allow_bad_kmers: bool) -> PyResult> { + Ok(vec![]) + } } // Iterator implementation for KmerCountTable From 8770d9ead63585fe15b68ffff2f6e8052a92ceb1 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Wed, 18 Sep 2024 08:38:59 -0700 Subject: [PATCH 02/11] finish basic implementation --- src/lib.rs | 48 +++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 45 insertions(+), 3 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 2e1fa6f..2a90666 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -7,7 +7,7 @@ use anyhow::{anyhow, Result}; use log::debug; use pyo3::exceptions::PyValueError; use pyo3::prelude::*; -use sourmash::encodings::HashFunctions; +use sourmash::encodings::{revcomp, HashFunctions}; use sourmash::signature::SeqToHashes; // Set version variable @@ -388,8 +388,50 @@ impl KmerCountTable { Ok(()) } - pub fn kmers_and_hashes(&self, seq: String, allow_bad_kmers: bool) -> PyResult> { - Ok(vec![]) + pub fn kmers_and_hashes( + &self, + seq: String, + allow_bad_kmers: bool, + ) -> PyResult> { + let seqb = seq.as_bytes(); + let mut hasher = SeqToHashes::new( + seqb, + self.ksize.into(), + allow_bad_kmers, + false, + HashFunctions::Murmur64Dna, + 42, + ); + + let ksize = self.ksize as usize; + let end: usize = seq.len() - ksize + 1; + + let mut v: Vec<(String, u64)> = vec![]; + for start in 0..end { + let substr = &seq[start..start + ksize]; + let hashval = hasher.next().unwrap().unwrap(); + v.push((substr.to_string(), hashval)); + } + /* + let seqb_rc = revcomp(&seqb); + let seq_rc = String::from_utf8(seqb_rc.clone()).unwrap(); + + let mut hasher = SeqToHashes::new( + &seqb_rc, + self.ksize.into(), + allow_bad_kmers, + false, + HashFunctions::Murmur64Dna, + 42, + ); + for start in 0..end { + let substr = &seq_rc[start..start + ksize]; + let hashval = hasher.next().unwrap().unwrap(); + v.push((substr.to_string(), hashval)); + } + */ + + Ok(v) } } From 0f2c535a447ff5710fc55713d3b6852ca49911ea Mon Sep 17 00:00:00 2001 From: Adam Taranto Date: Mon, 23 Sep 2024 19:52:13 +1000 Subject: [PATCH 03/11] cargo fmt --- src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index 879273e..fbacf9a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -15,7 +15,7 @@ use pyo3::prelude::*; use pyo3::PyResult; use rayon::prelude::*; use serde::{Deserialize, Serialize}; -use sourmash::encodings::{HashFunctions}; +use sourmash::encodings::HashFunctions; //use sourmash::encodings::revcomp; use sourmash::signature::SeqToHashes; From bf83c042d1e0c24f8d79ae814d3a6f9413958422 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Tue, 24 Sep 2024 12:11:43 -0700 Subject: [PATCH 04/11] basic tests --- src/lib.rs | 41 ++++++------- src/python/tests/test_kmers_and_hashes.py | 74 +++++++++++++++++++++++ 2 files changed, 94 insertions(+), 21 deletions(-) create mode 100644 src/python/tests/test_kmers_and_hashes.py diff --git a/src/lib.rs b/src/lib.rs index fbacf9a..1ac708a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -15,8 +15,8 @@ use pyo3::prelude::*; use pyo3::PyResult; use rayon::prelude::*; use serde::{Deserialize, Serialize}; +use sourmash::encodings::revcomp; use sourmash::encodings::HashFunctions; -//use sourmash::encodings::revcomp; use sourmash::signature::SeqToHashes; // Set version variable @@ -506,7 +506,11 @@ impl KmerCountTable { seq: String, allow_bad_kmers: bool, ) -> PyResult> { + // TODO: optimize RC calculation + // TODO: write lots of tests! + let seq = seq.to_ascii_uppercase(); let seqb = seq.as_bytes(); + let mut hasher = SeqToHashes::new( seqb, self.ksize.into(), @@ -522,27 +526,22 @@ impl KmerCountTable { let mut v: Vec<(String, u64)> = vec![]; for start in 0..end { let substr = &seq[start..start + ksize]; - let hashval = hasher.next().unwrap().unwrap(); - v.push((substr.to_string(), hashval)); + let substr_b_rc = revcomp(&seqb[start..start + ksize]); + let substr_rc = + std::str::from_utf8(&substr_b_rc).expect("invalid utf-8 sequence for rev comp"); + let hashval = hasher + .next() + .expect("should not run out of hashes"); + + let canonical_kmer = if substr < substr_rc { + substr + } else { + eprintln!("choosing revcomp!!"); + substr_rc + }; + + v.push((canonical_kmer.to_string(), hashval)); } - /* - let seqb_rc = revcomp(&seqb); - let seq_rc = String::from_utf8(seqb_rc.clone()).unwrap(); - - let mut hasher = SeqToHashes::new( - &seqb_rc, - self.ksize.into(), - allow_bad_kmers, - false, - HashFunctions::Murmur64Dna, - 42, - ); - for start in 0..end { - let substr = &seq_rc[start..start + ksize]; - let hashval = hasher.next().unwrap().unwrap(); - v.push((substr.to_string(), hashval)); - } - */ Ok(v) } diff --git a/src/python/tests/test_kmers_and_hashes.py b/src/python/tests/test_kmers_and_hashes.py new file mode 100644 index 0000000..8d2d881 --- /dev/null +++ b/src/python/tests/test_kmers_and_hashes.py @@ -0,0 +1,74 @@ +import pytest + +import oxli + + +# Helper function, create tables. +def create_sample_kmer_table(ksize, kmers): + table = oxli.KmerCountTable(ksize) + for kmer in kmers: + table.count(kmer) + return table + + +def test_basic(): + "string containing only forward canonical kmers." + seq = "ATAAACC" # all forward k-mers + cg = oxli.KmerCountTable(ksize=4) + + x = cg.kmers_and_hashes(seq, False) + assert x == [('ATAA', 179996601836427478), + ('TAAA', 15286642655859448092), + ('AAAC', 9097280691811734508), + ('AACC', 6779379503393060785)] + + +def test_basic_rc(): + "string containing only reverse canonical kmers." + seq = "GGTTTAT" + cg = oxli.KmerCountTable(ksize=4) + + x = cg.kmers_and_hashes(seq, False) + print(x) + assert x == [('AACC', 6779379503393060785), + ('AAAC', 9097280691811734508), + ('TAAA', 15286642655859448092), + ('ATAA', 179996601836427478)] + + +def test_basic_mixed(): + "string containing forward and reverse canonical kmers." + seq = "ACGTTG" + cg = oxli.KmerCountTable(ksize=4) + + x = cg.kmers_and_hashes(seq, False) + print(x) + assert x == [('ACGT', 2597925387403686983), + ('AACG', 7952982457453691616), + ('CAAC', 7315150081962684964)] + + +def test_basic_lower(): + "Test that sequences are turned into uppercase appropriately." + seq = "acgttg" + cg = oxli.KmerCountTable(ksize=4) + + x = cg.kmers_and_hashes(seq, False) + print(x) + assert x == [('ACGT', 2597925387403686983), + ('AACG', 7952982457453691616), + ('CAAC', 7315150081962684964)] + + +def test_bad_kmers(): + "Test that sequences are turned into uppercase appropriately." + seq = "acxttg" + cg = oxli.KmerCountTable(ksize=4) + + # CTB: would be nice to turn this into a better error message, + # ideally one containing the bad k-mer and/or location. + try: + x = cg.kmers_and_hashes(seq, False) + assert False, "this should fail" + except: + pass From 49bb3b63402251c497ab09bdaa1a1eb9545dda6d Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Tue, 24 Sep 2024 12:22:34 -0700 Subject: [PATCH 05/11] much improved code --- src/lib.rs | 24 +++++++++++++++-------- src/python/tests/test_kmers_and_hashes.py | 14 ++++++++++++- 2 files changed, 29 insertions(+), 9 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 1ac708a..74fa1d8 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -507,7 +507,7 @@ impl KmerCountTable { allow_bad_kmers: bool, ) -> PyResult> { // TODO: optimize RC calculation - // TODO: write lots of tests! + // TODO: confirm that there are no more hashes left? unreachable? let seq = seq.to_ascii_uppercase(); let seqb = seq.as_bytes(); @@ -533,14 +533,22 @@ impl KmerCountTable { .next() .expect("should not run out of hashes"); - let canonical_kmer = if substr < substr_rc { - substr + if let Ok(hashval) = hashval { + if hashval > 0 { + let canonical_kmer = if substr < substr_rc { + substr + } else { + eprintln!("choosing revcomp!!"); + substr_rc + }; + v.push((canonical_kmer.to_string(), hashval)); + } else { + v.push(("".to_owned(), 0)); + } } else { - eprintln!("choosing revcomp!!"); - substr_rc - }; - - v.push((canonical_kmer.to_string(), hashval)); + let msg = format!("bad k-mer at position: {}", start); + return Err(PyValueError::new_err(msg)); + } } Ok(v) diff --git a/src/python/tests/test_kmers_and_hashes.py b/src/python/tests/test_kmers_and_hashes.py index 8d2d881..530eb2c 100644 --- a/src/python/tests/test_kmers_and_hashes.py +++ b/src/python/tests/test_kmers_and_hashes.py @@ -60,7 +60,7 @@ def test_basic_lower(): ('CAAC', 7315150081962684964)] -def test_bad_kmers(): +def test_bad_kmers_raise_error(): "Test that sequences are turned into uppercase appropriately." seq = "acxttg" cg = oxli.KmerCountTable(ksize=4) @@ -72,3 +72,15 @@ def test_bad_kmers(): assert False, "this should fail" except: pass + + +def test_bad_kmers_allowed(): + "Test that sequences are turned into uppercase appropriately." + seq = "aattxttgg" + cg = oxli.KmerCountTable(ksize=4) + + x = cg.kmers_and_hashes(seq, True) + print(x) + assert x == [('AATT', 382727017318141683), + ('', 0), ('', 0), ('', 0), ('', 0), + ('CCAA', 1798905482136869687)] From 95a5965c7ac510c386da3e997694820188506bdb Mon Sep 17 00:00:00 2001 From: ctb Date: Wed, 25 Sep 2024 00:30:39 +0000 Subject: [PATCH 06/11] Style fixes by Ruff --- src/python/tests/test_kmers_and_hashes.py | 49 ++++++++++++++--------- 1 file changed, 31 insertions(+), 18 deletions(-) diff --git a/src/python/tests/test_kmers_and_hashes.py b/src/python/tests/test_kmers_and_hashes.py index 530eb2c..a5bc5b3 100644 --- a/src/python/tests/test_kmers_and_hashes.py +++ b/src/python/tests/test_kmers_and_hashes.py @@ -13,14 +13,16 @@ def create_sample_kmer_table(ksize, kmers): def test_basic(): "string containing only forward canonical kmers." - seq = "ATAAACC" # all forward k-mers + seq = "ATAAACC" # all forward k-mers cg = oxli.KmerCountTable(ksize=4) x = cg.kmers_and_hashes(seq, False) - assert x == [('ATAA', 179996601836427478), - ('TAAA', 15286642655859448092), - ('AAAC', 9097280691811734508), - ('AACC', 6779379503393060785)] + assert x == [ + ("ATAA", 179996601836427478), + ("TAAA", 15286642655859448092), + ("AAAC", 9097280691811734508), + ("AACC", 6779379503393060785), + ] def test_basic_rc(): @@ -30,10 +32,12 @@ def test_basic_rc(): x = cg.kmers_and_hashes(seq, False) print(x) - assert x == [('AACC', 6779379503393060785), - ('AAAC', 9097280691811734508), - ('TAAA', 15286642655859448092), - ('ATAA', 179996601836427478)] + assert x == [ + ("AACC", 6779379503393060785), + ("AAAC", 9097280691811734508), + ("TAAA", 15286642655859448092), + ("ATAA", 179996601836427478), + ] def test_basic_mixed(): @@ -43,9 +47,11 @@ def test_basic_mixed(): x = cg.kmers_and_hashes(seq, False) print(x) - assert x == [('ACGT', 2597925387403686983), - ('AACG', 7952982457453691616), - ('CAAC', 7315150081962684964)] + assert x == [ + ("ACGT", 2597925387403686983), + ("AACG", 7952982457453691616), + ("CAAC", 7315150081962684964), + ] def test_basic_lower(): @@ -55,9 +61,11 @@ def test_basic_lower(): x = cg.kmers_and_hashes(seq, False) print(x) - assert x == [('ACGT', 2597925387403686983), - ('AACG', 7952982457453691616), - ('CAAC', 7315150081962684964)] + assert x == [ + ("ACGT", 2597925387403686983), + ("AACG", 7952982457453691616), + ("CAAC", 7315150081962684964), + ] def test_bad_kmers_raise_error(): @@ -81,6 +89,11 @@ def test_bad_kmers_allowed(): x = cg.kmers_and_hashes(seq, True) print(x) - assert x == [('AATT', 382727017318141683), - ('', 0), ('', 0), ('', 0), ('', 0), - ('CCAA', 1798905482136869687)] + assert x == [ + ("AATT", 382727017318141683), + ("", 0), + ("", 0), + ("", 0), + ("", 0), + ("CCAA", 1798905482136869687), + ] From 12ab55b94889af4515548135f6e253472fd14e6f Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Wed, 25 Sep 2024 06:31:52 -0700 Subject: [PATCH 07/11] rename allow_bad_kmers to skip_bad_kmers --- src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 61d9534..866103b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -507,7 +507,7 @@ impl KmerCountTable { pub fn kmers_and_hashes( &self, seq: String, - allow_bad_kmers: bool, + skip_bad_kmers: bool, ) -> PyResult> { // TODO: optimize RC calculation // TODO: confirm that there are no more hashes left? unreachable? @@ -517,7 +517,7 @@ impl KmerCountTable { let mut hasher = SeqToHashes::new( seqb, self.ksize.into(), - allow_bad_kmers, + skip_bad_kmers, false, HashFunctions::Murmur64Dna, 42, From 2472659e3d86f63be708aae93cbad59dbe75e614 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Wed, 25 Sep 2024 06:41:23 -0700 Subject: [PATCH 08/11] expand & test error handling --- src/lib.rs | 12 ++++++++++-- src/python/tests/test_kmers_and_hashes.py | 20 ++++++++++++-------- 2 files changed, 22 insertions(+), 10 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 866103b..ba789a9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -529,6 +529,9 @@ impl KmerCountTable { let mut v: Vec<(String, u64)> = vec![]; for start in 0..end { let substr = &seq[start..start + ksize]; + // CTB: this calculates RC each time, instead of doing so + // using a sliding window. It's easy and works, so I'm + // starting here :). let substr_b_rc = revcomp(&seqb[start..start + ksize]); let substr_rc = std::str::from_utf8(&substr_b_rc).expect("invalid utf-8 sequence for rev comp"); @@ -536,12 +539,16 @@ impl KmerCountTable { .next() .expect("should not run out of hashes"); + // Three options: + // * good kmer, all is well, store canonical k-mer and hashval; + // * bad k-mer allowed by skip_bad_kmers, and signaled by + // hashval == 0): return empty string & 0; + // * bad k-mer not allowed, raise error if let Ok(hashval) = hashval { if hashval > 0 { let canonical_kmer = if substr < substr_rc { substr } else { - eprintln!("choosing revcomp!!"); substr_rc }; v.push((canonical_kmer.to_string(), hashval)); @@ -549,7 +556,8 @@ impl KmerCountTable { v.push(("".to_owned(), 0)); } } else { - let msg = format!("bad k-mer at position: {}", start); + let msg = format!("bad k-mer at position {}: {}", + start, substr); return Err(PyValueError::new_err(msg)); } } diff --git a/src/python/tests/test_kmers_and_hashes.py b/src/python/tests/test_kmers_and_hashes.py index a5bc5b3..20be523 100644 --- a/src/python/tests/test_kmers_and_hashes.py +++ b/src/python/tests/test_kmers_and_hashes.py @@ -69,21 +69,25 @@ def test_basic_lower(): def test_bad_kmers_raise_error(): - "Test that sequences are turned into uppercase appropriately." + "Test that bad k-mers raise a ValueError with info" seq = "acxttg" cg = oxli.KmerCountTable(ksize=4) - # CTB: would be nice to turn this into a better error message, - # ideally one containing the bad k-mer and/or location. - try: + with pytest.raises(ValueError, match='bad k-mer at position 0: ACXT'): + x = cg.kmers_and_hashes(seq, False) + + +def test_bad_kmers_raise_error_2(): + "Test bad k-mers raise the right error even when not at beginning :)" + seq = "aattxttgg" + cg = oxli.KmerCountTable(ksize=4) + + with pytest.raises(ValueError, match='bad k-mer at position 1: ATTX'): x = cg.kmers_and_hashes(seq, False) - assert False, "this should fail" - except: - pass def test_bad_kmers_allowed(): - "Test that sequences are turned into uppercase appropriately." + "Test that bad k-mers are allowed when skip_bad_kmers is True" seq = "aattxttgg" cg = oxli.KmerCountTable(ksize=4) From ec51e8bc6c9104d4f69e734784c5959128e7cf89 Mon Sep 17 00:00:00 2001 From: ctb Date: Wed, 25 Sep 2024 13:41:51 +0000 Subject: [PATCH 09/11] Style fixes by Ruff --- src/python/tests/test_kmers_and_hashes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/python/tests/test_kmers_and_hashes.py b/src/python/tests/test_kmers_and_hashes.py index 20be523..8010c9a 100644 --- a/src/python/tests/test_kmers_and_hashes.py +++ b/src/python/tests/test_kmers_and_hashes.py @@ -73,7 +73,7 @@ def test_bad_kmers_raise_error(): seq = "acxttg" cg = oxli.KmerCountTable(ksize=4) - with pytest.raises(ValueError, match='bad k-mer at position 0: ACXT'): + with pytest.raises(ValueError, match="bad k-mer at position 0: ACXT"): x = cg.kmers_and_hashes(seq, False) @@ -82,7 +82,7 @@ def test_bad_kmers_raise_error_2(): seq = "aattxttgg" cg = oxli.KmerCountTable(ksize=4) - with pytest.raises(ValueError, match='bad k-mer at position 1: ATTX'): + with pytest.raises(ValueError, match="bad k-mer at position 1: ATTX"): x = cg.kmers_and_hashes(seq, False) From 91a1260fc913b2bf0ebfa9f5de3953dfcfba3feb Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Wed, 25 Sep 2024 06:47:11 -0700 Subject: [PATCH 10/11] add more explicit test for kmers <=> hash vals in kmers_and_hashes --- src/lib.rs | 2 +- src/python/tests/test_kmers_and_hashes.py | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index ba789a9..4c8e4e2 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -53,7 +53,7 @@ impl KmerCountTable { // TODO: Add function to get canonical kmer using hash key /// Turn a k-mer into a hashval. - fn hash_kmer(&self, kmer: String) -> Result { + pub fn hash_kmer(&self, kmer: String) -> Result { if kmer.len() as u8 != self.ksize { Err(anyhow!("wrong ksize")) } else { diff --git a/src/python/tests/test_kmers_and_hashes.py b/src/python/tests/test_kmers_and_hashes.py index 20be523..aec7cb2 100644 --- a/src/python/tests/test_kmers_and_hashes.py +++ b/src/python/tests/test_kmers_and_hashes.py @@ -53,6 +53,10 @@ def test_basic_mixed(): ("CAAC", 7315150081962684964), ] + for kmer, hashval_rs in x: + print(kmer, hashval_rs, cg.hash_kmer(kmer)) + assert cg.hash_kmer(kmer) == hashval_rs + def test_basic_lower(): "Test that sequences are turned into uppercase appropriately." From 6ad6e9963a6d71f04d9b18d1cd16c86b3f0f52f0 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Wed, 25 Sep 2024 06:47:29 -0700 Subject: [PATCH 11/11] cargo fmt --- src/lib.rs | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 4c8e4e2..0ca4c29 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -535,9 +535,7 @@ impl KmerCountTable { let substr_b_rc = revcomp(&seqb[start..start + ksize]); let substr_rc = std::str::from_utf8(&substr_b_rc).expect("invalid utf-8 sequence for rev comp"); - let hashval = hasher - .next() - .expect("should not run out of hashes"); + let hashval = hasher.next().expect("should not run out of hashes"); // Three options: // * good kmer, all is well, store canonical k-mer and hashval; @@ -556,8 +554,7 @@ impl KmerCountTable { v.push(("".to_owned(), 0)); } } else { - let msg = format!("bad k-mer at position {}: {}", - start, substr); + let msg = format!("bad k-mer at position {}: {}", start, substr); return Err(PyValueError::new_err(msg)); } }