From f211b6debaf35e37258cb59f6ee71a2c46411022 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 21 Oct 2024 22:07:21 +1100 Subject: [PATCH 1/7] Bump serde_json from 1.0.128 to 1.0.132 (#78) Bumps [serde_json](https://github.com/serde-rs/json) from 1.0.128 to 1.0.132. - [Release notes](https://github.com/serde-rs/json/releases) - [Commits](https://github.com/serde-rs/json/compare/1.0.128...1.0.132) --- updated-dependencies: - dependency-name: serde_json dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 4 ++-- Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 8463986..4d2ec70 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1076,9 +1076,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.128" +version = "1.0.132" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ff5456707a1de34e7e37f2a6fd3d3f808c318259cbd01ab6377795054b483d8" +checksum = "d726bfaff4b320266d395898905d0eba0345aae23b54aee3a737e260fd46db03" dependencies = [ "itoa", "memchr", diff --git a/Cargo.toml b/Cargo.toml index b1b5f47..9f695ed 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,5 +16,5 @@ niffler = "2.6.0" pyo3 = { version="0.22.4", features = ["extension-module", "anyhow"] } rayon = "1.10.0" serde = { version = "1.0.210", features = ["derive"] } -serde_json = "1.0.128" +serde_json = "1.0.132" sourmash = "0.15.2" \ No newline at end of file From 05e2dee9e4fc8dc20109d78641b3343302898dcc Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 21 Oct 2024 22:15:24 +1100 Subject: [PATCH 2/7] Bump pyo3 from 0.22.4 to 0.22.5 (#79) Bumps [pyo3](https://github.com/pyo3/pyo3) from 0.22.4 to 0.22.5. - [Release notes](https://github.com/pyo3/pyo3/releases) - [Changelog](https://github.com/PyO3/pyo3/blob/main/CHANGELOG.md) - [Commits](https://github.com/pyo3/pyo3/compare/v0.22.4...v0.22.5) --- updated-dependencies: - dependency-name: pyo3 dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Adam Taranto --- Cargo.lock | 20 ++++++++++---------- Cargo.toml | 2 +- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4d2ec70..f27edf7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -857,9 +857,9 @@ dependencies = [ [[package]] name = "pyo3" -version = "0.22.4" +version = "0.22.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "00e89ce2565d6044ca31a3eb79a334c3a79a841120a98f64eea9f579564cb691" +checksum = "3d922163ba1f79c04bc49073ba7b32fd5a8d3b76a87c955921234b8e77333c51" dependencies = [ "anyhow", "cfg-if", @@ -876,9 +876,9 @@ dependencies = [ [[package]] name = "pyo3-build-config" -version = "0.22.4" +version = "0.22.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d8afbaf3abd7325e08f35ffb8deb5892046fcb2608b703db6a583a5ba4cea01e" +checksum = "bc38c5feeb496c8321091edf3d63e9a6829eab4b863b4a6a65f26f3e9cc6b179" dependencies = [ "once_cell", "target-lexicon", @@ -886,9 +886,9 @@ dependencies = [ [[package]] name = "pyo3-ffi" -version = "0.22.4" +version = "0.22.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec15a5ba277339d04763f4c23d85987a5b08cbb494860be141e6a10a8eb88022" +checksum = "94845622d88ae274d2729fcefc850e63d7a3ddff5e3ce11bd88486db9f1d357d" dependencies = [ "libc", "pyo3-build-config", @@ -896,9 +896,9 @@ dependencies = [ [[package]] name = "pyo3-macros" -version = "0.22.4" +version = "0.22.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "15e0f01b5364bcfbb686a52fc4181d412b708a68ed20c330db9fc8d2c2bf5a43" +checksum = "e655aad15e09b94ffdb3ce3d217acf652e26bbc37697ef012f5e5e348c716e5e" dependencies = [ "proc-macro2", "pyo3-macros-backend", @@ -908,9 +908,9 @@ dependencies = [ [[package]] name = "pyo3-macros-backend" -version = "0.22.4" +version = "0.22.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a09b550200e1e5ed9176976d0060cbc2ea82dc8515da07885e7b8153a85caacb" +checksum = "ae1e3f09eecd94618f60a455a23def79f79eba4dc561a97324bf9ac8c6df30ce" dependencies = [ "heck 0.5.0", "proc-macro2", diff --git a/Cargo.toml b/Cargo.toml index 9f695ed..84b9cff 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,7 +13,7 @@ anyhow = "1.0.89" env_logger = "0.11.5" log = "0.4.22" niffler = "2.6.0" -pyo3 = { version="0.22.4", features = ["extension-module", "anyhow"] } +pyo3 = { version="0.22.5", features = ["extension-module", "anyhow"] } rayon = "1.10.0" serde = { version = "1.0.210", features = ["derive"] } serde_json = "1.0.132" From 30403e2da14d7d88acdf6148c7ee3d3ef95f3201 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 21 Oct 2024 22:15:51 +1100 Subject: [PATCH 3/7] Bump sourmash from 0.15.2 to 0.16.0 (#81) Bumps [sourmash](https://github.com/sourmash-bio/sourmash) from 0.15.2 to 0.16.0. - [Release notes](https://github.com/sourmash-bio/sourmash/releases) - [Changelog](https://github.com/sourmash-bio/sourmash/blob/latest/doc/release.md) - [Commits](https://github.com/sourmash-bio/sourmash/compare/r0.15.2...r0.16.0) --- updated-dependencies: - dependency-name: sourmash dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Adam Taranto --- Cargo.lock | 32 ++++++++++++++++---------------- Cargo.toml | 2 +- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f27edf7..226bae9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -497,9 +497,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.70" +version = "0.3.72" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1868808506b929d7b0cfa8f75951347aa71bb21144b7791bae35d9bccfcfe37a" +checksum = "6a88f1bda2bd75b0452a14784937d796722fdebfe50df998aeb3f0b7603019a9" dependencies = [ "wasm-bindgen", ] @@ -1119,9 +1119,9 @@ checksum = "bceb57dc07c92cdae60f5b27b3fa92ecaaa42fe36c55e22dbfb0b44893e0b1f7" [[package]] name = "sourmash" -version = "0.15.2" +version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a73bae93170d8d0f816e18b6a630d76e134b90958850985ee2f0fb2f641d4de" +checksum = "596f20eac8896a06ca65889399ea6f408deeba375aa44c4a2efb3b46e31a02c0" dependencies = [ "az", "byteorder", @@ -1311,9 +1311,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" -version = "0.2.93" +version = "0.2.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a82edfc16a6c469f5f44dc7b571814045d60404b55a0ee849f9bcfa2e63dd9b5" +checksum = "128d1e363af62632b8eb57219c8fd7877144af57558fb2ef0368d0087bddeb2e" dependencies = [ "cfg-if", "once_cell", @@ -1322,9 +1322,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.93" +version = "0.2.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9de396da306523044d3302746f1208fa71d7532227f15e347e2d93e4145dd77b" +checksum = "cb6dd4d3ca0ddffd1dd1c9c04f94b868c37ff5fac97c30b97cff2d74fce3a358" dependencies = [ "bumpalo", "log", @@ -1337,9 +1337,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.93" +version = "0.2.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "585c4c91a46b072c92e908d99cb1dcdf95c5218eeb6f3bf1efa991ee7a68cccf" +checksum = "e79384be7f8f5a9dd5d7167216f022090cf1f9ec128e6e6a482a2cb5c5422c56" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -1347,9 +1347,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.93" +version = "0.2.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "afc340c74d9005395cf9dd098506f7f44e38f2b4a21c6aaacf9a105ea5e1e836" +checksum = "26c6ab57572f7a24a4985830b120de1594465e5d500f24afe89e16b4e833ef68" dependencies = [ "proc-macro2", "quote", @@ -1360,15 +1360,15 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.93" +version = "0.2.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c62a0a307cb4a311d3a07867860911ca130c3494e8c2719593806c08bc5d0484" +checksum = "65fc09f10666a9f147042251e0dda9c18f166ff7de300607007e96bdebc1068d" [[package]] name = "web-sys" -version = "0.3.70" +version = "0.3.72" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26fdeaafd9bd129f65e7c031593c24d62186301e0c72c8978fa1678be7d532c0" +checksum = "f6488b90108c040df0fe62fa815cbdee25124641df01814dd7282749234c6112" dependencies = [ "js-sys", "wasm-bindgen", diff --git a/Cargo.toml b/Cargo.toml index 84b9cff..03dab77 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,4 +17,4 @@ pyo3 = { version="0.22.5", features = ["extension-module", "anyhow"] } rayon = "1.10.0" serde = { version = "1.0.210", features = ["derive"] } serde_json = "1.0.132" -sourmash = "0.15.2" \ No newline at end of file +sourmash = "0.16.0" \ No newline at end of file From 0f18ad12197d51a3be9df0eb8b07f627bc1c0dea Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 21 Oct 2024 22:30:59 +1100 Subject: [PATCH 4/7] Bump anyhow from 1.0.89 to 1.0.90 (#80) Bumps [anyhow](https://github.com/dtolnay/anyhow) from 1.0.89 to 1.0.90. - [Release notes](https://github.com/dtolnay/anyhow/releases) - [Commits](https://github.com/dtolnay/anyhow/compare/1.0.89...1.0.90) --- updated-dependencies: - dependency-name: anyhow dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Adam Taranto --- Cargo.lock | 4 ++-- Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 226bae9..50606a2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -89,9 +89,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.89" +version = "1.0.90" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86fdf8605db99b54d3cd748a44c6d04df638eb5dafb219b135d0149bd0db01f6" +checksum = "37bf3594c4c988a53154954629820791dde498571819ae4ca50ca811e060cc95" [[package]] name = "approx" diff --git a/Cargo.toml b/Cargo.toml index 03dab77..6ce1c9e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,7 +9,7 @@ name = "oxli" crate-type = ["cdylib"] [dependencies] -anyhow = "1.0.89" +anyhow = "1.0.90" env_logger = "0.11.5" log = "0.4.22" niffler = "2.6.0" From d400eb2bcfd2ffa664301bfc265c65e02af578a6 Mon Sep 17 00:00:00 2001 From: Adam Taranto Date: Mon, 28 Oct 2024 14:34:47 +1100 Subject: [PATCH 5/7] MRG: Make kmers_and_hashes iterable (#70) * Make kmers_and_hashes iterable * Suppress blank output with skip_bad_kmers * update tests to pick up stderr warning instead of PyValueError. Change skip_bad_kmers behaviour. * Implement hash_to_kmer map + unhash() * Tests for count and consume with kmer tracking enabled * rm blank test * Add tests for canon() * Style fixes by Ruff * remove unused tests * Add dump_kmers() * Add tests for dump_kmers() * unsorted dump_kmers() will produce randomly ordered output. Remove from tests. * skip kmers that exist in hash_to_kmer but are missing from count table after being dropped * Add tests for dump_kmers() on KmerCountTables with dropped count records * Style fixes by Ruff * back to action token * disable ruff formatting on dependabot commits * MRG: do a single pass reverse complement in `kmers_and_hashes` (#82) * compute revcomp once, not every time * rm comment * run cargo fmt --------- Co-authored-by: Adamtaranto Co-authored-by: C. Titus Brown --------- --- .github/workflows/ruff.yml | 3 +- src/lib.rs | 399 +++++++++++++++++----- src/python/tests/test_basic.py | 10 +- src/python/tests/test_canonicalization.py | 70 ++++ src/python/tests/test_dump.py | 210 +++++++++++- src/python/tests/test_kmer_map.py | 9 - src/python/tests/test_kmers_and_hashes.py | 212 +++++++++++- src/python/tests/test_output.py | 29 -- 8 files changed, 797 insertions(+), 145 deletions(-) create mode 100644 src/python/tests/test_canonicalization.py delete mode 100644 src/python/tests/test_kmer_map.py delete mode 100644 src/python/tests/test_output.py diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml index 2ee53d2..5061ce3 100644 --- a/.github/workflows/ruff.yml +++ b/.github/workflows/ruff.yml @@ -2,6 +2,7 @@ name: Ruff Formatting on: [pull_request] jobs: ruff: + if: ${{ github.actor != 'dependabot[bot]' }} runs-on: ubuntu-latest permissions: # Give the default GITHUB_TOKEN write permission to commit and push the changed files. @@ -11,7 +12,7 @@ jobs: - uses: actions/checkout@v4 with: ref: ${{ github.sha }} - token: ${{ secrets.GITHUB_TOKEN }} + token: ${{ secrets.ACTION_TOKEN }} - uses: chartboost/ruff-action@v1 with: src: './src/python' diff --git a/src/lib.rs b/src/lib.rs index 0ca4c29..cbded28 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -10,7 +10,7 @@ use anyhow::{anyhow, Result}; use log::debug; use niffler::compression::Format; use niffler::get_writer; -use pyo3::exceptions::{PyIOError, PyValueError}; +use pyo3::exceptions::{PyIOError, PyKeyError, PyValueError}; use pyo3::prelude::*; use pyo3::PyResult; use rayon::prelude::*; @@ -30,28 +30,33 @@ struct KmerCountTable { pub ksize: u8, version: String, consumed: u64, + store_kmers: bool, // Store hash:kmer mapping if true + hash_to_kmer: Option>, } #[pymethods] -/// Methods on KmerCountTable. impl KmerCountTable { + /// Constructor for KmerCountTable #[new] - #[pyo3(signature = (ksize))] - pub fn new(ksize: u8) -> Self { + #[pyo3(signature = (ksize, store_kmers=false))] + pub fn new(ksize: u8, store_kmers: bool) -> Self { + // Optional init HashMap for tracking hash:kmer pairs + let hash_to_kmer = if store_kmers { + Some(HashMap::new()) + } else { + None + }; + // Init new KmerCountTable Self { counts: HashMap::new(), ksize, version: VERSION.to_string(), // Initialize the version field consumed: 0, // Initialize the total sequence length tracker + store_kmers, + hash_to_kmer, } } - // TODO: Optionally store hash:kmer pair when counting a new kmer - // Modify KmerCountTable to optionally store map of hash:kmer - // Modify SeqToHashes to return canonical kmer & hash - - // TODO: Add function to get canonical kmer using hash key - /// Turn a k-mer into a hashval. pub fn hash_kmer(&self, kmer: String) -> Result { if kmer.len() as u8 != self.ksize { @@ -71,6 +76,22 @@ impl KmerCountTable { } } + /// Unhash function to retrieve the canonical kmer for a given hash + pub fn unhash(&self, hash: u64) -> PyResult { + if self.store_kmers { + if let Some(kmer) = self.hash_to_kmer.as_ref().unwrap().get(&hash) { + return Ok(kmer.clone()); + } else { + // Raise KeyError if hash does not exist + let msg = format!("Warning: Hash {} not found in table.", hash); + Err(PyKeyError::new_err(msg)) + } + } else { + // Raise an error if store_kmers is false + Err(PyValueError::new_err("K-mer storage is not enabled.")) + } + } + /// Increment the count of a hashval by 1. pub fn count_hash(&mut self, hashval: u64) -> u64 { let count = self.counts.entry(hashval).or_insert(0); @@ -78,6 +99,44 @@ impl KmerCountTable { *count } + /// Return the canonical form of a k-mer: the lexicographically smaller of the k-mer or its reverse complement. + fn canon(&self, kmer: &str) -> PyResult { + // Check if the k-mer length matches the table ksize + if kmer.len() != self.ksize as usize { + return Err(PyValueError::new_err( + "kmer size does not match count table ksize", + )); + } + + // Convert k-mer to uppercase + let kmer_upper = kmer.to_uppercase(); + + // Ensure k-mer contains only valid DNA characters + if !kmer_upper.chars().all(|c| "ATCG".contains(c)) { + return Err(PyValueError::new_err("kmer contains invalid characters")); + } + + // Compute the reverse complement + let rev_comp: String = kmer_upper + .chars() + .rev() + .map(|c| match c { + 'A' => 'T', + 'T' => 'A', + 'C' => 'G', + 'G' => 'C', + _ => c, // This should not happen due to earlier validation + }) + .collect(); + + // Return the lexicographically smaller of kmer or its reverse complement + if kmer_upper <= rev_comp { + Ok(kmer_upper) + } else { + Ok(rev_comp) + } + } + /// Increment the count of a k-mer by 1. pub fn count(&mut self, kmer: String) -> PyResult { if kmer.len() as u8 != self.ksize { @@ -85,10 +144,21 @@ impl KmerCountTable { "kmer size does not match count table ksize", )) } else { - self.consumed += kmer.len() as u64; - let hashval = self.hash_kmer(kmer)?; - let count = self.count_hash(hashval); - Ok(count) + let hashval = self.hash_kmer(kmer.clone())?; // Clone the kmer before passing it to hash_kmer + let count = self.count_hash(hashval); // count with count_hash() function, return tally + self.consumed += kmer.len() as u64; // Add kmer len to total consumed bases + + if self.store_kmers { + // Get the canonical k-mer + let canonical_kmer = self.canon(&kmer)?; + // Optional: Store hash:kmer pair + self.hash_to_kmer + .as_mut() + .unwrap() + .insert(hashval, canonical_kmer); + } + + Ok(count) // Return the current total count for the hash } } @@ -308,6 +378,81 @@ impl KmerCountTable { } } + /// Dump (canonical_kmer,count) pairs, optional sorted by count or canonical kmer. + /// + /// # Arguments + /// * `file` - Optional file path to write the output. If not provided, returns a list of tuples. + /// * `sortkeys` - Optional flag to sort by canonical kmers (default: False). + /// * `sortcounts` - Sort on counts, secondary sort on canonical kmers. (default: False). + #[pyo3(signature = (file=None, sortcounts=false, sortkeys=false))] + pub fn dump_kmers( + &self, + file: Option, + sortcounts: bool, + sortkeys: bool, + ) -> PyResult> { + // Ensure that the hash:kmer mapping is stored + if !self.store_kmers { + return Err(PyValueError::new_err( + "K-mer storage is disabled. No hash:kmer map is available.", + )); + } + + // Raise an error if both sortcounts and sortkeys are true + if sortcounts && sortkeys { + return Err(PyValueError::new_err( + "Cannot sort by both counts and kmers at the same time.", + )); + } + + // Collect canonical k-mers and their counts, skipping those not found in the counts table + let mut kmer_count_pairs: Vec<(&String, &u64)> = self + .hash_to_kmer + .as_ref() + .unwrap() + .par_iter() // Use rayon for parallel iteration + .filter_map(|(&hash, kmer)| { + // Use filter_map to only include (kmer, count) pairs where the count exists + self.counts.get(&hash).map(|count| (kmer, count)) + }) + .collect(); + + // Handle sorting based on the flags + if sortkeys { + // Sort by canonical kmer lexicographically + kmer_count_pairs.par_sort_by_key(|&(kmer, _)| kmer.clone()); + } else if sortcounts { + // Sort by count, secondary sort by kmer + kmer_count_pairs.par_sort_by(|&(kmer1, count1), &(kmer2, count2)| { + count1.cmp(count2).then_with(|| kmer1.cmp(kmer2)) + }); + } + // If both sortcounts and sortkeys are false, no sorting is done. + + // If a file is provided, write to the file + if let Some(filepath) = file { + let f = File::create(filepath)?; + let mut writer = BufWriter::new(f); + + // Write each kmer:count pair to the file + for (kmer, count) in kmer_count_pairs { + writeln!(writer, "{}\t{}", kmer, count)?; + } + + writer.flush()?; // Ensure all data is written to the file + Ok(vec![]) // Return an empty vector when writing to a file + } else { + // Convert the vector of references to owned values + let result: Vec<(String, u64)> = kmer_count_pairs + .into_par_iter() // Use rayon for parallel conversion + .map(|(kmer, &count)| (kmer.clone(), count)) + .collect(); + + // Return the vector of (kmer, count) tuples + Ok(result) + } + } + /// Calculates the frequency histogram for k-mer counts /// Returns a vector of tuples (frequency, count), where 'frequency' is /// the observed number of times a k-mer count occurred and 'count' is @@ -397,35 +542,65 @@ impl KmerCountTable { // exit with error. #[pyo3(signature = (seq, skip_bad_kmers=true))] pub fn consume(&mut self, seq: String, skip_bad_kmers: bool) -> PyResult { - let hashes = SeqToHashes::new( - seq.as_bytes(), - self.ksize.into(), - skip_bad_kmers, - false, - HashFunctions::Murmur64Dna, - 42, - ); - + // Incoming seq len + let new_len = seq.len(); + // Init tally for consumed kmers let mut n = 0; - for hash_value in hashes { - // eprintln!("hash_value: {:?}", hash_value); - match hash_value { - Ok(0) => continue, - Ok(x) => { - self.count_hash(x); - () - } - Err(_) => { - let msg = format!("bad k-mer encountered at position {}", n); - return Err(PyValueError::new_err(msg)); + // If store_kmers is true, then count & log hash:kmer pairs + if self.store_kmers { + // Create an iterator for (canonical_kmer, hash) pairs + let mut iter = KmersAndHashesIter::new(seq, self.ksize as usize, skip_bad_kmers); + + // Iterate over the k-mers and their hashes + while let Some(result) = iter.next() { + match result { + Ok((kmer, hash)) => { + if hash != 0 { + // Insert hash:kmer pair into the hashmap + self.hash_to_kmer + .as_mut() + .unwrap() + .insert(hash, kmer.clone()); + // Increment the count for the hash + *self.counts.entry(hash).or_insert(0) += 1; + // Tally kmers added + n += 1; + } + } + Err(e) => return Err(e), } } + } else { + // Else, hash and count kmers as usual + let hashes = SeqToHashes::new( + seq.as_bytes(), + self.ksize.into(), + skip_bad_kmers, + false, + HashFunctions::Murmur64Dna, + 42, + ); - n += 1; + for hash_value in hashes { + // eprintln!("hash_value: {:?}", hash_value); + match hash_value { + Ok(0) => continue, + Ok(x) => { + self.count_hash(x); + () + } + Err(_) => { + let msg = format!("bad k-mer encountered at position {}", n); + return Err(PyValueError::new_err(msg)); + } + } + + n += 1; + } } // Update the total sequence consumed tracker - self.consumed += seq.len() as u64; + self.consumed += new_len as u64; Ok(n) } @@ -504,58 +679,22 @@ impl KmerCountTable { Ok(()) } + #[pyo3(signature = (seq, skip_bad_kmers=true))] pub fn kmers_and_hashes( &self, seq: String, skip_bad_kmers: bool, ) -> PyResult> { - // TODO: optimize RC calculation - // TODO: confirm that there are no more hashes left? unreachable? - let seq = seq.to_ascii_uppercase(); - let seqb = seq.as_bytes(); - - let mut hasher = SeqToHashes::new( - seqb, - self.ksize.into(), - skip_bad_kmers, - false, - HashFunctions::Murmur64Dna, - 42, - ); + let mut v: Vec<(String, u64)> = vec![]; - let ksize = self.ksize as usize; - let end: usize = seq.len() - ksize + 1; + // Create the iterator + let mut iter = KmersAndHashesIter::new(seq, self.ksize as usize, skip_bad_kmers); - let mut v: Vec<(String, u64)> = vec![]; - for start in 0..end { - let substr = &seq[start..start + ksize]; - // CTB: this calculates RC each time, instead of doing so - // using a sliding window. It's easy and works, so I'm - // starting here :). - let substr_b_rc = revcomp(&seqb[start..start + ksize]); - let substr_rc = - std::str::from_utf8(&substr_b_rc).expect("invalid utf-8 sequence for rev comp"); - let hashval = hasher.next().expect("should not run out of hashes"); - - // Three options: - // * good kmer, all is well, store canonical k-mer and hashval; - // * bad k-mer allowed by skip_bad_kmers, and signaled by - // hashval == 0): return empty string & 0; - // * bad k-mer not allowed, raise error - if let Ok(hashval) = hashval { - if hashval > 0 { - let canonical_kmer = if substr < substr_rc { - substr - } else { - substr_rc - }; - v.push((canonical_kmer.to_string(), hashval)); - } else { - v.push(("".to_owned(), 0)); - } - } else { - let msg = format!("bad k-mer at position {}: {}", start, substr); - return Err(PyValueError::new_err(msg)); + // Collect the k-mers and their hashes + while let Some(result) = iter.next() { + match result { + Ok((kmer, hash)) => v.push((kmer, hash)), + Err(e) => return Err(e), } } @@ -638,6 +777,104 @@ impl KmerCountTableIterator { } } +pub struct KmersAndHashesIter { + seq: String, // The sequence to iterate over + seq_rc: String, // reverse complement sequence + ksize: usize, // K-mer size + pos: usize, // Current position in the sequence + end: usize, // The end position for k-mer extraction + hasher: SeqToHashes, // Iterator for generating hashes + skip_bad_kmers: bool, // Flag to skip bad k-mers +} + +impl KmersAndHashesIter { + pub fn new(seq: String, ksize: usize, skip_bad_kmers: bool) -> Self { + let seq = seq.to_ascii_uppercase(); // Ensure uppercase for uniformity + let seqb = seq.as_bytes().to_vec(); // Convert to bytes for hashing + let seqb_rc = revcomp(&seqb); + let seq_rc = std::str::from_utf8(&seqb_rc) + .expect("invalid utf-8 sequence for rev comp") + .to_string(); + + let end = seq.len() - ksize + 1; // Calculate the endpoint for k-mer extraction + let hasher = SeqToHashes::new( + &seqb, + ksize.into(), + true, // Set force to true, bad kmers will emit hash=0 instead of killing process + false, // Other flags, e.g., reverse complement + HashFunctions::Murmur64Dna, + 42, // Seed for hashing + ); + + Self { + seq, + seq_rc, + ksize, + pos: 0, // Start at the beginning of the sequence + end, + hasher, + skip_bad_kmers, + } + } +} + +impl Iterator for KmersAndHashesIter { + type Item = PyResult<(String, u64)>; + + fn next(&mut self) -> Option { + // Check if we've reached the end of the sequence + if self.pos >= self.end { + return None; + } + + let start = self.pos; + let ksize = self.ksize; + let rpos = self.end - start - 1; + + // Extract the current k-mer and its reverse complement + let substr = &self.seq[start..start + ksize]; + let substr_rc = &self.seq_rc[rpos..rpos + ksize]; + + // Get the next hash value from the hasher + let hashval = self.hasher.next().expect("should not run out of hashes"); + + // Increment position for the next k-mer + self.pos += 1; + + // Handle hash value logic + if let Ok(hashval) = hashval { + // Good kmer, all is well, store canonical k-mer and hashval; + if hashval > 0 { + // Select the canonical k-mer (lexicographically smaller between forward and reverse complement) + let canonical_kmer = if substr < substr_rc { + substr + } else { + substr_rc + }; + // If valid hash, return (canonical_kmer,hashval) tuple + Some(Ok((canonical_kmer.to_string(), hashval))) + } else { + // If the hash is 0, handle based on `skip_bad_kmers` + // Prepare msg identifying bad kmer + let msg = format!("bad k-mer at position {}: {}", start + 1, substr); + if self.skip_bad_kmers { + // Print a message and skip adding the bad k-mer to the result + eprintln!("{}", msg); + self.next() // Recursively call `next()` to skip this k-mer + } else { + // If skip_bad_kmer is false, return an empty string and 0, but still print a message + eprintln!("{}", msg); + Some(Ok(("".to_string(), 0))) + } + } + } else { + // If error raised by SeqToHashes + let msg = format!("bad k-mer at position {}: {}", start + 1, substr); + Some(Err(PyValueError::new_err(msg))) + } + } +} + // Python module definition #[pymodule] fn oxli(m: &Bound<'_, PyModule>) -> PyResult<()> { diff --git a/src/python/tests/test_basic.py b/src/python/tests/test_basic.py index 0992b91..62262f6 100644 --- a/src/python/tests/test_basic.py +++ b/src/python/tests/test_basic.py @@ -166,8 +166,8 @@ def test_get_hash_array(): assert rev_counts == [0, 1, 2], "Count should be in same order as input list" -def test_get_array(): - """ - Get vector of counts corresponding to vector of kmers. - """ - pass +# def test_get_array(): +# """ +# Get vector of counts corresponding to vector of kmers. +# """ +# pass diff --git a/src/python/tests/test_canonicalization.py b/src/python/tests/test_canonicalization.py new file mode 100644 index 0000000..46cc639 --- /dev/null +++ b/src/python/tests/test_canonicalization.py @@ -0,0 +1,70 @@ +import pytest + +import oxli + + +def test_canon_kmer(): + """Test the canon() function to ensure it returns the lexicographically smaller k-mer.""" + kmer_table = oxli.KmerCountTable(ksize=4, store_kmers=True) + + # Test k-mer with reverse complement + assert kmer_table.canon("AAAA") == "AAAA", "Expected canonical form to be 'AAAA'" + assert kmer_table.canon("TTTT") == "AAAA", "Expected canonical form to be 'AAAA'" + assert kmer_table.canon("ATCG") == "ATCG", "Expected canonical form to be 'CGAT'" + assert kmer_table.canon("CGAT") == "ATCG", "Expected canonical form to be 'CGAT'" + + +def test_count_with_canonical_kmer(): + """Test the count() function to ensure it stores the canonical k-mer.""" + kmer_table = oxli.KmerCountTable(ksize=4, store_kmers=True) + kmer = "TTTT" + # Count a k-mer and its reverse complement + kmer_table.count(kmer) + kmer_table.count(kmer) + + # Check that the canonical k-mer is stored + hashval = kmer_table.hash_kmer(kmer) + assert kmer_table.unhash(hashval) == "AAAA", "Expected canonical k-mer 'AAAA'" + + # Check that the count for the canonical k-mer is correct (should be 2) + assert kmer_table.get_hash(hashval) == 2, "Expected count of 2 for k-mer 'AAAA'" + + +def test_canon_invalid_kmer_size(): + """ + Test that canon() raises a ValueError when the k-mer length does not match the expected ksize. + """ + kmer_table = oxli.KmerCountTable( + ksize=4, store_kmers=True + ) # Create a KmerCountTable with ksize=4 + + # K-mer too short + with pytest.raises(ValueError, match="kmer size does not match count table ksize"): + kmer_table.canon("AAA") # 3-mer for a 4-mer table should raise an error + + # K-mer too long + with pytest.raises(ValueError, match="kmer size does not match count table ksize"): + kmer_table.canon("AAAAA") # 5-mer for a 4-mer table should raise an error + + +def test_canon_invalid_dna_characters(): + """ + Test that canon() raises a ValueError when the k-mer contains non-DNA characters. + """ + kmer_table = oxli.KmerCountTable( + ksize=4, store_kmers=True + ) # Create a KmerCountTable with ksize=4 + + # Test lowercase conversion + canon_g = kmer_table.canon("gggg") + assert canon_g == "CCCC", "Lowercase gggg should be converted to CCCC" + + # K-mer with non-DNA character 'X' + with pytest.raises(ValueError, match="kmer contains invalid characters"): + kmer_table.canon("ATXG") # Invalid character 'X' should raise an error + + # K-mer with lowercase and invalid character 'B' + with pytest.raises(ValueError, match="kmer contains invalid characters"): + kmer_table.canon( + "aTbG" + ) # Lowercase is fine, but 'b' is not a valid DNA character diff --git a/src/python/tests/test_dump.py b/src/python/tests/test_dump.py index 92a7be8..30cc7cc 100644 --- a/src/python/tests/test_dump.py +++ b/src/python/tests/test_dump.py @@ -9,7 +9,7 @@ @pytest.fixture def kmer_count_table(): """Fixture to set up a KmerCountTable instance with sample data.""" - kct = KmerCountTable(ksize=4) + kct = KmerCountTable(ksize=4, store_kmers=True) kct.count("AAAA") # 17832910516274425539 kct.count("TTTT") # 17832910516274425539 kct.count("AATT") # 382727017318141683 @@ -21,7 +21,7 @@ def kmer_count_table(): @pytest.fixture def empty_kmer_count_table(): """Fixture to set up an empty KmerCountTable instance.""" - return KmerCountTable(ksize=4) + return KmerCountTable(ksize=4, store_kmers=True) def test_dump_conflicting_sort_options(kmer_count_table): @@ -165,3 +165,209 @@ def test_dump_hash_empty_table(empty_kmer_count_table): # Cleanup remove(temp_file_path) + + +# Tests for dump_kmers() + + +def test_dump_kmers_conflicting_sort_options(kmer_count_table): + """Test that passing both sortcounts=True and sortkeys=True raises a ValueError.""" + with pytest.raises( + ValueError, match="Cannot sort by both counts and kmers at the same time." + ): + kmer_count_table.dump_kmers(file=None, sortcounts=True, sortkeys=True) + + +def test_dump_kmers_sortcounts_with_ties(kmer_count_table): + """Test the dump_kmers function with sortcounts=True, ensuring it handles ties in counts.""" + result = kmer_count_table.dump_kmers(file=None, sortcounts=True, sortkeys=False) + + # Expected output sorted by count, with secondary sorting by kmer for ties + expected = [ + ("AATT", 1), + ( + "AAAA", + 2, + ), # 'AAAA'/'TTTT' is tied with 'GGGG / CCCC' on counts, 'AAAA' is lexicographically smaller + ("CCCC", 2), + ] + + assert result == expected, f"Expected {expected}, but got {result}" + + +def test_dump_kmers_single_kmer(): + """Test the dump_kmers function with only a single k-mer counted.""" + kct = KmerCountTable(ksize=4, store_kmers=True) + kct.count("AAAA") # Canonical kmer: 'AAAA' + + result = kct.dump_kmers(file=None, sortcounts=True, sortkeys=False) + + expected = [("AAAA", 1)] + + assert result == expected, f"Expected {expected}, but got {result}" + + +def test_dump_kmers_write_to_file(kmer_count_table): + """Test the dump_kmers function when writing to a file. + + This test checks if the function correctly writes the kmer:count pairs to a file. + """ + with tempfile.NamedTemporaryFile(delete=False) as temp_file: + temp_file_path = temp_file.name + + kmer_count_table.dump_kmers(file=temp_file_path, sortcounts=True, sortkeys=False) + + with open(temp_file_path, "r") as f: + lines = f.readlines() + + # Expected output sorted by count then kmer (default behavior) + expected_lines = [ + f"AATT\t1\n", + f"AAAA\t2\n", # 'AAAA'/'TTTT' + f"CCCC\t2\n", + ] + + assert lines == expected_lines, f"Expected {expected_lines}, but got {lines}" + + # Cleanup + remove(temp_file_path) + + +def test_dump_kmers_write_to_file_sortkeys(kmer_count_table): + """Test the dump_kmers function with sortkeys=True when writing to a file.""" + with tempfile.NamedTemporaryFile(delete=False) as temp_file: + temp_file_path = temp_file.name + + kmer_count_table.dump_kmers(file=temp_file_path, sortkeys=True) + + with open(temp_file_path, "r") as f: + lines = f.readlines() + + # Expected output sorted by canonical kmers + expected_lines = [ + f"AAAA\t2\n", # 'AAAA'/'TTTT' + f"AATT\t1\n", + f"CCCC\t2\n", + ] + + assert lines == expected_lines, f"Expected {expected_lines}, but got {lines}" + + # Cleanup + remove(temp_file_path) + + +def test_dump_kmers_sortkeys(kmer_count_table): + """Test the dump_kmers function with sortkeys=True. + + This test verifies if the function sorts by canonical k-mers when `sortkeys` is set to True. + """ + result = kmer_count_table.dump_kmers(file=None, sortkeys=True) + + # Expected output sorted by canonical kmer + expected = [ + ("AAAA", 2), # 'AAAA'/'TTTT' + ("AATT", 1), + ("CCCC", 2), + ] + + assert result == expected, f"Expected {expected}, but got {result}" + + +def test_dump_kmers_invalid_file_path(kmer_count_table): + """Test that passing an invalid file path raises an error.""" + with pytest.raises(OSError): + kmer_count_table.dump_kmers(file="", sortkeys=True) + + +def test_dump_kmers_empty_table(empty_kmer_count_table): + """Test the dump_kmers function on an empty KmerCountTable. + + This test checks that the function handles an empty table correctly. + """ + # Test that calling dump_kmers without file returns an empty list + result = empty_kmer_count_table.dump_kmers(file=None, sortkeys=False) + assert result == [], "Expected an empty list from an empty KmerCountTable" + + # Test that calling dump_kmers with a file writes nothing to the file + with tempfile.NamedTemporaryFile(delete=False) as temp_file: + temp_file_path = temp_file.name + + empty_kmer_count_table.dump_kmers(file=temp_file_path, sortkeys=False) + + with open(temp_file_path, "r") as f: + lines = f.readlines() + + assert lines == [], "Expected an empty file for an empty KmerCountTable" + + # Cleanup + remove(temp_file_path) + + +def test_drop_removes_kmer(kmer_count_table): + """ + Test that the `drop()` method correctly removes a k-mer using its string representation. + Verify that the `dump_kmers()` function returns the remaining (kmer, count) pairs. + """ + # Drop the k-mer "AATT" + kmer_count_table.drop("AATT") + + # Get the remaining k-mers using dump_kmers + remaining_kmers = kmer_count_table.dump_kmers() + + # Check that "AATT" has been removed and other k-mers are still present + assert ("AATT", 1) not in remaining_kmers + assert ("AAAA", 2) in remaining_kmers + assert ("CCCC", 2) in remaining_kmers + + +def test_drop_hash_removes_kmer(kmer_count_table): + """ + Test that the `drop_hash()` method correctly removes a k-mer using its hash value. + Verify that the `dump_kmers()` function returns the remaining (kmer, count) pairs. + """ + # Hash of "GGGG" is 73459868045630124 + kmer_count_table.drop_hash(73459868045630124) + + # Get the remaining k-mers using dump_kmers + remaining_kmers = kmer_count_table.dump_kmers() + + # Check that "GGGG/CCCC" has been removed and other k-mers are still present + assert ("CCCC", 2) not in remaining_kmers + assert ("AAAA", 2) in remaining_kmers + assert ("AATT", 1) in remaining_kmers + + +def test_mincut_removes_low_count_kmers(kmer_count_table): + """ + Test that the `mincut()` method correctly removes k-mers with counts below a threshold. + Verify that the `dump_kmers()` function returns the remaining (kmer, count) pairs. + """ + # Remove all k-mers with counts less than 2 + kmer_count_table.mincut(2) + + # Get the remaining k-mers using dump_kmers + remaining_kmers = kmer_count_table.dump_kmers() + + # Check that only "GGGG/CCCC" remains because its count is 2 + assert len(remaining_kmers) == 2 + assert ("CCCC", 2) in remaining_kmers + assert ("AAAA", 2) in remaining_kmers + assert ("AATT", 1) not in remaining_kmers + + +def test_maxcut_removes_high_count_kmers(kmer_count_table): + """ + Test that the `maxcut()` method correctly removes k-mers with counts above a threshold. + Verify that the `dump_kmers()` function returns the remaining (kmer, count) pairs. + """ + # Remove all k-mers with counts greater than 1 + kmer_count_table.maxcut(1) + + # Get the remaining k-mers using dump_kmers + remaining_kmers = kmer_count_table.dump_kmers() + + # Check that "GGGG/CCCC" has been removed and other k-mers with count 1 remain + assert len(remaining_kmers) == 1 + assert ("CCCC", 2) not in remaining_kmers + assert ("AAAA", 2) not in remaining_kmers + assert ("AATT", 1) in remaining_kmers diff --git a/src/python/tests/test_kmer_map.py b/src/python/tests/test_kmer_map.py deleted file mode 100644 index aa9850c..0000000 --- a/src/python/tests/test_kmer_map.py +++ /dev/null @@ -1,9 +0,0 @@ -import pytest - -import oxli -from test_basic import create_sample_kmer_table - - -def test_kmermap(): - """Test option to add kmermap""" - pass diff --git a/src/python/tests/test_kmers_and_hashes.py b/src/python/tests/test_kmers_and_hashes.py index e58656e..31c4e58 100644 --- a/src/python/tests/test_kmers_and_hashes.py +++ b/src/python/tests/test_kmers_and_hashes.py @@ -3,14 +3,6 @@ import oxli -# Helper function, create tables. -def create_sample_kmer_table(ksize, kmers): - table = oxli.KmerCountTable(ksize) - for kmer in kmers: - table.count(kmer) - return table - - def test_basic(): "string containing only forward canonical kmers." seq = "ATAAACC" # all forward k-mers @@ -72,30 +64,38 @@ def test_basic_lower(): ] -def test_bad_kmers_raise_error(): - "Test that bad k-mers raise a ValueError with info" +def test_bad_kmers_raise_warning(capfd): + "Test that bad k-mers print warning with info" seq = "acxttg" cg = oxli.KmerCountTable(ksize=4) - with pytest.raises(ValueError, match="bad k-mer at position 0: ACXT"): - x = cg.kmers_and_hashes(seq, False) + # Capture stderr output + x = cg.kmers_and_hashes(seq, False) + captured = capfd.readouterr() + + # Check for warning in stderr + assert f"bad k-mer at position 1: ACXT" in captured.err -def test_bad_kmers_raise_error_2(): +def test_bad_kmers_raise_warning_2(capfd): "Test bad k-mers raise the right error even when not at beginning :)" seq = "aattxttgg" cg = oxli.KmerCountTable(ksize=4) - with pytest.raises(ValueError, match="bad k-mer at position 1: ATTX"): - x = cg.kmers_and_hashes(seq, False) + # Capture stderr output + x = cg.kmers_and_hashes(seq, False) + captured = capfd.readouterr() + + # Check for warning in stderr + assert f"bad k-mer at position 2: ATTX" in captured.err -def test_bad_kmers_allowed(): - "Test that bad k-mers are allowed when skip_bad_kmers is True" +def test_report_bad_kmers(): + "Test that bad k-mers are reported as (" ",0) when skip_bad_kmers is False" seq = "aattxttgg" cg = oxli.KmerCountTable(ksize=4) - x = cg.kmers_and_hashes(seq, True) + x = cg.kmers_and_hashes(seq, False) print(x) assert x == [ ("AATT", 382727017318141683), @@ -105,3 +105,179 @@ def test_bad_kmers_allowed(): ("", 0), ("CCAA", 1798905482136869687), ] + + +def test_skip_bad_kmers(): + "Test that bad k-mers are ommited when skip_bad_kmers is True" + seq = "aattxttgg" + cg = oxli.KmerCountTable(ksize=4) + + x = cg.kmers_and_hashes(seq, True) + print(x) + assert x == [ + ("AATT", 382727017318141683), + ("CCAA", 1798905482136869687), + ] + + +# Tests for hash:kmer storage and retreival + + +def test_count_saves_kmer(): + """Test that count() stores k-mers and their corresponding hashes when store_kmers=True.""" + kmer = "AAAA" + cg = oxli.KmerCountTable(ksize=4, store_kmers=True) + + # Call count() on a k-mer + count = cg.count(kmer) + + # Check that the k-mer was counted + assert count == 1, f"Expected count to be 1 after first insertion, but got {count}" + + # Hash value of the k-mer should now exist in the hash_to_kmer map + hashval = cg.hash_kmer(kmer) + + # Check that the k-mer is stored correctly in the hash_to_kmer map + stored_kmer = cg.unhash(hashval) + assert ( + stored_kmer == kmer + ), f"Expected stored k-mer to be {kmer}, but got {stored_kmer}" + + +def test_count_saves_canonical_kmer(): + """Test that count() stores correct canonical form of k-mers and their corresponding hashes when store_kmers=True.""" + cg = oxli.KmerCountTable(ksize=4, store_kmers=True) + kmer = "TTTT" + canon_kmer = "AAAA" + + # Call count() on a k-mer + cg.count(kmer) + + # Hash value of the k-mer should now exist in the hash_to_kmer map + hashval = cg.hash_kmer(kmer) + + # Check that the k-mer is stored correctly in the hash_to_kmer map + stored_kmer = cg.unhash(hashval) + + assert ( + stored_kmer == canon_kmer + ), f"Expected stored k-mer to be {canon_kmer}, but got {stored_kmer}" + + +def test_consume_saves_kmers(): + """Test that consume() processes a sequence and stores k-mers and their hashes.""" + seq = "ACGTTG" + cg = oxli.KmerCountTable(ksize=4, store_kmers=True) + + # Consume the sequence, expecting 3 k-mers ("ACGT", "AACG", "CAAC") + n_kmers = cg.consume(seq) + + # Check that 3 k-mers were processed + assert n_kmers == 3, f"Expected to consume 3 k-mers, but got {n_kmers}" + + # Check that all k-mers are stored in the hash_to_kmer map + for kmer in ["ACGT", "AACG", "CAAC"]: + hashval = cg.hash_kmer(kmer) + stored_kmer = cg.unhash(hashval) + assert ( + stored_kmer == kmer + ), f"Expected stored k-mer to be {kmer}, but got {stored_kmer}" + + +def test_count_increments_kmer(): + """Test that count() increments the count of a k-mer when called multiple times.""" + kmer = "AAAA" + rev_kmer = "TTTT" + cg = oxli.KmerCountTable(ksize=4, store_kmers=True) + + # Call count() twice on the same k-mer + count1 = cg.count(kmer) + count2 = cg.count(rev_kmer) + + # Check that the count has incremented + assert ( + count1 == 1 + ), f"Expected count to be 1 after first insertion, but got {count1}" + assert ( + count2 == 2 + ), f"Expected count to be 2 after second insertion, but got {count2}" + + # Ensure the k-mer is still stored correctly in hash_to_kmer + hashval = cg.hash_kmer(kmer) + stored_kmer = cg.unhash(hashval) + assert ( + stored_kmer == kmer + ), f"Expected stored k-mer to be {kmer}, but got {stored_kmer}" + + +def test_consume_increments_kmers(): + """Test that consume() increments k-mer counts when the same k-mers are encountered.""" + sequence = "AAAAACCCC" # Contains overlapping "AAAA" twice + cg = oxli.KmerCountTable(ksize=4, store_kmers=True) + + # Consume the sequence, expecting 6 k-mers (AAAA, AAAA, AAAC, AACC, ACCC, CCCC) + n_kmers = cg.consume(sequence) + + # Check that 6 k-mers were processed + assert n_kmers == 6, f"Expected to consume 6 k-mers, but got {n_kmers}" + + # Check that the count for "AAAA" is now 2 + assert cg.get("AAAA") == 2, "Expected count for 'AAAA' to be 2" + + +def test_unhash_invalid_kmer(): + """Test that unhash() raises an error when given an invalid hash.""" + cg = oxli.KmerCountTable(ksize=4, store_kmers=True) + cg.count("AAAA") + + invalid_hash = 1234567890 # A hash that doesn't exist + + # Expecting an exception when trying to unhash an invalid value + with pytest.raises( + KeyError, match=f"Warning: Hash {invalid_hash} not found in table." + ): + cg.unhash(invalid_hash) + + +def test_unhash_no_kmer_table(): + """Test that unhash() raises an error when used on a count table without kmer tracking.""" + cg = oxli.KmerCountTable(ksize=3, store_kmers=False) + kmer = "AAA" + cg.count(kmer) + + real_hash = cg.hash_kmer(kmer) + + # Expecting an exception when trying to unhash an invalid value + with pytest.raises(ValueError, match="K-mer storage is not enabled."): + cg.unhash(real_hash) + + +def test_consume_invalid_kmers(capfd): + """Test that consume() processes a sequence and stores k-mers and their hashes.""" + seq = "XAAAAAXGGGG" + cg = oxli.KmerCountTable(ksize=3, store_kmers=True) + + # Consume the sequence, expecting 5 k-mers ("AAA", "AAA", "AAA", "GGG", "GGG") + n_kmers = cg.consume(seq) # [(10679328328772601858, 3), (12126843654075378313, 2)] + # Capture stderr warnings for bad kmers + captured = capfd.readouterr() + + # Check for warnings in stderr + assert "bad k-mer at position 1: XAA" in captured.err + assert "bad k-mer at position 5: AAX" in captured.err + assert "bad k-mer at position 6: AXG" in captured.err + assert "bad k-mer at position 7: XGG" in captured.err + + # Check that 5 k-mers were processed + assert n_kmers == 5, f"Expected to consume 2 k-mers, but got {n_kmers}" + + # Check 2 distinct kmers + assert len(cg) == 2, "Expected exactly 2 distinct kmers" + + # Check that all k-mers are stored in the hash_to_kmer map + for kmer in ["AAA", "CCC"]: + hashval = cg.hash_kmer(kmer) + stored_kmer = cg.unhash(hashval) + assert ( + stored_kmer == kmer + ), f"Expected stored k-mer to be {kmer}, but got {stored_kmer}" diff --git a/src/python/tests/test_output.py b/src/python/tests/test_output.py deleted file mode 100644 index 8dde753..0000000 --- a/src/python/tests/test_output.py +++ /dev/null @@ -1,29 +0,0 @@ -import pytest - -import oxli -from test_basic import create_sample_kmer_table - - -def test_serialise(): - """Serialise object to JSON""" - pass - - -def test_deserialise(): - """Load object from file.""" - pass - - -def test_dump(): - """Write tab delimited kmer:count pairs""" - pass - - -def test_dump_hash(): - """Write tab delimited hash_count pairs""" - pass - - -def test_histo(): - """Write frequency counts.""" - pass From 38bd24913b58c3ecf2b48505cc18d62b2d13e635 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 28 Oct 2024 22:05:17 +1100 Subject: [PATCH 6/7] Bump anyhow from 1.0.90 to 1.0.91 (#84) Bumps [anyhow](https://github.com/dtolnay/anyhow) from 1.0.90 to 1.0.91. - [Release notes](https://github.com/dtolnay/anyhow/releases) - [Commits](https://github.com/dtolnay/anyhow/compare/1.0.90...1.0.91) --- updated-dependencies: - dependency-name: anyhow dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 4 ++-- Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 50606a2..43872f7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -89,9 +89,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.90" +version = "1.0.91" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37bf3594c4c988a53154954629820791dde498571819ae4ca50ca811e060cc95" +checksum = "c042108f3ed77fd83760a5fd79b53be043192bb3b9dba91d8c574c0ada7850c8" [[package]] name = "approx" diff --git a/Cargo.toml b/Cargo.toml index 6ce1c9e..8f95c10 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,7 +9,7 @@ name = "oxli" crate-type = ["cdylib"] [dependencies] -anyhow = "1.0.90" +anyhow = "1.0.91" env_logger = "0.11.5" log = "0.4.22" niffler = "2.6.0" From 5f27d68cb96c9a5b96f3862402ab5c3a20aa4c72 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 28 Oct 2024 22:14:20 +1100 Subject: [PATCH 7/7] Bump serde from 1.0.210 to 1.0.213 (#85) Bumps [serde](https://github.com/serde-rs/serde) from 1.0.210 to 1.0.213. - [Release notes](https://github.com/serde-rs/serde/releases) - [Commits](https://github.com/serde-rs/serde/compare/v1.0.210...v1.0.213) --- updated-dependencies: - dependency-name: serde dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Adam Taranto --- Cargo.lock | 12 ++++++------ Cargo.toml | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 43872f7..0f0e425 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1056,18 +1056,18 @@ dependencies = [ [[package]] name = "serde" -version = "1.0.210" +version = "1.0.213" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8e3592472072e6e22e0a54d5904d9febf8508f65fb8552499a1abc7d1078c3a" +checksum = "3ea7893ff5e2466df8d720bb615088341b295f849602c6956047f8f80f0e9bc1" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.210" +version = "1.0.213" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "243902eda00fad750862fc144cea25caca5e20d615af0a81bee94ca738f1df1f" +checksum = "7e85ad2009c50b58e87caa8cd6dac16bdf511bbfb7af6c33df902396aa480fa5" dependencies = [ "proc-macro2", "quote", @@ -1192,9 +1192,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.77" +version = "2.0.85" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f35bcdf61fd8e7be6caf75f429fdca8beb3ed76584befb503b1569faee373ed" +checksum = "5023162dfcd14ef8f32034d8bcd4cc5ddc61ef7a247c024a33e24e1f24d21b56" dependencies = [ "proc-macro2", "quote", diff --git a/Cargo.toml b/Cargo.toml index 8f95c10..76260e2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,6 +15,6 @@ log = "0.4.22" niffler = "2.6.0" pyo3 = { version="0.22.5", features = ["extension-module", "anyhow"] } rayon = "1.10.0" -serde = { version = "1.0.210", features = ["derive"] } +serde = { version = "1.0.213", features = ["derive"] } serde_json = "1.0.132" sourmash = "0.16.0" \ No newline at end of file