From 28aaf178d5d6b609cc91bb5ccdf5b4350237890c Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Tue, 3 Sep 2024 17:01:37 -0700 Subject: [PATCH] add consume_file --- Cargo.lock | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++ Cargo.toml | 1 + src/lib.rs | 41 ++++++++++++++++++++++++++++++++++++----- 3 files changed, 88 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index bb07583..c4bd19b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -95,6 +95,27 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" +[[package]] +name = "bzip2" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bdb116a6ef3f6c3698828873ad02c3014b3c85cadb88496095628e3ef1e347f8" +dependencies = [ + "bzip2-sys", + "libc", +] + +[[package]] +name = "bzip2-sys" +version = "0.1.11+1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "736a955f3fa7875102d57c82b8cac37ec45224a07fd32d58f9f7a186b6cd4cdc" +dependencies = [ + "cc", + "libc", + "pkg-config", +] + [[package]] name = "camino" version = "1.1.9" @@ -362,6 +383,17 @@ version = "0.4.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" +[[package]] +name = "lzma-sys" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fda04ab3764e6cde78b9974eec4f779acaba7c4e84b36eca3cf77c581b85d27" +dependencies = [ + "cc", + "libc", + "pkg-config", +] + [[package]] name = "matrixmultiply" version = "0.3.9" @@ -454,7 +486,10 @@ checksum = "db05a5ab397f64070d8c998fa0fbb84e484b81f95752af317dac183a82d9295d" dependencies = [ "buffer-redux", "bytecount", + "bzip2", + "flate2", "memchr", + "xz2", ] [[package]] @@ -559,6 +594,7 @@ name = "oxli" version = "0.2.2" dependencies = [ "anyhow", + "needletail", "pyo3", "sourmash", ] 
@@ -585,6 +621,12 @@ dependencies = [ "thiserror", ] +[[package]] +name = "pkg-config" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d231b230927b5e4ad203db57bbcbee2802f6bce620b1e4a9024a07d94e2907ec" + [[package]] name = "portable-atomic" version = "1.7.0" @@ -1213,6 +1255,15 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" +[[package]] +name = "xz2" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "388c44dc09d76f1536602ead6d325eb532f5c122f17782bd57fb47baeeb767e2" +dependencies = [ + "lzma-sys", +] + [[package]] name = "yansi" version = "1.0.1" diff --git a/Cargo.toml b/Cargo.toml index b0043a7..eeb0449 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,3 +12,4 @@ crate-type = ["cdylib"] pyo3 = { version="0.22.2", features = ["extension-module", "anyhow"] } sourmash = "0.15.1" anyhow = "1.0.86" +needletail = "0.5.1" \ No newline at end of file diff --git a/src/lib.rs b/src/lib.rs index 5cc9322..3978336 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -5,6 +5,9 @@ use pyo3::prelude::*; use anyhow::{anyhow, Result}; use std::collections::HashMap; +extern crate needletail; +use needletail::{parse_fastx_file, Sequence, FastxReader}; + // use sourmash::sketch::nodegraph::Nodegraph; use sourmash::_hash_murmur; use sourmash::encodings::HashFunctions; @@ -85,11 +88,9 @@ impl KmerCountTable { } } - // Consume this DNA strnig. Return number of k-mers consumed. 
-    #[pyo3(signature = (seq, allow_bad_kmers=true))]
-    pub fn consume(&mut self, seq: String, allow_bad_kmers: bool) -> PyResult {
+    fn consume_bytes(&mut self, seq: &[u8], allow_bad_kmers: bool) -> Result {
         let hashes = SeqToHashes::new(
-            seq.as_bytes(),
+            seq,
             self.ksize.into(),
             allow_bad_kmers,
             false,
@@ -108,7 +109,7 @@ impl KmerCountTable {
                 }
                 Err(_) => {
                     let msg = format!("bad k-mer encountered at position {}", n);
-                    return Err(PyValueError::new_err(msg));
+                    return Err(anyhow!(msg));
                 }
             }
 
@@ -117,6 +118,43 @@ impl KmerCountTable {
         }
         Ok(n)
     }
+
+    // Consume this DNA string. Return number of k-mers consumed.
+    //
+    // Any failure from consume_bytes is raised as ValueError with a fixed
+    // message; the underlying error text is not forwarded.
+    #[pyo3(signature = (seq, allow_bad_kmers=true))]
+    pub fn consume(&mut self, seq: String, allow_bad_kmers: bool) -> PyResult {
+        self.consume_bytes(seq.as_bytes(), allow_bad_kmers)
+            .map_err(|_| PyValueError::new_err("invalid character in sequence"))
+    }
+
+    // Consume every record that parse_fastx_file reads from `filename`.
+    // Return the total number of k-mers consumed across all records.
+    //
+    // Raises IOError if the file cannot be opened for parsing, and ValueError
+    // if a record is invalid or consume_bytes rejects a sequence.
+    #[pyo3(signature = (filename, allow_bad_kmers=true))]
+    pub fn consume_file(&mut self, filename: String, allow_bad_kmers: bool) -> PyResult {
+        // Bad paths and malformed records are user input: raise, don't panic.
+        let mut reader = parse_fastx_file(&filename).map_err(|e| {
+            pyo3::exceptions::PyIOError::new_err(format!("cannot open '{}': {}", filename, e))
+        })?;
+
+        let mut n = 0;
+        while let Some(record) = reader.next() {
+            let record = record.map_err(|e| {
+                PyValueError::new_err(format!("invalid record in '{}': {}", filename, e))
+            })?;
+            // normalize() hands back a Cow; borrow it instead of copying.
+            let normseq = record.normalize(false);
+
+            n += self
+                .consume_bytes(&normseq, allow_bad_kmers)
+                .map_err(|e| PyValueError::new_err(e.to_string()))?;
+        }
+        Ok(n)
+    }
 }
 
 #[pymodule]