Skip to content

Commit

Permalink
add consume_file
Browse files Browse the repository at this point in the history
  • Loading branch information
ctb committed Sep 4, 2024
1 parent 2cf8f0e commit 28aaf17
Show file tree
Hide file tree
Showing 3 changed files with 88 additions and 5 deletions.
51 changes: 51 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,4 @@ crate-type = ["cdylib"]
pyo3 = { version="0.22.2", features = ["extension-module", "anyhow"] }
sourmash = "0.15.1"
anyhow = "1.0.86"
needletail = "0.5.1"
41 changes: 36 additions & 5 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@ use pyo3::prelude::*;
use anyhow::{anyhow, Result};
use std::collections::HashMap;

extern crate needletail;
use needletail::{parse_fastx_file, Sequence, FastxReader};

// use sourmash::sketch::nodegraph::Nodegraph;
use sourmash::_hash_murmur;
use sourmash::encodings::HashFunctions;
Expand Down Expand Up @@ -85,11 +88,9 @@ impl KmerCountTable {
}
}

// Consume this DNA string. Return number of k-mers consumed.
#[pyo3(signature = (seq, allow_bad_kmers=true))]
pub fn consume(&mut self, seq: String, allow_bad_kmers: bool) -> PyResult<u64> {
fn consume_bytes(&mut self, seq: &[u8], allow_bad_kmers: bool) -> Result<u64> {
let hashes = SeqToHashes::new(
seq.as_bytes(),
seq,
self.ksize.into(),
allow_bad_kmers,
false,
Expand All @@ -108,7 +109,7 @@ impl KmerCountTable {
}
Err(_) => {
let msg = format!("bad k-mer encountered at position {}", n);
return Err(PyValueError::new_err(msg));
return Err(anyhow!(msg));
}
}

Expand All @@ -117,6 +118,36 @@ impl KmerCountTable {

Ok(n)
}

// Count the k-mers of the DNA string `seq` by handing its bytes to
// `consume_bytes`. On success the number of k-mers consumed is returned.
// Any error from `consume_bytes` is discarded and reported to Python as a
// ValueError carrying a fixed message.
#[pyo3(signature = (seq, allow_bad_kmers=true))]
pub fn consume(&mut self, seq: String, allow_bad_kmers: bool) -> PyResult<u64> {
    self.consume_bytes(seq.as_bytes(), allow_bad_kmers)
        .map_err(|_| PyValueError::new_err("invalid character in sequence"))
}

// Consume every sequence record in the file `filename`, which is opened with
// needletail's `parse_fastx_file`. Return the total number of k-mers consumed
// across all records.
//
// Arguments:
//   filename        - path of the sequence file to read.
//   allow_bad_kmers - forwarded unchanged to `consume_bytes` for each record.
//
// Errors (all raised in Python as ValueError instead of panicking):
//   - the file cannot be opened or recognized by `parse_fastx_file`;
//   - a record fails to parse;
//   - `consume_bytes` rejects a record's sequence (its message is passed on).
//
// NOTE: records processed before a failure have already been counted; this
// function does not roll the table back.
#[pyo3(signature = (filename, allow_bad_kmers=true))]
pub fn consume_file(&mut self, filename: String, allow_bad_kmers: bool) -> PyResult<u64> {
    let mut reader = parse_fastx_file(&filename).map_err(|err| {
        PyValueError::new_err(format!("could not open '{}': {}", filename, err))
    })?;

    let mut n: u64 = 0;

    // `next` here is the `FastxReader` trait method rather than
    // `Iterator::next`, so this stays a `while let` loop.
    while let Some(record) = reader.next() {
        let record = record.map_err(|err| {
            PyValueError::new_err(format!("invalid record in '{}': {}", filename, err))
        })?;

        // `normalize` returns a `Cow`; borrowing it avoids the extra copy
        // that `into_owned()` would make when no normalization was needed.
        let normseq = record.normalize(false);

        n += self
            .consume_bytes(&normseq, allow_bad_kmers)
            .map_err(|err| PyValueError::new_err(err.to_string()))?;
    }

    Ok(n)
}
}

#[pymodule]
Expand Down

0 comments on commit 28aaf17

Please sign in to comment.