From 6fa36d27d398f2fa677aa13836c57ce1a147c4f3 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 27 Oct 2024 16:32:39 -0700 Subject: [PATCH] compute revcomp once, not every time --- src/lib.rs | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 4f639c1..1f968f3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -778,8 +778,8 @@ impl KmerCountTableIterator { } pub struct KmersAndHashesIter { - seq: String, // The sequence to iterate over - seqb: Vec, // Sequence bytes + seq: String, // The sequence to iterate over + seq_rc: String, // reverse complement sequence ksize: usize, // K-mer size pos: usize, // Current position in the sequence end: usize, // The end position for k-mer extraction @@ -791,8 +791,12 @@ impl KmersAndHashesIter { pub fn new(seq: String, ksize: usize, skip_bad_kmers: bool) -> Self { let seq = seq.to_ascii_uppercase(); // Ensure uppercase for uniformity let seqb = seq.as_bytes().to_vec(); // Convert to bytes for hashing - let end = seq.len() - ksize + 1; // Calculate the endpoint for k-mer extraction + let seqb_rc = revcomp(&seqb); + let seq_rc = std::str::from_utf8(&seqb_rc) + .expect("invalid utf-8 sequence for rev comp") + .to_string(); + let end = seq.len() - ksize + 1; // Calculate the endpoint for k-mer extraction let hasher = SeqToHashes::new( &seqb, ksize.into(), @@ -804,7 +808,7 @@ impl KmersAndHashesIter { Self { seq, - seqb, + seq_rc, ksize, pos: 0, // Start at the beginning of the sequence end, @@ -825,15 +829,15 @@ impl Iterator for KmersAndHashesIter { let start = self.pos; let ksize = self.ksize; + let rpos = self.end - start - 1; // Extract the current k-mer and its reverse complement let substr = &self.seq[start..start + ksize]; + // let substr_rc = &self.seq_rc[start..start+ksize] // CTB: this calculates RC each time, instead of doing so // using a sliding window. It's easy and works, so I'm // starting here :). - let substr_b_rc = revcomp(&self.seqb[start..start + ksize]); - let substr_rc = - std::str::from_utf8(&substr_b_rc).expect("invalid utf-8 sequence for rev comp"); + let substr_rc = &self.seq_rc[rpos..rpos+ksize]; // Get the next hash value from the hasher let hashval = self.hasher.next().expect("should not run out of hashes"); @@ -851,7 +855,7 @@ impl Iterator for KmersAndHashesIter { } else { substr_rc }; - // If vaild hash, return (canonical_kmer,hashval) tuple + // If valid hash, return (canonical_kmer,hashval) tuple Some(Ok((canonical_kmer.to_string(), hashval))) } else { // If the hash is 0, handle based on `skip_bad_kmers`