From 6fa36d27d398f2fa677aa13836c57ce1a147c4f3 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 27 Oct 2024 16:32:39 -0700 Subject: [PATCH 1/3] compute revcomp once, not every time --- src/lib.rs | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 4f639c1..1f968f3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -778,8 +778,8 @@ impl KmerCountTableIterator { } pub struct KmersAndHashesIter { - seq: String, // The sequence to iterate over - seqb: Vec, // Sequence bytes + seq: String, // The sequence to iterate over + seq_rc: String, // reverse complement sequence ksize: usize, // K-mer size pos: usize, // Current position in the sequence end: usize, // The end position for k-mer extraction @@ -791,8 +791,12 @@ impl KmersAndHashesIter { pub fn new(seq: String, ksize: usize, skip_bad_kmers: bool) -> Self { let seq = seq.to_ascii_uppercase(); // Ensure uppercase for uniformity let seqb = seq.as_bytes().to_vec(); // Convert to bytes for hashing - let end = seq.len() - ksize + 1; // Calculate the endpoint for k-mer extraction + let seqb_rc = revcomp(&seqb); + let seq_rc = std::str::from_utf8(&seqb_rc) + .expect("invalid utf-8 sequence for rev comp") + .to_string(); + let end = seq.len() - ksize + 1; // Calculate the endpoint for k-mer extraction let hasher = SeqToHashes::new( &seqb, ksize.into(), @@ -804,7 +808,7 @@ impl KmersAndHashesIter { Self { seq, - seqb, + seq_rc, ksize, pos: 0, // Start at the beginning of the sequence end, @@ -825,15 +829,15 @@ impl Iterator for KmersAndHashesIter { let start = self.pos; let ksize = self.ksize; + let rpos = self.end - start - 1; // Extract the current k-mer and its reverse complement let substr = &self.seq[start..start + ksize]; + // let substr_rc = &self.seq_rc[start..start+ksize] // CTB: this calculates RC each time, instead of doing so // using a sliding window. It's easy and works, so I'm // starting here :). - let substr_b_rc = revcomp(&self.seqb[start..start + ksize]); - let substr_rc = - std::str::from_utf8(&substr_b_rc).expect("invalid utf-8 sequence for rev comp"); + let substr_rc = &self.seq_rc[rpos..rpos+ksize]; // Get the next hash value from the hasher let hashval = self.hasher.next().expect("should not run out of hashes"); @@ -851,7 +855,7 @@ impl Iterator for KmersAndHashesIter { } else { substr_rc }; - // If vaild hash, return (canonical_kmer,hashval) tuple + // If valid hash, return (canonical_kmer,hashval) tuple Some(Ok((canonical_kmer.to_string(), hashval))) } else { // If the hash is 0, handle based on `skip_bad_kmers` From 667610b4087a53ce0329c6eb83d0f06187ea53fb Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 27 Oct 2024 17:23:39 -0700 Subject: [PATCH 2/3] rm comment --- src/lib.rs | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 1f968f3..9d08d59 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -833,10 +833,6 @@ impl Iterator for KmersAndHashesIter { // Extract the current k-mer and its reverse complement let substr = &self.seq[start..start + ksize]; - // let substr_rc = &self.seq_rc[start..start+ksize] - // CTB: this calculates RC each time, instead of doing so - // using a sliding window. It's easy and works, so I'm - // starting here :). let substr_rc = &self.seq_rc[rpos..rpos+ksize]; // Get the next hash value from the hasher From 31a9a2a536084858e53f964150c57dc8430022f4 Mon Sep 17 00:00:00 2001 From: Adam Taranto Date: Mon, 28 Oct 2024 14:16:10 +1100 Subject: [PATCH 3/3] run cargo fmt --- src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 9d08d59..cbded28 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -778,7 +778,7 @@ impl KmerCountTableIterator { } pub struct KmersAndHashesIter { - seq: String, // The sequence to iterate over + seq: String, // The sequence to iterate over seq_rc: String, // reverse complement sequence ksize: usize, // K-mer size pos: usize, // Current position in the sequence @@ -833,7 +833,7 @@ impl Iterator for KmersAndHashesIter { // Extract the current k-mer and its reverse complement let substr = &self.seq[start..start + ksize]; - let substr_rc = &self.seq_rc[rpos..rpos+ksize]; + let substr_rc = &self.seq_rc[rpos..rpos + ksize]; // Get the next hash value from the hasher let hashval = self.hasher.next().expect("should not run out of hashes");