Skip to content

Commit

Permalink
Added time measurements
Browse files Browse the repository at this point in the history
  • Loading branch information
aljpetri committed Jan 16, 2024
1 parent 40be211 commit 634b3f4
Show file tree
Hide file tree
Showing 4 changed files with 138 additions and 7 deletions.
4 changes: 2 additions & 2 deletions src/clustering.rs
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ pub(crate) fn cluster_de_novo(sorted_entries: Vec<(i32,Vec<Minimizer_hashed>)>,m
//cluster_map contains a hashmap in which we have a hash_value for a minimizer as key and a vector of read ids as a value
let mut cluster_map: HashMap<u64, Vec<i32>> = HashMap::new();
//we only need cl_id if cluster 0 already exists so we start with '1'
let mut cl_id=1;
let mut cl_id= 1;
let shared_perc_mini=min_shared_minis/2.0_f64;
//TODO: This can be heavily improved if I add a field, high_confidence, to the seed object (here minimizer) We then can only pass over the minis with high_confidence=false
//entry represents a read in our data
Expand All @@ -89,7 +89,7 @@ pub(crate) fn cluster_de_novo(sorted_entries: Vec<(i32,Vec<Minimizer_hashed>)>,m
let id = entry.0;
let sign_minis= &entry.1;
//println!("id: {}",id);
let minimizers=minimizer_hashmap.get(&id).unwrap();
let minimizers= minimizer_hashmap.get(&id).unwrap();
//if we already have at least one cluster: compare the new read to the cluster(s)
if clusters.len() > 0{
let mut mini_hashs_vec=vec![];
Expand Down
8 changes: 4 additions & 4 deletions src/file_actions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ pub(crate) fn parse_fastq(file: File) -> Result<Vec<structs::FastqRecord_isoncl_
let mut id_int=0;
loop {
let mut header = String::new();
let header_read = reader.read_line(&mut header)?;
let header_read = reader.read_line(&mut header).expect("Should be contained");
if header_read == 0 {
break;
}
Expand All @@ -156,21 +156,21 @@ pub(crate) fn parse_fastq(file: File) -> Result<Vec<structs::FastqRecord_isoncl_
header = (&header[1..]).to_string();
header = shorten_header(&header).parse().unwrap();
let mut sequence = String::new();
let sequence_read = reader.read_line(&mut sequence)?;
let sequence_read = reader.read_line(&mut sequence).expect("Should be contained");
if sequence_read == 0 {
break;
}
sequence = sequence.trim().to_owned();

let mut quality_header = String::new();
let quality_header_read = reader.read_line(&mut quality_header)?;
let quality_header_read = reader.read_line(&mut quality_header).expect("Should be contained");
if quality_header_read == 0 {
break;
}
quality_header = quality_header.trim().to_owned();

let mut quality = String::new();
let quality_read = reader.read_line(&mut quality)?;
let quality_read = reader.read_line(&mut quality).expect("Should be contained");
if quality_read == 0 {
break;
}
Expand Down
111 changes: 110 additions & 1 deletion src/generate_sorted_fastq_new_version.rs
Original file line number Diff line number Diff line change
Expand Up @@ -251,7 +251,6 @@ pub fn get_canonical_kmer_minimizers_hashed(seq: &str, k_size: usize, w_size: us
else {
window_kmers.push_back((reverse_hash,new_kmer_pos))
}

// Find the new minimizer, we need a ds that was cloned from window_kmers to abide ownership rules in rust
binding = window_kmers.clone();
let (curr_min, min_pos) = binding.iter().min_by_key(|&(kmer, _)| kmer).unwrap().clone();
Expand All @@ -267,6 +266,116 @@ pub fn get_canonical_kmer_minimizers_hashed(seq: &str, k_size: usize, w_size: us
}
minimizers
}
/*
static seq_nt4_table: &'static [u8] = &[
0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
]; //seq_nt4_table
fn make_string_to_hashvalues_random_minimizers(seq: &str, string_hashes: &mut Vec<u64>, pos_to_seq_choord: &mut Vec<u32>, k: usize, kmask: u64, w: usize) {
let mut q: VecDeque<u64> = VecDeque::new();
let mut q_pos: VecDeque<u32> = VecDeque::new();
let mut seq_iter = seq.chars();
let mut seq_length = seq.len();
let mut q_size = 0;
let mut q_min_val = u64::MAX;
let mut q_min_pos = -1;
let mut hash_count = 0;
let mut l = 0;
let mut x = 0;
while let Some(ch) = seq_iter.next() {
let c = seq_nt4_table[ch as usize];
if c < 4 {
x = (x << 2 | c as u64) & kmask;
if l >= k {
let hash_k = robin_hash(x);
if q_size < w {
q.push_back(hash_k);
q_pos.push_back((l - k as u32 + 1) as u32);
q_size += 1;
} else if q_size == w - 1 {
q.push_back(hash_k);
q_pos.push_back((l - k as u32 + 1) as u32);
q_min_val = u64::MAX;
q_min_pos = -1;
for (j, &val) in q.iter().enumerate() {
if val < q_min_val {
q_min_val = val;
q_min_pos = q_pos[j] as i32;
}
}
string_hashes.push(q_min_val);
pos_to_seq_choord.push(q_min_pos as u32);
hash_count += 1;
} else {
let mut new_minimizer = false;
update_window(&mut q, &mut q_pos, &mut q_min_val, &mut q_min_pos, hash_k, (l - k as u32 + 1) as u32, &mut new_minimizer);
if new_minimizer {
string_hashes.push(q_min_val);
pos_to_seq_choord.push(q_min_pos as u32);
hash_count += 1;
}
}
}
} else {
l = 0;
x = 0;
}
seq_length -= 1;
}
// println!("{} values produced from string of length {}", hash_count, seq.len());
// for t in pos_to_seq_choord {
// print!("{} ", t);
// }
// println!();
}
*/

fn update_window(q: &mut VecDeque<u64>, q_pos: &mut VecDeque<u32>, q_min_val: &mut u64, q_min_pos: &mut i32, new_strobe_hashval: u64, i: u32, new_minimizer: &mut bool) {
// uint64_t popped_val;
// popped_val = q.front();
q.pop_front();
let popped_index = q_pos.pop_front().unwrap();

q.push_back(new_strobe_hashval);
q_pos.push_back(i);

if *q_min_pos == popped_index as i32 {
// we popped the previous minimizer, find new brute force
*q_min_val = u64::MAX;
*q_min_pos = i as i32;
for j in (0..q.len()).rev() {
// Iterate in reverse to choose the rightmost minimizer in a window
if q[j] < *q_min_val {
*q_min_val = q[j];
*q_min_pos = q_pos[j] as i32;
*new_minimizer = true;
}
}
} else if new_strobe_hashval < *q_min_val {
// the new value added to the queue is the new minimum
*q_min_val = new_strobe_hashval;
*q_min_pos = i as i32;
*new_minimizer = true;
}
}

//calculates the average of a list of f64s and returns it as f64
fn average(numbers: &[f64]) -> f64 {
Expand Down
22 changes: 22 additions & 0 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,12 @@ use std::fs::File;
use std::collections::{HashMap, HashSet, VecDeque};
use rayon::prelude::*;
use std::time::Instant;
use std::time::Duration;
//use crate::generate_sorted_fastq_new_version::{filter_minimizers_by_quality, Minimizer,get_kmer_minimizers};
//use clap::{arg, command, Command};
use clap::Parser;


pub mod file_actions;
mod clustering;
//mod generate_sorted_fastq_for_cluster;
Expand Down Expand Up @@ -285,6 +288,8 @@ struct Cli {
n: usize,
#[arg(long, short,help="Path to gtf file (optional parameter)")]
gtf: Option<String>,
#[arg(long, short,help="Path to gtf file (optional parameter)")]
noncanonical: Option<bool>,
//TODO:add argument telling us whether we want to use syncmers instead of kmers, maybe also add argument determining whether we want to use canonical_minimizers

}
Expand All @@ -298,9 +303,17 @@ fn main() {
//
// READ the files (the initial_clusters_file as well as the fastq file containing the reads)
//
let now1 = Instant::now();
let fastq_file = File::open(cli.fastq).unwrap();
println!("{} s used for file input", now1.elapsed().as_secs());
//let initial_clustering_path = &cli.init_cl.unwrap_or_else(||{"".to_string()});
let initial_clustering_path =cli.init_cl.as_deref();
//let noncanonical= cli.noncanonical.as_deref();
//let mut noncanonical=false;
//if noncanonical.is_some(){
// noncanonical=true;
//}

let gtf_path = cli.gtf.as_deref();
//let mut initial_clustering_records: Vec<_>=vec![];
let mut init_clust_rec_both_dir=vec![];
Expand Down Expand Up @@ -372,6 +385,8 @@ fn main() {
} else {
println!("Couldn't get the current memory usage :(");
}
let now2 = Instant::now();
println!("{} s before minimizergen", now1.elapsed().as_secs());
for fastq_record in &fastq_records{
//println!("int id {}",int_id_cter);
id_map.insert(int_id_cter,(*fastq_record.header.clone()).to_string());
Expand All @@ -391,6 +406,7 @@ fn main() {
//println!("Read too short- skipped {}",fastq_record.header)
}
}
println!("{} s for minimizer gen and filtering of minis", now2.elapsed().as_secs());
println!("Skipped {} reads due to being too short",skipped_cter);
if let Some(usage) = memory_stats() {
println!("Current physical memory usage: {}", usage.physical_mem);
Expand All @@ -404,6 +420,7 @@ fn main() {
//Perform the clustering
//
let mut clusters:HashMap<i32,Vec<i32>> = HashMap::new();
let now3 = Instant::now();
//annotation based clustering
if init_clust_rec_both_dir.len() > 0{
let init_cluster_map= clustering::get_initial_clustering(init_clust_rec_both_dir,k,window_size);
Expand All @@ -421,12 +438,17 @@ fn main() {
//println!("{:?}",clusters);
//TODO: would it make sense to add a post_clustering? i.e. find the overlap between all clusters and merge if > min_shared_minis
}
println!("{} s for denovo clustering", now3.elapsed().as_secs());

if let Some(usage) = memory_stats() {
println!("Current physical memory usage: {}", usage.physical_mem);
println!("Current virtual memory usage: {}", usage.virtual_mem);
} else {
println!("Couldn't get the current memory usage :(");
}
println!("Finished clustering");
let now4 = Instant::now();
write_output::write_output(outfolder, clusters,fastq_records, id_map);
println!("{} s for file output", now4.elapsed().as_secs());
println!("{} overall runtime", now1.elapsed().as_secs());
}

0 comments on commit 634b3f4

Please sign in to comment.