diff --git a/Cargo.toml b/Cargo.toml index ed34de7..33eb6d3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "abc" -version = "0.3.1" +version = "0.3.3" authors = ["Emanuel Schmid-Siegert"] description = """ The Agnostic Bam Counter (abc) determines at a given positions the count of observed nucleotides. diff --git a/README.md b/README.md index ae82ffd..7bd94c8 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ The Agnostic Bam Counter (abc) determines at a given positions the count of obse Simply supply a bed file with positions and obtains a tsv file with counts for each ATCG and reference if provided ``` -abc 0.3.1 +abc 0.3.3 Emanuel Schmid-Siegert The Agnostic Bam Counter (abc) determines at a given positions the count of observed nucleotides. Simply supply a bed file with positions and obtains a tsv file with counts for each ATCG and reference if provided. @@ -42,26 +42,25 @@ OPTIONS: ## Limitations - the bed-file currently accepts only single nucleotide positions, no ranges. 
- - systematic testing missing so far + - systematic testing is not complete ## Output Example: ``` -## abc: 0.2.2 +## abc: 0.3.2 ## author: Emanuel Schmid-Siegert -## date: Sat, 11 Sep 2021 07:35:03 +0200 -## command: target/release/abc --bam test/test_mpileup.bam --positions test/test_mpileup.bed --outfile test/test_mpileup.out.tsv --reference test/test_ref.fa +## date: Thu, 03 Aug 2023 06:57:33 +0200 +## command: target/debug/abc --bam test/test_mpileup.bam --positions test/test_mpileup.bed --outfile test/test_mpileup.out.tsv --reference test/test_ref.fa # chromosome start end reference A T C G ambigious ins del depthVAF RAF -17 301 302 T 1 8 0 0 0 8 0 17 0.5294 0.4706 +21 10405200 10405201 C 0 2 55 0 0 0 0 57 0.03510.9649 +21 10402985 10402986 A 340 1 2 0 0 0 0 343 0.00580.9913 +17 1884 1885 C 0 0 20 0 0 0 0 20 0.0000 1.0000 +17 1870 1871 C 0 0 18 0 0 0 0 18 0.0000 1.0000 +17 1869 1870 C 0 0 18 0 0 0 0 18 0.0000 1.0000 +17 303 304 C 0 0 17 0 0 0 0 17 0.0000 1.0000 17 302 303 G 0 0 0 17 0 0 0 17 0.0000 1.0000 -17 827 828 T 0 2 11 0 0 0 0 13 0.8462 0.1538 -17 1868 1869 A 11 7 0 0 0 0 0 18 0.3889 0.6111 -17 2040 2041 G 13 0 0 10 0 0 0 23 0.5652 0.4348 -21 10402805 10402806 A 87 0 0 0 0 0 0 87 0.00001.0000 -21 10402975 10402976 C 2 2 333 0 0 0 0 337 0.01190.9881 - ``` The header contains the version and author of the program as well as the execution time and the used command. @@ -79,4 +78,9 @@ We have then in the table the following columns: - number of reads with an insertion - number of reads with a deletion at that position - the depth at that position - - information if mutated or not (true only possible with reference) + - VAF = variant allele frequency + - RAF = reference allele frequency + + Important: VAF and RAF are taking ** all ** modifications into account , as well insertion and deletions. + It can happen that a VAF reported is actually the insertion or deletion event. 
+ diff --git a/changelog.txt b/changelog.txt new file mode 100644 index 0000000..44228e0 --- /dev/null +++ b/changelog.txt @@ -0,0 +1,6 @@ +# v0.3.3 +- hotfix as del was not correctly counted for depth +# v0.3.2 +- separate functions into lib +- simplify analysis of RAF&VAF +- calculate VAF based on dominant mutation only \ No newline at end of file diff --git a/src/libs.rs b/src/libs.rs new file mode 100644 index 0000000..792ebf0 --- /dev/null +++ b/src/libs.rs @@ -0,0 +1,500 @@ +use std::collections::HashMap; +use std::path::Path; +use rust_htslib::{bam, bam::Read}; +use std::str::from_utf8; +use bio::io::fasta::IndexedReader; +use chrono::{DateTime, Local}; +use std::error; +use itertools::Itertools; +extern crate bambam; +use std::fs; +use crossbeam::channel::{unbounded}; +use log::debug; + + + + +// Change the alias to `Box`. +type Result = std::result::Result>; + + + + + + +#[derive(Debug,Default,Clone)] +/// structure which holds our +/// counts of nucleotides for +/// any given genomic position +pub struct Pileup { + /// chromosome + chromosome: String, + /// start is 0 based similar to BED + start: u64, + /// end is 0 based + end: u64, + /// the reference nuc if + /// available through provided + /// reference FASTA + reference: String , + /// number of observe adenines + nuc_a: u32, + /// number of observe thyosins + nuc_t: u32, + /// number of observe cytosins + nuc_c: u32, + /// number of observe guanins + nuc_g: u32, + /// number of observe ambigious + /// elements, e.g. 
N or X + ambigious: u32, + /// number of reads with insertion + ins: u32, + /// number of reads with deletion + del: u32, + /// depth at that given position + depth: u32, + /// mutated or not + mutated: bool, + /// sum of variant frequency (sum/depth) + vaf: f32, + //// reference allele frequency (ref/depth) + raf: f32, +} + +// takes the generates pileups in our specific +// format and prints them according to our wishes +// currently only tsv, might add more possibilities +// in the future +pub fn print_pileup( + result:&[Pileup], + out: &str, + version: &str, + author: &str , + command: &str +)-> Result { + let now: DateTime = Local::now(); + let mut writer = csv::WriterBuilder::new().delimiter(b'\t').from_path(&out)?; + writer.write_record(&["##","abc:",version,"","","","","","","","","","",""])?; + writer.write_record(&["##","author:",author,"","","","","","","","","","",""])?; + writer.write_record(&["##","date:",&now.to_rfc2822(),"","","","","","","","","","",""])?; + writer.write_record(&["##","command:", command,"","","","","","","","","","",""])?; + writer.write_record(&["# chromosome","start","end","reference","A","T","C","G","ambigious","ins","del","depth","VAF","RAF"])?; + for element in result.iter() { + writer.write_record(&[ + element.chromosome.clone(), + element.start.to_string(), + element.end.to_string(), + element.reference.clone(), + element.nuc_a.to_string(), + element.nuc_t.to_string(), + element.nuc_c.to_string(), + element.nuc_g.to_string(), + element.ambigious.to_string(), + element.ins.to_string(), + element.del.to_string(), + element.depth.to_string(), + format!("{:.4}",element.vaf), + format!("{:.4}",element.raf), + ])?; + }; + writer.flush()?; + Ok(0) +} + + +// this simply takes a BED file and organizes entries in a Hashmap +// where they are organized by chromosome which should speedup +// later the indexed access +pub fn parse_bed_file ( + input: &str +) -> HashMap>{ + assert!( + Path::new(input).exists(), + "ERROR: BED file {} 
does not exist!", + input, + ); + let mut bed_result : HashMap> = HashMap::new(); + let mut bed = bio::io::bed::Reader::from_file(input).unwrap(); + for entry in bed.records(){ + let record = entry.expect("ERROR: wrong BED record"); + if record.end()-record.start()>1 { + panic!("ERROR: entry {:?} contained not a position but a range!",&record); + } + bed_result + .entry(record.chrom().to_string()) + .or_insert_with(Vec::new) + .push(record.start()); + + }; + for (_, entries) in bed_result.iter_mut(){ + entries.sort_unstable(); + entries.dedup(); + } + bed_result +} + +// This reads now the BAM file and uses the positions +// to check for each position the observed number of nucleotides +// It sorts as well again by chromosome and position to be safe +pub fn analyze_bam_positions ( + input: &str , + positions: &HashMap>, + ref_fasta: &str, + threads: &usize +) -> Vec { + assert!( + Path::new(input).exists(), + "ERROR: BAM file {} does not exist!", + input, + ); + // for the threads we need to have to first clone everything + // otherwise the "scope" part cant share the common variables + let input_local = &input.to_owned(); + let positions_local = positions.clone(); + // less ideal but we have thanks to clap a reference + // but want to avoid double referencing, therefore the + // work-around here + let ref_local = <&str>::clone(&ref_fasta); + //let cpus = 2; + //let n_chrom =positions_local.len(); + //let chunk_len = (n_chrom as u32/threads) as usize + 1 ; + //eprintln!("INFO: Foundchromosomes: {} divided by threads {}", n_chrom, chunk_len); + //let reference_local = ref_fasta.to_owned(); + let thread_results : Vec ; + let (snd, rxv) = unbounded(); + rayon::ThreadPoolBuilder::new().num_threads(*threads).build_global().unwrap(); + eprintln!("Current thread-pool number: {:?}",rayon::current_num_threads()); + // now we launch for each position a new + // pileup call and send the result to the receiver + rayon::scope( |child| { + for chr in 
positions_local.keys().sorted() { + for pos in positions_local.get(chr).unwrap().iter().sorted(){ + // cloning the sender will still send everything into same receiver + let snd_local = snd.clone(); + // now we define how many threads should be used + child.spawn( move |_| { + //eprintln!("Current thread index: {:?}",rayon::current_thread_index()); + let mut bam_file = bam::IndexedReader::from_path(input_local).unwrap(); + let tmp_result = match ref_local { + "NONE" => { + fetch_position(&mut bam_file,chr,pos,None,false) + } + , + _ => { + let faidx = IndexedReader::from_file(&ref_local).unwrap(); + fetch_position(&mut bam_file,chr,pos,Some(faidx),false) + } + }; + snd_local.send(tmp_result).expect("ERROR: thread could not communicate result!"); + }); + } + } + + }); + // we need to close afterwards + // the original sender as it otherwise still waits + drop(snd); + thread_results = rxv.iter().collect(); + thread_results +} + +// this sub-function gets the iterator result from the positions +// and return the result of pileup analysis at that given position +// allows to potentially already parallelize over each position analyzed +fn fetch_position( + bam: & mut bam::IndexedReader, + chr: &str, + pos:&u64, + ref_file: Option>, + _legacy: bool, +) -> Pileup { + // NOTE: + // SAM is 1 based and not 0 based, need to correct for that in + let start = *pos ; + let end = *pos +1 ; + let mut with_ref = false; + // this obtains now the pileup at that + // given position + bam.fetch((chr,start,end)).expect("ERROR: could not fetch region"); + // currently we ignore completely clipping + let mut collection : Pileup = Pileup { vaf: 0_f32, raf: 1_f32, ..Default::default() }; + match ref_file { + Some(mut x) => { + with_ref = true; + x.fetch(chr,*pos,pos+1).expect("ERROR: could not fetch interval on reference "); + let mut ref_seq = Vec::new(); + x.read(&mut ref_seq).expect("ERROR: could not read sequence from reference"); + collection.reference = 
String::from(from_utf8(&ref_seq).unwrap()) + } + None => {collection.reference = String::from("NA"); collection.mutated = false } + } + collection.chromosome = chr.to_string(); + collection.start = start; + collection.end = end; + for pile in bam.pileup().map(|x| x.expect("ERROR: could not parse BAM file")){ + // now we only care about the position we inquire + //dbg!(&pile); + if pile.pos() as u64 == start { + collection.depth = pile.depth(); + for alignment in pile.alignments(){ + // some have none, no idea why. + // additionally the is Del from indel is not working as anticipated + // the is_del works instead though... + if alignment.is_del() { + collection.del += 1; + }else{ + match alignment.indel() { + bam::pileup::Indel::Ins(_len) => {collection.ins += 1; continue } , + bam::pileup::Indel::Del(_len) => {collection.del += 1; continue }, + bam::pileup::Indel::None => () + } + } + + // sometimes we get reads with 0 length, no idea why + if alignment.record().seq_len() == 0 { continue } + let qpos = match alignment.qpos(){ + Some(q) => q, + _ => continue, + }; + let nuc = &alignment.record().seq().as_bytes()[qpos..qpos+1]; + let nuc1 = from_utf8(nuc).unwrap(); + match nuc1.to_uppercase().as_str() { + "A" => collection.nuc_a += 1 , + "T" => collection.nuc_t += 1 , + "C" => collection.nuc_c += 1 , + "G" => collection.nuc_g += 1 , + _ => collection.ambigious += 1 , + } + } + break; + } + }; + if with_ref { + // if legacy { + // eval_mutation_legacy(collection) + // }else{ + eval_mutation(collection) + //} + }else{ + collection + } +} + + +/// this function will take a pileup information in our native +/// format and evaluate the necessary seeked information. 
+/// It then returns the pileup information with additional +/// filled fields +/// Steps: +/// - build hash with observations +/// - identify reference nucleotide => RAF +/// - remove element which is ref +/// - identify element with most counts +/// - calculate VAF +fn eval_mutation( + mut obs:Pileup +) -> Pileup{ + debug!("{:?}",obs); + let mut dominant_list : HashMap<&str,u32> = + [ + ("A", obs.nuc_a), + ("T", obs.nuc_t), + ("C", obs.nuc_c), + ("G", obs.nuc_g), + ("N", obs.ambigious), + ("I", obs.ins), + ("D", obs.del) + ] + .iter() + .cloned() + .collect(); + // check that the depth equals sum of observations + if obs.nuc_a + obs.nuc_t + obs.nuc_c + obs.nuc_g + obs.ambigious + obs.ins + obs.del != obs.depth { + debug!("A {} T {} C{} G {} N {} INS{} DEL {} Depth {}",obs.nuc_a,obs.nuc_t,obs.nuc_c,obs.nuc_g,obs.ambigious,obs.ins,obs.del,obs.depth ); + panic!("ERROR: the sum of observations does not equal depth !"); + } + match obs.reference.to_uppercase().as_str(){ + "A" => { + if obs.nuc_a != obs.depth { + // get ref nuc + let original = *dominant_list.get("A").expect("ERROR: A did not exist!!") as f32; + // remove ref nuc + dominant_list.remove("A"); + let depth = (obs.depth) as f32; + // calc raf + obs.raf = original/depth; + // next we want to get most dominant alternative mutated nucleotid + let max_nuc = dominant_list + .iter() + .max_by_key(|entry| entry.1).expect("ERROR: no element in nuc hash!"); + // calc vaf of dominant second (if tied, still correct) + obs.vaf = *max_nuc.1 as f32/depth; + // define as being non-reference observation + obs.mutated = true; + } + }, + "T" => { + if obs.nuc_t != obs.depth { + let original = *dominant_list.get("T").expect("ERROR: A did not exist!!") as f32; + dominant_list.remove("T"); + let depth = (obs.depth) as f32; + obs.raf = original/depth; + // next we want to get most dominant alternative mutated nucleotide + let max_nuc = dominant_list + .iter() + .max_by_key(|entry| entry.1).expect("ERROR: no element in nuc 
hash!"); + obs.vaf = *max_nuc.1 as f32/depth; + obs.mutated = true; + } + }, + "C" => { + if obs.nuc_c != obs.depth { + let original = *dominant_list.get("C").expect("ERROR: A did not exist!!") as f32; + dominant_list.remove("C"); + let depth = (obs.depth) as f32; + obs.raf = original/depth; + // next we want to get most dominant alternative mutated nucleotide + let max_nuc = dominant_list + .iter() + .max_by_key(|entry| entry.1).expect("ERROR: no element in nuc hash!"); + obs.vaf = *max_nuc.1 as f32/depth; + obs.mutated = true; + } + }, + "G" => { + if obs.nuc_g != obs.depth { + let original = *dominant_list.get("G").expect("ERROR: A did not exist!!") as f32; + dominant_list.remove("G"); + let depth = (obs.depth) as f32; + obs.raf = original/depth; + // next we want to get most dominant alternative mutated nucleotide + let max_nuc = dominant_list + .iter() + .max_by_key(|entry| entry.1).expect("ERROR: no element in nuc hash!"); + obs.vaf = *max_nuc.1 as f32/depth; + obs.mutated = true; + }else{ + obs.mutated = false; + obs.vaf = 0.0_f32; + obs.raf = 1.0_f32; + } + }, + "N" => (), + _ => {panic!("ERROR: non compatible base found in reference sequence: {}!",obs.reference.as_str())}, + } + + obs +} + +#[cfg(test)] +mod tests { + // Note this useful idiom: importing names from outer (for mod tests) scope. 
+ use super::*; + + #[test] + /// test for a 100% reference observations + fn test_eva_mut_1() { + let mut collection : Pileup = Pileup { vaf: 0_f32, raf: 1_f32, ..Default::default() }; + collection.reference = String::from("A"); + collection.nuc_a = 10; + collection.depth = 10; + let result = eval_mutation(collection); + assert_eq!(result.mutated,false); + assert_eq!(result.raf,1.0_f32); + assert_eq!(result.vaf,0.0_f32); + + } + + #[test] + /// test for a 100% mutation observations + fn test_eva_mut_2() { + let mut collection : Pileup = Pileup { vaf: 0_f32, raf: 1_f32, ..Default::default() }; + collection.reference = String::from("A"); + collection.nuc_t = 10; + collection.depth = 10; + let result = eval_mutation(collection); + assert_eq!(result.mutated,true); + assert_eq!(result.raf,0.0_f32); + assert_eq!(result.vaf,1.0_f32); + + } + + + #[test] + /// test for a 50% mutation observations + fn test_eva_mut_3() { + let mut collection : Pileup = Pileup { vaf: 0_f32, raf: 1_f32, ..Default::default() }; + collection.reference = String::from("A"); + collection.nuc_t = 10; + collection.nuc_a = 10; + collection.depth = 20; + let result = eval_mutation(collection); + assert_eq!(result.mutated,true); + assert_eq!(result.raf,0.5_f32); + assert_eq!(result.vaf,0.5_f32); + + } + + + #[test] + /// test for a 25% multi allelic mutation observations, 2 with equal proportion + fn test_eva_mut_4() { + let mut collection : Pileup = Pileup { vaf: 0_f32, raf: 1_f32, ..Default::default() }; + collection.reference = String::from("A"); + collection.nuc_t = 5; + collection.nuc_c = 5; + collection.nuc_a = 10; + collection.depth = 20; + let result = eval_mutation(collection); + assert_eq!(result.mutated,true); + assert_eq!(result.raf,0.5_f32); + assert_eq!(result.vaf,0.25_f32); + + } + + + #[test] + /// test for a 25% multi allelic mutation observations, 3 with unequal proportion + fn test_eva_mut_5() { + let mut collection : Pileup = Pileup { vaf: 0_f32, raf: 1_f32, ..Default::default() }; 
+ collection.reference = String::from("A"); + collection.nuc_t = 2; + collection.nuc_g = 3; + collection.nuc_c = 5; + collection.nuc_a = 10; + collection.depth = 20; + let result = eval_mutation(collection); + assert_eq!(result.mutated,true); + assert_eq!(result.raf,0.5_f32); + assert_eq!(result.vaf,0.25_f32); + + } + + #[test] + #[should_panic(expected = "ERROR: the sum of observations does not equal depth !")] + /// test that fails if depth is unequal to depth + fn test_eva_mut_6() { + let mut collection : Pileup = Pileup { vaf: 0_f32, raf: 1_f32, ..Default::default() }; + collection.reference = String::from("A"); + collection.nuc_a = 10; + collection.depth = 20; + eval_mutation(collection); + } + + #[test] + /// test if indels are present + fn test_eva_mut_7() { + let mut collection : Pileup = Pileup { vaf: 0_f32, raf: 1_f32, ..Default::default() }; + collection.reference = String::from("A"); + collection.nuc_a = 10; + collection.depth = 20; + collection.del = 10; + let result = eval_mutation(collection); + assert_eq!(result.mutated,true); + assert_eq!(result.raf,0.5_f32); + assert_eq!(result.vaf,0.5_f32); + } + + +} \ No newline at end of file diff --git a/src/main.rs b/src/main.rs index 9ed24f7..42d1c9a 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,13 +1,6 @@ use std::collections::HashMap; -use std::path::Path; -use rust_htslib::{bam, bam::Read}; -use std::str::from_utf8; -use bio::io::fasta::IndexedReader; -use chrono::{DateTime, Local}; -use std::error; use std::process; use std::env; -use itertools::Itertools; extern crate bambam; use std::fs; use crossbeam::channel::{unbounded}; @@ -339,6 +332,8 @@ fn eval_mutation( } fn main() { + env_logger::init(); + // now the next is not really for any argument // parsing but simply to get the command which // was used to execute as I cant get this from clap diff --git a/test/test_mpileup.bed b/test/test_mpileup.bed index 7438564..717d420 100644 --- a/test/test_mpileup.bed +++ b/test/test_mpileup.bed @@ -5,3 +5,7 
@@ 17 1869 1869 17 1870 1870 17 1884 1884 +21 10402953 10402953 +17 3010 3010 +17 3009 3009 +21 10402952 10402952 diff --git a/test/test_mpileup_1.bed b/test/test_mpileup_1.bed new file mode 100644 index 0000000..82b5ceb --- /dev/null +++ b/test/test_mpileup_1.bed @@ -0,0 +1 @@ +21 10402985 10402985