From 32d04b338dcef014374405e8e96a30427a1551f3 Mon Sep 17 00:00:00 2001 From: ebioman Date: Mon, 23 Sep 2024 20:16:55 +0200 Subject: [PATCH] minor changes --- .devcontainer/devcontainer.json | 31 +++++++ Cargo.toml | 17 ++-- src/main.rs | 143 +++++++++++++++++++++----------- 3 files changed, 135 insertions(+), 56 deletions(-) create mode 100644 .devcontainer/devcontainer.json diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 0000000..95e8ba7 --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,31 @@ +// For format details, see https://aka.ms/devcontainer.json. For config options, see the +// README at: https://github.com/devcontainers/templates/tree/main/src/rust +{ + "name": "Rust", + // Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile + "image": "mcr.microsoft.com/devcontainers/rust:0-1-bullseye", + + // Use 'mounts' to make the cargo cache persistent in a Docker Volume. + // "mounts": [ + // { + // "source": "devcontainer-cargo-cache-${devcontainerId}", + // "target": "/usr/local/cargo", + // "type": "volume" + // } + // ] + + // Features to add to the dev container. More info: https://containers.dev/features. + // "features": {}, + + // Use 'forwardPorts' to make a list of ports inside the container available locally. + // "forwardPorts": [], + + // Use 'postCreateCommand' to run commands after the container is created. + "postCreateCommand": "sudo apt-get update && sudo apt-get install -y cmake" + + // Configure tool-specific properties. + // "customizations": {}, + + // Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root. + // "remoteUser": "root" +} diff --git a/Cargo.toml b/Cargo.toml index e1429de..ed34de7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,7 +2,6 @@ name = "abc" version = "0.3.1" authors = ["Emanuel Schmid-Siegert"] -edition = "2018" description = """ The Agnostic Bam Counter (abc) determines at a given positions the count of observed nucleotides. Simply supply a bed file with positions and obtains a tsv file with counts for each ATCG and reference if provided. @@ -18,14 +17,14 @@ Note: # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -rust-htslib = { version = "*", default-features = false } +rust-htslib = "0.40.2" #regex = "1" -clap = "*" -csv = "*" -chrono = "*" -bio = "*" +clap = "4.1.6" +csv = "1.2.0" +chrono = "0.4.23" +bio = "1.1.0" itertools = "*" bambam = {git = "https://github.com/ebioman/bambam"} -crossbeam = "*" -crossbeam-channel = "*" -rayon = "*" +crossbeam = "0.8.2" +crossbeam-channel = "0.5.6" +rayon = "1.6.1" diff --git a/src/main.rs b/src/main.rs index 15bbbaf..9ed24f7 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,4 +1,3 @@ -use clap::{app_from_crate,crate_name,crate_description,crate_authors,crate_version,Arg}; use std::collections::HashMap; use std::path::Path; use rust_htslib::{bam, bam::Read}; @@ -12,11 +11,15 @@ use itertools::Itertools; extern crate bambam; use std::fs; use crossbeam::channel::{unbounded}; -use rayon; - - - +use clap::{arg, Command,Arg}; +extern crate rust_htslib; +extern crate clap; +extern crate rayon; +extern crate itertools; +extern crate chrono; +extern crate crossbeam; +extern crate bio; // Change the alias to `Box`. type Result = std::result::Result>; @@ -40,41 +43,54 @@ struct Pileup { /// reference FASTA reference: String , /// number of observe adenines - nuc_a: i32, + nuc_a: u32, /// number of observe thyosins - nuc_t: i32, + nuc_t: u32, /// number of observe cytosins - nuc_c: i32, + nuc_c: u32, /// number of observe guanins - nuc_g: i32, + nuc_g: u32, /// number of observe ambigious /// elements, e.g. N or X - ambigious: i32, + ambigious: u32, /// number of reads with insertion - ins: i32, + ins: u32, /// number of reads with deletion - del: i32, + del: u32, /// depth at that given position depth: u32, /// mutated or not mutated: bool, - /// sum of variant frequency (sum/depth) + /// variant frequency (sum/depth) vaf: f32, //// reference allele frequency (ref/depth) raf: f32, } +#[derive(Debug,Default,Clone)] +// the information concerning the +// program which are organized and printed in the header +pub struct MetaInfo { + version: String, + author: String, + command: String, +} + // takes the generates pileups in our specific // format and prints them according to our wishes // currently only tsv, might add more possibilities // in the future -fn print_pileup(result:&[Pileup],out: &str, version: &str, author: &str , command: &str )-> Result { +fn print_pileup( + result:&[Pileup], + out: &str, + infos: MetaInfo, +)-> Result { let now: DateTime = Local::now(); let mut writer = csv::WriterBuilder::new().delimiter(b'\t').from_path(&out)?; - writer.write_record(&["##","abc:",version,"","","","","","","","","","",""])?; - writer.write_record(&["##","author:",author,"","","","","","","","","","",""])?; + writer.write_record(&["##","abc:",infos.version,"","","","","","","","","","",""])?; + writer.write_record(&["##","author:",infos.author,"","","","","","","","","","",""])?; writer.write_record(&["##","date:",&now.to_rfc2822(),"","","","","","","","","","",""])?; - writer.write_record(&["##","command:", command,"","","","","","","","","","",""])?; + writer.write_record(&["##","command:", infos.command,"","","","","","","","","","",""])?; writer.write_record(&["# chromosome","start","end","reference","A","T","C","G","ambigious","ins","del","depth","VAF","RAF"])?; for element in result.iter() { writer.write_record(&[ @@ -102,7 +118,9 @@ fn print_pileup(result:&[Pileup],out: &str, version: &str, author: &str , comman // this simply takes a BED file and organizes entries in a Hashmap // where they are organized by chromosome which should speedup // later the indexed access -fn parse_bed_file (input: &str) -> HashMap>{ +fn parse_bed_file ( + input: &str +) -> HashMap>{ assert!( Path::new(input).exists(), "ERROR: BED file {} does not exist!", @@ -113,7 +131,14 @@ fn parse_bed_file (input: &str) -> HashMap>{ for entry in bed.records(){ let record = entry.expect("ERROR: wrong BED record"); if record.end()-record.start()>1 { - panic!("ERROR: entry {:?} contained not a position but a range!",&record); + eprintln!("WARNING: entry {:?} contained not a position but a range!",&record); + for i in record.end()..record.start { + bed_result + .entry(record.chrom().to_string()) + .or_insert_with(Vec::new) + .push(i); + + } } bed_result .entry(record.chrom().to_string()) @@ -131,7 +156,12 @@ fn parse_bed_file (input: &str) -> HashMap>{ // This reads now the BAM file and uses the positions // to check for each position the observed number of nucleotides // It sorts as well again by chromosome and position to be safe -fn analyze_bam_positions (input: &str , positions: &HashMap>, ref_fasta: &str, threads: &usize) -> Vec{ +fn analyze_bam_positions ( + input: &str , + positions: &HashMap>, + ref_fasta: &str, + threads: &usize +) -> Vec{ assert!( Path::new(input).exists(), "ERROR: BAM file {} does not exist!", @@ -191,7 +221,12 @@ fn analyze_bam_positions (input: &str , positions: &HashMap>, re // this sub-function gets the iterator result from the positions // and return the result of pileup analysis at that given position // allows to potentially already parallelize over each position analyzed -fn fetch_position(bam: & mut bam::IndexedReader, chr: &str, pos:&u64, ref_file: Option>) -> Pileup { +fn fetch_position( + bam: & mut bam::IndexedReader, + chr: &str, + pos:&u64, + ref_file: Option> +) -> Pileup { // NOTE: // SAM is 1 based and not 0 based, need to correct for that in let start = *pos ; @@ -253,7 +288,9 @@ fn fetch_position(bam: & mut bam::IndexedReader, chr: &str, pos:&u64, ref_file: } } -fn eval_mutation(mut obs:Pileup) -> Pileup{ +fn eval_mutation( + mut obs:Pileup +) -> Pileup{ match obs.reference.to_uppercase().as_str(){ "A" => { if ( obs.nuc_c !=0 ) || (obs.nuc_g != 0) || (obs.nuc_t != 0) || (obs.del !=0) || (obs.ins !=0) { @@ -307,51 +344,51 @@ fn main() { // was used to execute as I cant get this from clap let args: Vec = env::args().collect(); let args_string = args.join(" "); - let matches = app_from_crate!() - .arg(Arg::with_name("BAM") - .short("b") + let matches = Command::new("abc") + .arg(Arg::new("BAM") + .short('b') .long("bam") .value_name("BAM") .help("aligned short or long reads") - .takes_value(true) + //.takes_values(true) .required(true)) - .arg(Arg::with_name("POSITIONS") - .short("p") + .arg(Arg::new("POSITIONS") + .short('p') .long("positions") .value_name("BED") .help("A bed file with positions") - .takes_value(true) + //.takes_value(true) .required(true)) - .arg(Arg::with_name("OUT") - .short("o") + .arg(Arg::new("OUT") + .short('o') .long("outfile") .value_name("TSV") .help("the output file which will be in tsv format") - .takes_value(true) + //.takes_value(true) .required(true)) - .arg(Arg::with_name("REF") - .short("r") + .arg(Arg::new("REF") + .short('r') .long("reference") .value_name("FASTA") .help("if reference in fasta format provided, reference nucleotide \ \nis provided for each positions, otherwise NA") - .takes_value(true) + //.takes_value(true) .required(false)) - .arg(Arg::with_name("THREADS") - .short("t") + .arg(Arg::new("THREADS") + .short('t') .long("threads") .value_name("INT") .help("number of threads used in thread-pool for querying positions ") .default_value("1") - .takes_value(true) + //.takes_value(true) .required(false)) - .arg(Arg::with_name("BAMBAM") - .short("x") + .arg(Arg::new("BAMBAM") + .short('x') .long("bambam") .value_name("IDH") - .takes_value(false) + //.takes_value(false) .required(false) - .hidden(true)) + .hide(true)) .get_matches(); // prepare input or quit @@ -361,18 +398,30 @@ fn main() { let ref_file = matches.value_of("REF").unwrap_or("NONE"); let bam_threads = matches.value_of("THREADS").unwrap().parse::().unwrap(); - if matches.is_present("BAMBAM") { + if matches.args_present("BAMBAM") { bambam::bam_bam_inda_house(); } - let positions : HashMap> = parse_bed_file(bed_file); - let analysis_result = analyze_bam_positions(bam_file,&positions, ref_file, &bam_threads); + + let positions : HashMap> = parse_bed_file( + bed_file + ); + + let analysis_result = analyze_bam_positions( + bam_file,&positions, + ref_file, + &bam_threads + ); + let infos = MetaInfo{ + version: env!("CARGO_PKG_VERSION").to_string(), + author: env!("CARGO_PKG_AUTHORS").to_string(), + command: args_string, + }; // here we box the error, so in case the writing does // not work we get a return of the error from the process back if let Err(err) = print_pileup( &analysis_result, out_file, - crate_version!(), - crate_authors!(),&args_string + infos, ){ eprintln!("{}", err); process::exit(1);