diff --git a/src/base/pileup.rs b/src/base/pileup.rs index 6c4716e..7090da4 100644 --- a/src/base/pileup.rs +++ b/src/base/pileup.rs @@ -2,6 +2,7 @@ use crate::base::*; use ndarray::prelude::*; +use std::collections::HashSet; use std::fs::{File, OpenOptions}; use std::io::{self, prelude::*, BufReader, BufWriter, Error, ErrorKind, SeekFrom}; use std::str; @@ -274,6 +275,18 @@ impl Filter for PileupLine { return Err(Error::new(ErrorKind::Other, "Filtered out.")); } } + + // Remove monoallelic loci (each loci must have coverage of at least 2 alleles) + if filter_stats.remove_monoallelic { + let mut covered_alleles = HashSet::new(); + + for pool in &self.read_codes{ + for read in pool{ + covered_alleles.insert(read); + } + } + + if covered_alleles.len() < 2{ // Filter by minimum allele frequency, //// First convert the counts per pool into frequencies let allele_frequencies = match self.to_frequencies() { diff --git a/src/base/structs_and_traits.rs b/src/base/structs_and_traits.rs index aa64216..d91e499 100644 --- a/src/base/structs_and_traits.rs +++ b/src/base/structs_and_traits.rs @@ -68,6 +68,7 @@ pub struct FileSyncPhen { #[derive(Debug, Clone)] pub struct FilterStats { pub remove_ns: bool, + pub remove_monoallelic: bool, pub max_base_error_rate: f64, pub min_coverage: u64, pub min_allele_frequency: f64, diff --git a/src/main.rs b/src/main.rs index 76461df..2bac377 100644 --- a/src/main.rs +++ b/src/main.rs @@ -52,6 +52,9 @@ struct Args { /// Keep ambiguous reads during SNP filtering, i.e. keep them coded as Ns #[clap(long, action)] keep_ns: bool, + /// Remove monoallelic loci (each loci must have coverage of at least 2 alleles) + #[clap(long, action)] + remove_monoallelic: bool, /// Input phenotype file: csv or tsv or any delimited file #[clap(short, long)] phen_fname: String, @@ -186,6 +189,7 @@ fn main() { let phen = file_phen.lparse().unwrap(); let filter_stats = base::FilterStats { remove_ns: !args.keep_ns, + remove_monoallelic: args.remove_monoallelic, max_base_error_rate: args.max_base_error_rate, min_coverage: args.min_coverage, min_allele_frequency: args.min_allele_frequency,