feat: argument bounds checking and option filtering #25

Merged 3 commits on Jul 14, 2024
57 changes: 57 additions & 0 deletions src/base/filter_stats.rs
Owner

Nice, thanks. Just keep in mind we'll need to redefine these ErrorKinds into a custom Error type that we can chain in the future.
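For illustration, a minimal sketch of what such a custom error type could look like, with a `source` so errors can be chained later (the names here are hypothetical, not code from this PR):

```rust
use std::error::Error;
use std::fmt;

// Hypothetical custom error wrapping an optional lower-level cause.
#[derive(Debug)]
pub struct FilterError {
    pub message: String,
    pub source: Option<Box<dyn Error + 'static>>,
}

impl fmt::Display for FilterError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{}", self.message)
    }
}

impl Error for FilterError {
    // Exposing the cause is what lets callers walk the chain back to the root error.
    fn source(&self) -> Option<&(dyn Error + 'static)> {
        self.source.as_deref()
    }
}
```

Each such struct can carry its own message while still pointing back at the error that triggered it.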

Collaborator Author

Right, I am going to do this right after. What do you mean by chaining errors? All I had in mind was a handful (2-4) of error structs that encompass most of the current "errors", each with its own Display implementation.

Owner

If one error causes another in a cascade, we need to chain them to identify the root cause. I have not looked too hard into implementing this manually, but I believe the anyhow crate supports it. You may use that crate for all the error-handling work, or implement things yourself from scratch.
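A minimal sketch of chaining with anyhow, assuming nothing about this crate's own types (the function and messages are illustrative only):

```rust
use anyhow::{Context, Result};

// A lower-level failure (float parsing here) becomes the source of the higher-level error.
fn parse_rate(raw: &str) -> Result<f64> {
    let rate: f64 = raw
        .parse()
        .context("max_base_error_rate is not a valid number")?;
    Ok(rate)
}

fn main() {
    if let Err(e) = parse_rate("not-a-number") {
        // "{:#}" prints the context followed by the root cause, e.g.
        // "max_base_error_rate is not a valid number: invalid float literal".
        eprintln!("{:#}", e);
    }
}
```

anyhow::Error keeps the whole cause chain, so a failure deep in parsing still surfaces the higher-level context at the top.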

@@ -0,0 +1,57 @@
use std::io::{self, Error, ErrorKind};

use crate::base::*;

impl Parse<FilterStats> for FilterStats {
fn lparse(&self) -> io::Result<Box<FilterStats>> {
let remove_ns = self.remove_ns.clone();
let remove_monoallelic = self.remove_monoallelic.clone();
let keep_lowercase_reference = self.keep_lowercase_reference.clone();
let max_base_error_rate = if self.max_base_error_rate <= 1.0 && self.max_base_error_rate >= 0.0 {
self.max_base_error_rate.clone()
} else {
return Err(Error::new(
ErrorKind::Other,
"Invalid range. max_base_error_rate must be between 0.0 and 1.0",
));
};
let min_coverage_depth = self.min_coverage_depth;
let min_coverage_breadth = if self.min_coverage_breadth <= 1.0 && self.min_coverage_breadth >= 0.0 {
self.min_coverage_breadth.clone()
} else {
return Err(Error::new(
ErrorKind::Other,
"Invalid range. min_coverage_breadth must be between 0.0 and 1.0",
));
};
let min_allele_frequency = if self.min_allele_frequency <= 1.0 && self.min_allele_frequency >= 0.0 {
self.min_allele_frequency.clone()
} else {
return Err(Error::new(
ErrorKind::Other,
"Invalid range. min_allele_frequency must be between 0.0 and 1.0",
));
};
let max_missingness_rate = if self.max_missingness_rate <= 1.0 && self.max_missingness_rate >= 0.0 {
self.max_missingness_rate.clone()
} else {
return Err(Error::new(
ErrorKind::Other,
"Invalid range. max_missingness_rate must be between 0.0 and 1.0",
));
};
let pool_sizes = self.pool_sizes.clone();
return Ok(Box::new(FilterStats {
remove_ns,
remove_monoallelic,
keep_lowercase_reference,
max_base_error_rate,
min_coverage_depth,
min_coverage_breadth,
min_allele_frequency,
max_missingness_rate,
pool_sizes,
}));
}
}
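The four bounds checks in `lparse` repeat the same 0.0-1.0 test and differ only in the field they validate, which invites copy-paste slips. A minimal sketch of a helper that could centralize this (the `validate_unit_interval` name is illustrative, not part of the crate):

```rust
use std::io::{self, Error, ErrorKind};

// Hypothetical helper: return the value if it lies in [0.0, 1.0], otherwise an error naming the field.
fn validate_unit_interval(name: &str, value: f64) -> io::Result<f64> {
    if (0.0..=1.0).contains(&value) {
        Ok(value)
    } else {
        Err(Error::new(
            ErrorKind::Other,
            format!("Invalid range. {} must be between 0.0 and 1.0", name),
        ))
    }
}

// Usage inside lparse() would then reduce each check to a single line, e.g.:
// let max_base_error_rate = validate_unit_interval("max_base_error_rate", self.max_base_error_rate)?;
```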

1 change: 1 addition & 0 deletions src/base/mod.rs
@@ -6,3 +6,4 @@ mod pileup;
mod structs_and_traits;
mod sync;
mod vcf;
mod filter_stats;
14 changes: 7 additions & 7 deletions src/base/pileup.rs
@@ -237,7 +237,7 @@ impl Filter for PileupLine
/// Filter `PileupLine` by:
/// - removing the entire locus if the locus is fixed, i.e. only 1 allele was found or retained after filterings
/// Note that we are not removing alleles per locus if they fail the minimum allele frequency threshold, only if all alleles fail this threshold, i.e. when the locus is close to being fixed
fn filter(&mut self, filter_stats: &FilterStats) -> io::Result<&mut Self> {
fn filter(&mut self, filter_stats: &FilterStats) -> io::Result<Option<&mut Self>> {
// First, make sure we have the correct number of expected pools, matching the phenotype file
// TODO: Make the error pop-out because it's currently being consumed as None in the only function calling it below.
if self.coverages.len() != filter_stats.pool_sizes.len() {
@@ -274,7 +274,7 @@ impl Filter for PileupLine
.count();

if pools_covered != min_coverage_breadth as usize {
return Err(Error::new(ErrorKind::Other, "Filtered out."));
return Ok(None)
}

// Remove monoallelic loci (each loci must have coverage of at least 2 alleles)
@@ -292,7 +292,7 @@ impl Filter for PileupLine
}
}
if unique_alleles.len() < 2 {
return Err(Error::new(ErrorKind::Other, "Filtered out."));
return Ok(None)
}
}

@@ -357,9 +357,9 @@ impl Filter for PileupLine
}
// Filter the whole locus depending on whether or not we have retained at least 2 alleles
if m < 2 {
return Err(Error::new(ErrorKind::Other, "Filtered out."));
return Ok(None)
}
Ok(self)
Ok(Some(self))
}
}

@@ -368,8 +368,8 @@ pub fn pileup_to_sync(pileup_line: &mut PileupLine, filter_stats: &FilterStats)
// let mut pileup_line: Box<PileupLine> = line.lparse().unwrap();
// Filter
match pileup_line.filter(filter_stats) {
Ok(x) => x,
Err(_) => return None,
Ok(Some(x)) => x,
_ => return None,
};
// Convert to counts
let locus_counts = match pileup_line.to_counts() {
2 changes: 1 addition & 1 deletion src/base/structs_and_traits.rs
@@ -232,7 +232,7 @@ pub trait Parse<T> {
pub trait Filter {
fn to_counts(&self) -> io::Result<Box<LocusCounts>>;
fn to_frequencies(&self) -> io::Result<Box<LocusFrequencies>>;
fn filter(&mut self, filter_stats: &FilterStats) -> io::Result<&mut Self>;
fn filter(&mut self, filter_stats: &FilterStats) -> io::Result<Option<&mut Self>>;
}

pub trait Sort {
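With the new signature, callers can distinguish a locus that merely failed the filters (`Ok(None)`) from a genuine error (`Err`). The call sites below collapse both into `None`; a minimal sketch of the three-way handling, using a hypothetical stand-in type rather than the crate's own structs:

```rust
use std::io;

// Hypothetical record standing in for PileupLine / LocusCounts / VcfLine.
struct Record;

impl Record {
    fn filter(&mut self, _filter_stats: &()) -> io::Result<Option<&mut Self>> {
        Ok(Some(self)) // Ok(None) when filtered out; Err(..) only for real failures
    }
}

fn process(record: &mut Record) -> Option<String> {
    match record.filter(&()) {
        Ok(Some(_)) => {}        // passed the filters: continue processing
        Ok(None) => return None, // filtered out: skip quietly
        Err(_) => return None,   // genuine error: skipped here, but could be propagated or logged
    }
    Some(String::from("converted"))
}
```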
27 changes: 14 additions & 13 deletions src/base/sync.rs
@@ -192,7 +192,7 @@ impl Filter for LocusCounts {
}

// Filter PileupLine by minimum coverage, minimum quality
fn filter(&mut self, filter_stats: &FilterStats) -> io::Result<&mut Self> {
fn filter(&mut self, filter_stats: &FilterStats) -> io::Result<Option<&mut Self>> {
// Cannot filter by base qualities as this information is lost and we are assuming this has been performed during pileup to sync conversion
// Preliminary check of the structure format
self.check().unwrap();
@@ -225,7 +225,7 @@
.iter()
.fold(sum_coverage[0], |min, &x| if x < min { x } else { min });
if min_sum_coverage < filter_stats.min_coverage_depth as f64 {
return Err(Error::new(ErrorKind::Other, "Filtered out."));
return Ok(None)
}
// // TODO: convert loci failing the minimum coverage threshold into missing instead of omitting the entire locus
// for i in 0..self.matrix.nrows() {
@@ -282,7 +282,7 @@
}
// Check if all alleles have failed the minimum allele frequency, i.e. the locus has been filtered out
if p < 2 {
return Err(Error::new(ErrorKind::Other, "Filtered out."));
return Ok(None)
}
// Filter out if the locus is missing across all pools using the first allele where if the locus is missing then all
let (n, _p) = allele_frequencies.matrix.dim();
@@ -291,15 +291,15 @@
.slice(s![.., 0])
.fold(0, |sum, &x| if (x as f64).is_nan() { sum + 1 } else { sum });
if n_missing_across_pools == n {
return Err(Error::new(ErrorKind::Other, "Filtered out."));
return Ok(None)
}
// Filter out the locus if the rate of missingness, i.e. the fraction of pools missing coverage of the current locus, exceeds the maximum threshold
if (n_missing_across_pools as f64 / n as f64) > filter_stats.max_missingness_rate {
return Err(Error::new(ErrorKind::Other, "Filtered out."));
return Ok(None)
}
// Return the locus if it passed all the filtering steps
self.matrix = matrix;
Ok(self)
Ok(Some(self))
}
}

@@ -376,7 +376,7 @@ impl Filter for LocusFrequencies {
}

// Filter PileupLine by minimum coverage, minimum quality
fn filter(&mut self, filter_stats: &FilterStats) -> io::Result<&mut Self> {
fn filter(&mut self, filter_stats: &FilterStats) -> io::Result<Option<&mut Self>> {
// Cannot filter by base qualities as this information is lost and we are assuming this has been performed during pileup to sync conversion
// Also, cannot filter by minimum coverage as that data is lost from counts to frequencies conversion
// Preliminary check of the structure format
@@ -453,7 +453,7 @@
}
// Check if all alleles have failed the minimum allele frequency, i.e. the locus has been filtered out
if p < 2 {
return Err(Error::new(ErrorKind::Other, "Filtered out."));
return Ok(None)
}
// Filter out if the locus is missing across all pools using the first allele where if the locus is missing then all
let (n, _p) = allele_frequencies.matrix.dim();
@@ -462,15 +462,15 @@
.slice(s![.., 0])
.fold(0, |sum, &x| if (x as f64).is_nan() { sum + 1 } else { sum });
if n_missing_across_pools == n {
return Err(Error::new(ErrorKind::Other, "Filtered out."));
return Ok(None)
}
// Filter out the locus if the rate of missingness, i.e. the fraction of pools missing coverage of the current locus, exceeds the maximum threshold
if (n_missing_across_pools as f64 / n as f64) > filter_stats.max_missingness_rate {
return Err(Error::new(ErrorKind::Other, "Filtered out."));
return Ok(None)
}
// Return the locus if it passed all the filtering steps
self.matrix = matrix;
Ok(self)
Ok(Some(self))
}
}

@@ -1020,8 +1020,8 @@ impl LoadAll for FileSyncPhen {
},
};
match locus_counts.filter(filter_stats) {
Ok(x) => x,
Err(_) => continue,
Ok(Some(x)) => x,
_ => continue,
};
let mut locus_frequencies = match locus_counts.to_frequencies() {
Ok(x) => *x,
@@ -1584,6 +1584,7 @@ mod tests {
.clone()
.filter(&filter_stats)
.unwrap()
.unwrap()
.to_frequencies()
.unwrap());
let mut sorted_filtered_frequencies = filtered_frequencies.clone();
18 changes: 9 additions & 9 deletions src/base/vcf.rs
@@ -114,7 +114,7 @@ impl Filter for VcfLine {
/// Filter `VcfLine` by:
/// - removing the entire locus if the locus is fixed, i.e. only 1 allele was found or retained after filterings
/// Note that we are not removing alleles per locus if they fail the minimum allele frequency threshold, only if all alleles fail this threshold, i.e. when the locus is close to being fixed
fn filter(&mut self, filter_stats: &FilterStats) -> io::Result<&mut Self> {
fn filter(&mut self, filter_stats: &FilterStats) -> io::Result<Option<&mut Self>> {
// Coverage depth and breadth requirement
let min_coverage_breadth = (filter_stats.min_coverage_breadth * filter_stats.pool_sizes.len() as f64).ceil() as u32;
let mut pools_covered = 0;
@@ -128,13 +128,13 @@
}
}
if pools_covered != min_coverage_breadth {
return Err(Error::new(ErrorKind::Other, "Filtered out."));
return Ok(None)
}

// Remove monoallelic loci (each loci must have coverage of at least 2 alleles)
if filter_stats.remove_monoallelic {
if self.alternative_alleles.len() == 0 {
return Err(Error::new(ErrorKind::Other, "Filtered out."));
return Ok(None)
}
}

@@ -179,18 +179,18 @@
}
// Filter the whole locus depending on whether or not we have retained at least 2 alleles
if m < 2 {
return Err(Error::new(ErrorKind::Other, "Filtered out."));
return Ok(None)
}
Ok(self)
Ok(Some(self))
}
}

/// Convert `VcfLine` into a string representing a line or locus in a `sync` file.
pub fn vcf_to_sync(vcf_line: &mut VcfLine, filter_stats: &FilterStats) -> Option<String> {
// Filter
match vcf_line.filter(filter_stats) {
Ok(x) => x,
Err(_) => return None,
Ok(Some(x)) => x,
_ => return None,
};
// Convert to counts
let locus_counts = match vcf_line.to_counts() {
@@ -557,8 +557,8 @@ mod tests {
);
filtered_vcf_1.filter(&filter_stats_1).unwrap();
let err = match filtered_vcf_2.filter(&filter_stats_2) {
Ok(_) => 0,
Err(_) => 1,
Ok(Some(_)) => 0,
_ => 1,
};
assert_eq!(filtered_vcf_1, vcf_line);
assert_eq!(err, 1);
4 changes: 2 additions & 2 deletions src/gwas/correlation_test.rs
@@ -81,8 +81,8 @@ pub fn correlation(
.locus_counts
.filter(filter_stats)
{
Ok(x) => x.clone(),
Err(_) => return None,
Ok(Some(x)) => x,
_ => return None,
};
let locus_frequencies = match locus_counts.to_frequencies() {
Ok(x) => x,
4 changes: 2 additions & 2 deletions src/gwas/gwalpha.rs
@@ -177,8 +177,8 @@ fn prepare_geno_and_pheno_stats(
.locus_counts
.filter(filter_stats)
{
Ok(x) => x,
Err(_) => return None,
Ok(Some(x)) => x,
_ => return None,
};
let mut locus_frequencies = match locus_counts.to_frequencies() {
Ok(x) => x,
4 changes: 2 additions & 2 deletions src/gwas/mle.rs
@@ -240,8 +240,8 @@ pub fn mle_iterate(
.locus_counts
.filter(filter_stats)
{
Ok(x) => x,
Err(_) => return None,
Ok(Some(x)) => x,
_ => return None,
};
let mut locus_frequencies = match locus_counts.to_frequencies() {
Ok(x) => x,
4 changes: 2 additions & 2 deletions src/gwas/ols.rs
@@ -211,8 +211,8 @@ pub fn ols_iterate(
.locus_counts
.filter(filter_stats)
{
Ok(x) => x,
Err(_) => return None,
Ok(Some(x)) => x,
_ => return None,
};
let mut locus_frequencies = match locus_counts.to_frequencies() {
Ok(x) => x,
3 changes: 2 additions & 1 deletion src/main.rs
@@ -193,7 +193,7 @@ fn main() {
format: phen_format,
};
let phen = file_phen.lparse().unwrap();
let filter_stats = base::FilterStats {
let file_filter_stats = base::FilterStats {
remove_ns: !args.keep_ns,
remove_monoallelic: args.remove_monoallelic,
keep_lowercase_reference: args.keep_lowercase_reference,
@@ -204,6 +204,7 @@
max_missingness_rate: args.max_missingness_rate,
pool_sizes: phen.pool_sizes.clone(),
};
let filter_stats = file_filter_stats.lparse().unwrap();
if args.analysis == String::from("pileup2sync") {
// PILEUP INPUT
let file_pileup = base::FilePileup {
4 changes: 2 additions & 2 deletions src/tables/chisq_test.rs
@@ -4,8 +4,8 @@ use statrs::distribution::{ChiSquared, ContinuousCDF};

pub fn chisq(locus_counts: &mut LocusCounts, filter_stats: &FilterStats) -> Option<String> {
let locus_counts = match locus_counts.filter(filter_stats) {
Ok(x) => x,
Err(_) => return None,
Ok(Some(x)) => x,
_ => return None,
};
let locus_frequencies = match locus_counts.to_frequencies() {
Ok(x) => x,
4 changes: 2 additions & 2 deletions src/tables/fisher_exact_test.rs
@@ -31,8 +31,8 @@ fn hypergeom_ratio(counts: &Array2<f64>, log_prod_fac_marginal_sums: &f64) -> io

pub fn fisher(locus_counts: &mut LocusCounts, filter_stats: &FilterStats) -> Option<String> {
let locus_counts = match locus_counts.filter(filter_stats) {
Ok(x) => x,
Err(_) => return None,
Ok(Some(x)) => x,
_ => return None,
};
// Restrict so that the sum is less than or equal to 34, i.e. at n>34 : n! > f64::MAX
let n = locus_counts.matrix.nrows();