From cf7116d7773951c67b593411b003b596c6d0637f Mon Sep 17 00:00:00 2001 From: Cj Jiang Date: Sun, 14 Jul 2024 12:46:57 +1000 Subject: [PATCH 1/3] feat: argument bounds checking --- src/base/filter_stats.rs | 57 ++++++++++++++++++++++++++++++++++++++++ src/base/mod.rs | 1 + src/main.rs | 3 ++- 3 files changed, 60 insertions(+), 1 deletion(-) create mode 100644 src/base/filter_stats.rs diff --git a/src/base/filter_stats.rs b/src/base/filter_stats.rs new file mode 100644 index 0000000..14f91e6 --- /dev/null +++ b/src/base/filter_stats.rs @@ -0,0 +1,57 @@ +use std::io::{self, Error, ErrorKind}; + +use crate::base::*; + +impl Parse for FilterStats { + fn lparse(&self) -> io::Result> { + let remove_ns = self.remove_ns.clone(); + let remove_monoallelic = self.remove_monoallelic.clone(); + let keep_lowercase_reference = self.keep_lowercase_reference.clone(); + let max_base_error_rate = if self.max_base_error_rate <= 1.0 && self.max_base_error_rate >= 0.0 { + self.max_base_error_rate.clone() + } else { + return Err(Error::new( + ErrorKind::Other, + "Invalid range. max_base_error_rate must be between 0.0 and 1.0", + )); + }; + let min_coverage_depth = self.min_coverage_depth; + let min_coverage_breadth = if self.min_coverage_breadth <= 1.0 && self.max_base_error_rate >= 0.0 { + self.min_coverage_breadth.clone() + } else { + return Err(Error::new( + ErrorKind::Other, + "Invalid range. min_coverage_breadth must be between 0.0 and 1.0", + )); + }; + let min_allele_frequency = if self.min_allele_frequency <= 1.0 && self.min_allele_frequency >= 0.0 { + self.min_allele_frequency.clone() + } else { + return Err(Error::new( + ErrorKind::Other, + "Invalid range. min_allele_frequency must be between 0.0 and 1.0", + )); + }; + let max_missingness_rate = if self.max_missingness_rate <= 1.0 && self.max_missingness_rate >= 0.0 { + self.max_missingness_rate.clone() + } else { + return Err(Error::new( + ErrorKind::Other, + "Invalid range. max_missingness_rate must be between 0.0 and 1.0", + )); + }; + let pool_sizes = self.pool_sizes.clone(); + return Ok(Box::new(FilterStats { + remove_ns, + remove_monoallelic, + keep_lowercase_reference, + max_base_error_rate, + min_coverage_depth, + min_coverage_breadth, + min_allele_frequency, + max_missingness_rate, + pool_sizes, + })); + } +} + diff --git a/src/base/mod.rs b/src/base/mod.rs index 069ad76..8c99ed7 100644 --- a/src/base/mod.rs +++ b/src/base/mod.rs @@ -6,3 +6,4 @@ mod pileup; mod structs_and_traits; mod sync; mod vcf; +mod filter_stats; diff --git a/src/main.rs b/src/main.rs index f0a4058..5674432 100644 --- a/src/main.rs +++ b/src/main.rs @@ -193,7 +193,7 @@ fn main() { format: phen_format, }; let phen = file_phen.lparse().unwrap(); - let filter_stats = base::FilterStats { + let file_filter_stats = base::FilterStats { remove_ns: !args.keep_ns, remove_monoallelic: args.remove_monoallelic, keep_lowercase_reference: args.keep_lowercase_reference, @@ -204,6 +204,7 @@ fn main() { max_missingness_rate: args.max_missingness_rate, pool_sizes: phen.pool_sizes.clone(), }; + let filter_stats = file_filter_stats.lparse().unwrap(); if args.analysis == String::from("pileup2sync") { // PILEUP INPUT let file_pileup = base::FilePileup { From 52c8806e18abd062c3cb4a41884f81e3b20e272d Mon Sep 17 00:00:00 2001 From: Cj Jiang Date: Sun, 14 Jul 2024 13:27:58 +1000 Subject: [PATCH 2/3] refactor: Option filtering rather than Errors --- src/base/pileup.rs | 14 +++++++------- src/base/structs_and_traits.rs | 2 +- src/base/sync.rs | 23 ++++++++++++----------- src/base/vcf.rs | 10 +++++----- src/gwas/correlation_test.rs | 4 ++-- src/gwas/gwalpha.rs | 4 ++-- src/gwas/mle.rs | 4 ++-- src/gwas/ols.rs | 4 ++-- src/tables/chisq_test.rs | 4 ++-- src/tables/fisher_exact_test.rs | 4 ++-- 10 files changed, 37 insertions(+), 36 deletions(-) diff --git a/src/base/pileup.rs b/src/base/pileup.rs index 0f4bd7b..2bdb83f 100644 --- a/src/base/pileup.rs +++ b/src/base/pileup.rs @@ -237,7 +237,7 @@ impl Filter for PileupLine { /// Filter `PileupLine` by: /// - removing the entire locus if the locus is fixed, i.e. only 1 allele was found or retained after filterings /// Note that we are not removing alleles per locus if they fail the minimum allele frequency threshold, only if all alleles fail this threshold, i.e. when the locus is close to being fixed - fn filter(&mut self, filter_stats: &FilterStats) -> io::Result<&mut Self> { + fn filter(&mut self, filter_stats: &FilterStats) -> io::Result> { // First, make sure we have the correct the correct number of expected pools as the phenotype file // TODO: Make the error pop-out because it's currently being consumed as None in the only function calling it below. if self.coverages.len() != filter_stats.pool_sizes.len() { @@ -274,7 +274,7 @@ impl Filter for PileupLine { .count(); if pools_covered != min_coverage_breadth as usize { - return Err(Error::new(ErrorKind::Other, "Filtered out.")); + return Ok(None) } // Remove monoallelic loci (each loci must have coverage of at least 2 alleles) @@ -292,7 +292,7 @@ impl Filter for PileupLine { } } if unique_alleles.len() < 2 { - return Err(Error::new(ErrorKind::Other, "Filtered out.")); + return Ok(None) } } @@ -357,9 +357,9 @@ impl Filter for PileupLine { } // Filter the whole locus depending on whether or not we have retained at least 2 alleles if m < 2 { - return Err(Error::new(ErrorKind::Other, "Filtered out.")); + return Ok(None) } - Ok(self) + Ok(Some(self)) } } @@ -368,8 +368,8 @@ pub fn pileup_to_sync(pileup_line: &mut PileupLine, filter_stats: &FilterStats) // let mut pileup_line: Box = line.lparse().unwrap(); // Filter match pileup_line.filter(filter_stats) { - Ok(x) => x, - Err(_) => return None, + Ok(Some(x)) => x, + _ => return None, }; // Convert to counts let locus_counts = match pileup_line.to_counts() { diff --git a/src/base/structs_and_traits.rs b/src/base/structs_and_traits.rs index 8c47953..9908df2 100644 --- a/src/base/structs_and_traits.rs +++ b/src/base/structs_and_traits.rs @@ -232,7 +232,7 @@ pub trait Parse { pub trait Filter { fn to_counts(&self) -> io::Result>; fn to_frequencies(&self) -> io::Result>; - fn filter(&mut self, filter_stats: &FilterStats) -> io::Result<&mut Self>; + fn filter(&mut self, filter_stats: &FilterStats) -> io::Result>; } pub trait Sort { diff --git a/src/base/sync.rs b/src/base/sync.rs index e66c7e7..38387dd 100644 --- a/src/base/sync.rs +++ b/src/base/sync.rs @@ -192,7 +192,7 @@ impl Filter for LocusCounts { } // Filter PileupLine by minimum coverage, minimum quality - fn filter(&mut self, filter_stats: &FilterStats) -> io::Result<&mut Self> { + fn filter(&mut self, filter_stats: &FilterStats) -> io::Result> { // Cannot filter by base qualities as this information is lost and we are assuming this has been performed during pileup to sync conversion // Preliminary check of the structure format self.check().unwrap(); @@ -225,7 +225,7 @@ impl Filter for LocusCounts { .iter() .fold(sum_coverage[0], |min, &x| if x < min { x } else { min }); if min_sum_coverage < filter_stats.min_coverage_depth as f64 { - return Err(Error::new(ErrorKind::Other, "Filtered out.")); + return Ok(None) } // // TODO: convert loci failing the minimum coverage threshold into missing instead of omitting the entire locus // for i in 0..self.matrix.nrows() { @@ -282,7 +282,7 @@ impl Filter for LocusCounts { } // Check if all alleles have failed the minimum allele frequency, i.e. the locus has been filtered out if p < 2 { - return Err(Error::new(ErrorKind::Other, "Filtered out.")); + return Ok(None) } // Filter out if the locus is missing across all pools using the first allele where if the locus is missing then all let (n, _p) = allele_frequencies.matrix.dim(); @@ -291,15 +291,15 @@ impl Filter for LocusCounts { .slice(s![.., 0]) .fold(0, |sum, &x| if (x as f64).is_nan() { sum + 1 } else { sum }); if n_missing_across_pools == n { - return Err(Error::new(ErrorKind::Other, "Filtered out.")); + return Ok(None) } // Filter-out the locus if the rate of missingness, i.e. the fraction of the pools missing coverage of the current locus is below the minimum threshold if (n_missing_across_pools as f64 / n as f64) > filter_stats.max_missingness_rate { - return Err(Error::new(ErrorKind::Other, "Filtered out.")); + return Ok(None) } // Return the locus if it passed all the filtering steps self.matrix = matrix; - Ok(self) + Ok(Some(self)) } } @@ -376,7 +376,7 @@ impl Filter for LocusFrequencies { } // Filter PileupLine by minimum coverage, minimum quality - fn filter(&mut self, filter_stats: &FilterStats) -> io::Result<&mut Self> { + fn filter(&mut self, filter_stats: &FilterStats) -> io::Result> { // Cannot filter by base qualities as this information is lost and we are assuming this has been performed during pileup to sync conversion // Also, cannot filter by minimum coverage as that data is lost from counts to frequencies conversion // Preliminary check of the structure format @@ -453,7 +453,7 @@ impl Filter for LocusFrequencies { } // Check if all alleles have failed the minimum allele frequency, i.e. the locus has been filtered out if p < 2 { - return Err(Error::new(ErrorKind::Other, "Filtered out.")); + return Ok(None) } // Filter out if the locus is missing across all pools using the first allele where if the locus is missing then all let (n, _p) = allele_frequencies.matrix.dim(); @@ -462,15 +462,15 @@ impl Filter for LocusFrequencies { .slice(s![.., 0]) .fold(0, |sum, &x| if (x as f64).is_nan() { sum + 1 } else { sum }); if n_missing_across_pools == n { - return Err(Error::new(ErrorKind::Other, "Filtered out.")); + return Ok(None) } // Filter-out the locus if the rate of missingness, i.e. the fraction of the pools missing coverage of the current locus is below the minimum threshold if (n_missing_across_pools as f64 / n as f64) > filter_stats.max_missingness_rate { - return Err(Error::new(ErrorKind::Other, "Filtered out.")); + return Ok(None) } // Return the locus if it passed all the filtering steps self.matrix = matrix; - Ok(self) + Ok(Some(self)) } } @@ -1584,6 +1584,7 @@ mod tests { .clone() .filter(&filter_stats) .unwrap() + .unwrap() .to_frequencies() .unwrap()); let mut sorted_filtered_frequencies = filtered_frequencies.clone(); diff --git a/src/base/vcf.rs b/src/base/vcf.rs index a5cb182..b9929b0 100644 --- a/src/base/vcf.rs +++ b/src/base/vcf.rs @@ -114,7 +114,7 @@ impl Filter for VcfLine { /// Filter `VcfLine` by: /// - removing the entire locus if the locus is fixed, i.e. only 1 allele was found or retained after filterings /// Note that we are not removing alleles per locus if they fail the minimum allele frequency threshold, only if all alleles fail this threshold, i.e. when the locus is close to being fixed - fn filter(&mut self, filter_stats: &FilterStats) -> io::Result<&mut Self> { + fn filter(&mut self, filter_stats: &FilterStats) -> io::Result> { // Coverage depth and breadth requirement let min_coverage_breadth = (filter_stats.min_coverage_breadth * filter_stats.pool_sizes.len() as f64).ceil() as u32; let mut pools_covered = 0; @@ -128,13 +128,13 @@ impl Filter for VcfLine { } } if pools_covered != min_coverage_breadth { - return Err(Error::new(ErrorKind::Other, "Filtered out.")); + return Ok(None) } // Remove monoallelic loci (each loci must have coverage of at least 2 alleles) if filter_stats.remove_monoallelic { if self.alternative_alleles.len() == 0 { - return Err(Error::new(ErrorKind::Other, "Filtered out.")); + return Ok(None) } } @@ -179,9 +179,9 @@ impl Filter for VcfLine { } // Filter the whole locus depending on whether or not we have retained at least 2 alleles if m < 2 { - return Err(Error::new(ErrorKind::Other, "Filtered out.")); + return Ok(None) } - Ok(self) + Ok(Some(self)) } } diff --git a/src/gwas/correlation_test.rs b/src/gwas/correlation_test.rs index 3b4493f..6092a4c 100644 --- a/src/gwas/correlation_test.rs +++ b/src/gwas/correlation_test.rs @@ -81,8 +81,8 @@ pub fn correlation( .locus_counts .filter(filter_stats) { - Ok(x) => x.clone(), - Err(_) => return None, + Ok(Some(x)) => x, + _ => return None, }; let locus_frequencies = match locus_counts.to_frequencies() { Ok(x) => x, diff --git a/src/gwas/gwalpha.rs b/src/gwas/gwalpha.rs index 2e9af66..d6cefa7 100644 --- a/src/gwas/gwalpha.rs +++ b/src/gwas/gwalpha.rs @@ -177,8 +177,8 @@ fn prepare_geno_and_pheno_stats( .locus_counts .filter(filter_stats) { - Ok(x) => x, - Err(_) => return None, + Ok(Some(x)) => x, + _ => return None, }; let mut locus_frequencies = match locus_counts.to_frequencies() { Ok(x) => x, diff --git a/src/gwas/mle.rs b/src/gwas/mle.rs index 940e5c8..7aa06a2 100644 --- a/src/gwas/mle.rs +++ b/src/gwas/mle.rs @@ -240,8 +240,8 @@ pub fn mle_iterate( .locus_counts .filter(filter_stats) { - Ok(x) => x, - Err(_) => return None, + Ok(Some(x)) => x, + _ => return None, }; let mut locus_frequencies = match locus_counts.to_frequencies() { Ok(x) => x, diff --git a/src/gwas/ols.rs b/src/gwas/ols.rs index f1ddfb5..74afff2 100644 --- a/src/gwas/ols.rs +++ b/src/gwas/ols.rs @@ -211,8 +211,8 @@ pub fn ols_iterate( .locus_counts .filter(filter_stats) { - Ok(x) => x, - Err(_) => return None, + Ok(Some(x)) => x, + _ => return None, }; let mut locus_frequencies = match locus_counts.to_frequencies() { Ok(x) => x, diff --git a/src/tables/chisq_test.rs b/src/tables/chisq_test.rs index fe49bb3..921f170 100644 --- a/src/tables/chisq_test.rs +++ b/src/tables/chisq_test.rs @@ -4,8 +4,8 @@ use statrs::distribution::{ChiSquared, ContinuousCDF}; pub fn chisq(locus_counts: &mut LocusCounts, filter_stats: &FilterStats) -> Option { let locus_counts = match locus_counts.filter(filter_stats) { - Ok(x) => x, - Err(_) => return None, + Ok(Some(x)) => x, + _ => return None, }; let locus_frequencies = match locus_counts.to_frequencies() { Ok(x) => x, diff --git a/src/tables/fisher_exact_test.rs b/src/tables/fisher_exact_test.rs index 226f952..9a6e32d 100644 --- a/src/tables/fisher_exact_test.rs +++ b/src/tables/fisher_exact_test.rs @@ -31,8 +31,8 @@ fn hypergeom_ratio(counts: &Array2, log_prod_fac_marginal_sums: &f64) -> io pub fn fisher(locus_counts: &mut LocusCounts, filter_stats: &FilterStats) -> Option { let locus_counts = match locus_counts.filter(filter_stats) { - Ok(x) => x, - Err(_) => return None, + Ok(Some(x)) => x, + _ => return None, }; // Restrict so that the sum is less than or equal to 34, i.e. at n>34 : n! > f64::MAX let n = locus_counts.matrix.nrows(); From a8e1ada9a13cc4b6b12d945aa7d444ff05c74e5b Mon Sep 17 00:00:00 2001 From: Cj Jiang Date: Sun, 14 Jul 2024 14:27:06 +1000 Subject: [PATCH 3/3] fix: option filtering --- src/base/sync.rs | 4 ++-- src/base/vcf.rs | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/base/sync.rs b/src/base/sync.rs index 38387dd..76f137b 100644 --- a/src/base/sync.rs +++ b/src/base/sync.rs @@ -1020,8 +1020,8 @@ impl LoadAll for FileSyncPhen { }, }; match locus_counts.filter(filter_stats) { - Ok(x) => x, - Err(_) => continue, + Ok(Some(x)) => x, + _ => continue, }; let mut locus_frequencies = match locus_counts.to_frequencies() { Ok(x) => *x, diff --git a/src/base/vcf.rs b/src/base/vcf.rs index b9929b0..a92ea2c 100644 --- a/src/base/vcf.rs +++ b/src/base/vcf.rs @@ -189,8 +189,8 @@ impl Filter for VcfLine { pub fn vcf_to_sync(vcf_line: &mut VcfLine, filter_stats: &FilterStats) -> Option { // Filter match vcf_line.filter(filter_stats) { - Ok(x) => x, - Err(_) => return None, + Ok(Some(x)) => x, + _ => return None, }; // Convert to counts let locus_counts = match vcf_line.to_counts() { @@ -557,8 +557,8 @@ mod tests { ); filtered_vcf_1.filter(&filter_stats_1).unwrap(); let err = match filtered_vcf_2.filter(&filter_stats_2) { - Ok(_) => 0, - Err(_) => 1, + Ok(Some(_)) => 0, + _ => 1, }; assert_eq!(filtered_vcf_1, vcf_line); assert_eq!(err, 1);