-
Notifications
You must be signed in to change notification settings - Fork 1
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
fq trimming #37
Open
brentp
wants to merge
28
commits into
main
Choose a base branch
from
trimmer
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
fq trimming #37
Changes from 27 commits
Commits
Show all changes
28 commits
Select commit
Hold shift + click to select a range
150c4be
WIP: fq stuff
brentp 0957ea3
change max_diffs to max_overlap_error_rate
brentp 73f78b8
renaming
brentp e2bdcfa
start of CLI stuff.
brentp f48a5c3
we have readers and writers
brentp 476ebfc
reading and writing fastqs
brentp fe8af74
actually correct based on overlap
brentp 9d54bec
moving average and base-quality
brentp 884dca3
match input output names to Tim's example
brentp 5efdf45
parsing of operations enum
brentp a164b37
main execution loop
brentp a4083b7
copy Tim's oscillation detection from python to rust
brentp 38f077c
more efficient identify_trim_point for oscillations
brentp bb4a10f
prevent some underflows
brentp d56a65a
stub out stats
brentp 1ea71c0
cleanup stats
brentp a68e6e3
try to clarify option usage.
brentp e21631a
more stats
brentp fc58ca1
bug fixes and read length stats
brentp 5b3043e
naming and length hist update
brentp 6201ea1
add Cargo.lock
brentp 7a2e646
logging in pair overlap
brentp 987d34e
warn that output files are always bgzipped
brentp 7263c5c
fix erroneous warning about .gz extension
brentp c351467
clippy: PathBuf -> Path
brentp 62258d7
fix case where there is no overhang from R1
brentp 2ee4d93
handle no overhang in r1 with adapter in r2
brentp 382e2a6
Io::is_gzip_path
brentp File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,3 @@ | ||
pub mod command; | ||
pub mod demux; | ||
pub mod trimmer; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,293 @@ | ||
use crate::commands::command::Command; | ||
use anyhow::{Error, Result}; | ||
use clap::{Parser, ValueEnum}; | ||
use fgoxide::io::Io; | ||
use fqtk_lib::fastq_stats as stats; | ||
use fqtk_lib::{base_quality, pair_overlap}; | ||
use log::info; | ||
use pooled_writer::{bgzf::BgzfCompressor, Pool, PoolBuilder, PooledWriter}; | ||
use seq_io::fastq::{Reader as FastqReader, Record}; | ||
use std::fs::File; | ||
use std::io::{BufRead, BufWriter, Write}; | ||
use std::path::Path; | ||
use std::path::PathBuf; | ||
|
||
#[derive(ValueEnum, Clone, Debug)] | ||
enum Operation { | ||
Clip, | ||
Overlap, | ||
Osc, | ||
FilterLen, | ||
} | ||
|
||
/// Trimming and overlap correction of paired-end reads | ||
#[derive(Parser, Debug)] | ||
#[command(version)] | ||
pub(crate) struct TrimmerOpts { | ||
/// Reading/Writing threads | ||
#[clap(long, short = 't', default_value = "5")] | ||
threads: usize, | ||
|
||
/// Clip bases with quality < this value. | ||
#[clap(long, short = 'q', default_value = "20")] | ||
clip_tail_quality: u8, | ||
|
||
/// Which tail(s) to clip. | ||
#[clap(long, value_enum, default_value = "end")] | ||
clip_tail_side: base_quality::Tail, | ||
|
||
/// Window size for moving average when clipping tails. | ||
#[clap(long, short = 'w', default_value = "20")] | ||
clip_tail_window: u8, | ||
|
||
/// Level of compression to use to compress outputs. | ||
#[clap(long, short = 'c', default_value = "5")] | ||
compression_level: usize, | ||
|
||
/// Length requirement of shorter read. Lengths below this are clipped. | ||
#[clap(long, short = 'S', default_value = "5")] | ||
filter_shorter: usize, | ||
|
||
/// Length requirement of longer read. Lengths below this are clipped. | ||
#[clap(long, short = 'L', default_value = "15")] | ||
filter_longer: usize, | ||
|
||
/// Size of window to look for oscillations. | ||
#[clap(long, default_value = "15")] | ||
osc_window: usize, | ||
|
||
/// Required number of oscillations in a window to trigger trimming/masking. | ||
#[clap(long, default_value = "4")] | ||
osc_max_oscillations: usize, | ||
|
||
/// Difference between adjacent bases to be considered on oscillation. | ||
#[clap(long, default_value = "10")] | ||
osc_delta: usize, | ||
|
||
/// Minimum difference in base-quality for one read to correct an overlapping | ||
/// base from the other read. | ||
#[clap(long, short = 'd', default_value = "15")] | ||
overlap_min_bq_delta: u8, | ||
|
||
/// Minimum pair overlap length to attempt correction. | ||
#[clap(long, short = 'l', default_value = "50")] | ||
overlap_min_length: usize, | ||
|
||
/// Maximum error-rate allowed in the overlap. | ||
#[clap(long, short = 'e', default_value = "0.02")] | ||
overlap_max_error_rate: f64, | ||
|
||
/// Hard clip adapter sequences from the reads detected in overlap module. | ||
/// If this is not specified, the adapter qualities are instead set to `mask_quality` | ||
#[clap(long, default_value_t = false)] | ||
overlap_hard_clip_adapters: bool, | ||
|
||
/// Quality value to use as a mask (should be lower than `clip_tail_quality`) | ||
#[clap(long, default_value = "0")] | ||
mask_quality: u8, | ||
|
||
/// Order of operations | ||
#[clap(value_enum, short = 'p', default_value = "overlap", num_args = 1..)] | ||
operations: Vec<Operation>, | ||
|
||
/// The paths for the 2 output FASTQs. | ||
#[clap(long, short = 'o', required = true, num_args = 2)] | ||
output: Vec<PathBuf>, | ||
|
||
/// Fastqs file for Read1 and Read2 | ||
#[clap(long, short = 'i', required = true, num_args = 2)] | ||
input: Vec<PathBuf>, | ||
} | ||
|
||
const BUFFER_SIZE: usize = 1024 * 1024; | ||
|
||
/// Type alias to prevent clippy complaining about type complexity | ||
type VecOfReaders = Vec<Box<dyn BufRead + Send>>; | ||
type VecOfFqReaders = Vec<FastqReader<Box<dyn BufRead + Send>>>; | ||
|
||
fn create_writer<P: AsRef<Path>>(name: P) -> Result<BufWriter<File>, Error> { | ||
Ok(BufWriter::new(File::create(name)?)) | ||
} | ||
|
||
/// Returns `true` when the final extension of `p` is exactly `gz` or `bgz`.
///
/// Paths with no extension, or with an extension that is not valid UTF-8, yield `false`.
/// The comparison is case-sensitive.
fn check_extension(p: &Path) -> bool {
    matches!(p.extension().and_then(|ext| ext.to_str()), Some("gz" | "bgz"))
}
|
||
impl TrimmerOpts { | ||
fn prepare(&self) -> Result<(Pool, Vec<PooledWriter>, VecOfFqReaders), Error> { | ||
let fgio = Io::new(5, BUFFER_SIZE); | ||
let fq_readers = self | ||
.input | ||
.iter() | ||
.map(|p| fgio.new_reader(p)) | ||
.collect::<Result<VecOfReaders, fgoxide::FgError>>()?; | ||
|
||
let fq_readers = | ||
fq_readers.into_iter().map(|fq| FastqReader::with_capacity(fq, BUFFER_SIZE)).collect(); | ||
|
||
let output: Vec<_> = self | ||
.output | ||
.iter() | ||
.map(|p| { | ||
if !check_extension(p) { | ||
log::warn!( | ||
"Output file {} does not end with .gz or .bgz. Writing bgzipped output to {} instead.", | ||
p.display(), | ||
p.with_extension("gz").display() | ||
); | ||
p.with_extension("gz") | ||
} else { | ||
p.clone() | ||
} | ||
}) | ||
.collect(); | ||
|
||
let writers = vec![create_writer(&output[0])?, create_writer(&output[1])?]; | ||
|
||
let mut pool_builder = PoolBuilder::<_, BgzfCompressor>::new() | ||
.threads(self.threads) | ||
.queue_size(self.threads * 50) | ||
.compression_level(u8::try_from(self.compression_level)?)?; | ||
|
||
let pooled_writers = | ||
writers.into_iter().map(|w| pool_builder.exchange(w)).collect::<Vec<_>>(); | ||
|
||
let pool = pool_builder.build()?; | ||
|
||
Ok((pool, pooled_writers, fq_readers)) | ||
} | ||
} | ||
|
||
impl Command for TrimmerOpts { | ||
fn execute(&self) -> Result<()> { | ||
let (mut pool, mut writers, mut readers) = self.prepare()?; | ||
let f1 = readers.remove(0); | ||
let f2 = readers.remove(0); | ||
|
||
let mut stats = stats::Stats::new(); | ||
|
||
'pair: for (r1, r2) in f1.into_records().zip(f2.into_records()) { | ||
let mut r1 = r1?; | ||
let mut r2 = r2?; | ||
|
||
stats.update_length(r1.seq.len(), stats::When::Pre, stats::ReadI::R1); | ||
stats.update_length(r2.seq.len(), stats::When::Pre, stats::ReadI::R2); | ||
|
||
for operation in &self.operations { | ||
match operation { | ||
Operation::Clip => { | ||
for r in [&mut r1, &mut r2].iter_mut() { | ||
let hq_range = base_quality::find_high_quality_bases( | ||
r.qual(), | ||
self.clip_tail_quality, | ||
self.clip_tail_window, | ||
self.clip_tail_side, | ||
); | ||
// this is hard clip so we send None | ||
base_quality::mask_read(r, hq_range, None); | ||
} | ||
} | ||
Operation::Overlap => { | ||
if let Some(overlap) = pair_overlap::find_overlap( | ||
r1.seq(), | ||
r2.seq(), | ||
self.overlap_min_length, | ||
self.overlap_max_error_rate, | ||
) { | ||
log::debug!( | ||
"found overlap in pair: {} shift: {}, overlap: {}, adapter: {}", | ||
r1.id().unwrap_or("read"), | ||
overlap.shift, | ||
overlap.overlap, | ||
overlap.adapter | ||
); | ||
stats.overlap_stats.update(overlap); | ||
let corrections = overlap.correct( | ||
&mut r1, | ||
&mut r2, | ||
self.overlap_min_bq_delta, | ||
if self.overlap_hard_clip_adapters { | ||
None | ||
} else { | ||
Some(self.mask_quality) | ||
}, | ||
); | ||
log::debug!("corrections: {:?}", corrections); | ||
stats.overlap_stats.update_corrections(corrections.0, stats::ReadI::R1); | ||
stats.overlap_stats.update_corrections(corrections.1, stats::ReadI::R2); | ||
} | ||
} | ||
Operation::Osc => { | ||
if let Some(i) = base_quality::identify_trim_point( | ||
r1.qual(), | ||
self.osc_delta as i32, | ||
self.osc_window, | ||
self.osc_max_oscillations, | ||
) { | ||
base_quality::mask_read(&mut r1, 0usize..i, Some(self.mask_quality)); | ||
stats.update_oscillations(1, stats::ReadI::R1); | ||
} | ||
if let Some(i) = base_quality::identify_trim_point( | ||
r2.qual(), | ||
self.osc_delta as i32, | ||
self.osc_window, | ||
self.osc_max_oscillations, | ||
) { | ||
base_quality::mask_read(&mut r2, 0usize..i, Some(self.mask_quality)); | ||
stats.update_oscillations(1, stats::ReadI::R2); | ||
} | ||
} | ||
Operation::FilterLen => { | ||
if r1.seq().len().min(r2.seq().len()) < self.filter_shorter | ||
|| r1.seq().len().max(r2.seq().len()) < self.filter_longer | ||
{ | ||
info!("Skipping pair with short read"); | ||
stats.increment_length_filter(); | ||
continue 'pair; | ||
} | ||
} | ||
} | ||
} | ||
|
||
stats.update_length(r1.seq.len(), stats::When::Post, stats::ReadI::R1); | ||
stats.update_length(r2.seq.len(), stats::When::Post, stats::ReadI::R2); | ||
|
||
r1.write(&mut writers[0])?; | ||
r2.write(&mut writers[1])?; | ||
} | ||
|
||
writeln!(std::io::stderr(), "{}", stats)?; | ||
|
||
writers.into_iter().try_for_each(|w| w.close())?; | ||
pool.stop_pool()?; | ||
|
||
Ok(()) | ||
} | ||
} | ||
|
||
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::Path;

    /// A `.gz` extension is accepted as compressed output.
    #[test]
    fn test_check_extension_gz() {
        assert!(
            check_extension(Path::new("test_file.gz")),
            "The function should return true for .gz extension"
        );
    }

    /// Any other extension is rejected.
    #[test]
    fn test_check_extension_txt() {
        assert!(
            !check_extension(Path::new("test_file.txt")),
            "The function should return false for .txt extension"
        );
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can we use: https://github.com/fulcrumgenomics/fgoxide/blob/e3a3068c451d8a06491e499075032e2075c41e7c/src/io/mod.rs#L56
Or better yet:
https://github.com/fulcrumgenomics/fgoxide/blob/e3a3068c451d8a06491e499075032e2075c41e7c/src/io/mod.rs#L82C8-L82C20
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
see: fulcrumgenomics/fgoxide#11