Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fq trimming #37

Open
wants to merge 28 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 27 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
150c4be
WIP: fq stuff
brentp Aug 16, 2023
0957ea3
change max_diffs to max_overlap_error_rate
brentp Aug 16, 2023
73f78b8
renaming
brentp Aug 16, 2023
e2bdcfa
start of CLI stuff.
brentp Aug 17, 2023
f48a5c3
we have readers and writers
brentp Aug 17, 2023
476ebfc
reading and writing fastqs
brentp Aug 17, 2023
fe8af74
actually correct based on overlap
brentp Aug 17, 2023
9d54bec
moving average and base-quality
brentp Aug 18, 2023
884dca3
match input output names to Tim's example
brentp Aug 18, 2023
5efdf45
parsing of operations enum
brentp Aug 18, 2023
a164b37
main execution loop
brentp Aug 18, 2023
a4083b7
copy Tim's oscillation detection from python to rust
brentp Aug 21, 2023
38f077c
more efficient identify_trim_point for oscillations
brentp Aug 21, 2023
bb4a10f
prevent some underflows
brentp Aug 21, 2023
d56a65a
stub out stats
brentp Aug 22, 2023
1ea71c0
cleanup stats
brentp Aug 23, 2023
a68e6e3
try to clarify option usage.
brentp Aug 23, 2023
e21631a
more stats
brentp Aug 23, 2023
fc58ca1
bug fixes and read length stats
brentp Aug 23, 2023
5b3043e
naming and length hist update
brentp Aug 23, 2023
6201ea1
add Cargo.lock
brentp Aug 24, 2023
7a2e646
logging in pair overlap
brentp Aug 24, 2023
987d34e
warn that output files are always bgzipped
brentp Aug 29, 2023
7263c5c
fix erroneous warning about .gz extension
brentp Oct 20, 2023
c351467
clippy: PathBuf -> Path
brentp Oct 23, 2023
62258d7
fix case where there is no overhang from R1
brentp Oct 23, 2023
2ee4d93
handle no overhang in r1 with adapter in r2
brentp Oct 23, 2023
382e2a6
Io::is_gzip_path
brentp Oct 24, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
305 changes: 204 additions & 101 deletions Cargo.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ path = "src/bin/main.rs"
ahash = "0.8.2"
anyhow = "1.0.38"
bstr = "1.0.1"
clap = { version = "4.0.25", features = ["derive"] }
clap = { version = "4.3.22", features = ["derive"] }
enum_dispatch = "0.3.8"
env_logger = "0.9.3"
fgoxide = "0.3.0"
Expand Down
1 change: 1 addition & 0 deletions src/bin/commands/mod.rs
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
pub mod command;
pub mod demux;
pub mod trimmer;
293 changes: 293 additions & 0 deletions src/bin/commands/trimmer.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,293 @@
use crate::commands::command::Command;
use anyhow::{Error, Result};
use clap::{Parser, ValueEnum};
use fgoxide::io::Io;
use fqtk_lib::fastq_stats as stats;
use fqtk_lib::{base_quality, pair_overlap};
use log::info;
use pooled_writer::{bgzf::BgzfCompressor, Pool, PoolBuilder, PooledWriter};
use seq_io::fastq::{Reader as FastqReader, Record};
use std::fs::File;
use std::io::{BufRead, BufWriter, Write};
use std::path::Path;
use std::path::PathBuf;

#[derive(ValueEnum, Clone, Debug)]
enum Operation {
Clip,
Overlap,
Osc,
FilterLen,
}

/// Trimming and overlap correction of paired-end reads
#[derive(Parser, Debug)]
#[command(version)]
pub(crate) struct TrimmerOpts {
/// Reading/Writing threads
#[clap(long, short = 't', default_value = "5")]
threads: usize,

/// Clip bases with quality < this value.
#[clap(long, short = 'q', default_value = "20")]
clip_tail_quality: u8,

/// Which tail(s) to clip.
#[clap(long, value_enum, default_value = "end")]
clip_tail_side: base_quality::Tail,

/// Window size for moving average when clipping tails.
#[clap(long, short = 'w', default_value = "20")]
clip_tail_window: u8,

/// Level of compression to use to compress outputs.
#[clap(long, short = 'c', default_value = "5")]
compression_level: usize,

/// Length requirement of shorter read. Lengths below this are clipped.
#[clap(long, short = 'S', default_value = "5")]
filter_shorter: usize,

/// Length requirement of longer read. Lengths below this are clipped.
#[clap(long, short = 'L', default_value = "15")]
filter_longer: usize,

/// Size of window to look for oscillations.
#[clap(long, default_value = "15")]
osc_window: usize,

/// Required number of oscillations in a window to trigger trimming/masking.
#[clap(long, default_value = "4")]
osc_max_oscillations: usize,

/// Difference between adjacent bases to be considered on oscillation.
#[clap(long, default_value = "10")]
osc_delta: usize,

/// Minimum difference in base-quality for one read to correct an overlapping
/// base from the other read.
#[clap(long, short = 'd', default_value = "15")]
overlap_min_bq_delta: u8,

/// Minimum pair overlap length to attempt correction.
#[clap(long, short = 'l', default_value = "50")]
overlap_min_length: usize,

/// Maximum error-rate allowed in the overlap.
#[clap(long, short = 'e', default_value = "0.02")]
overlap_max_error_rate: f64,

/// Hard clip adapter sequences from the reads detected in overlap module.
/// If this is not specified, the adapter qualities are instead set to `mask_quality`
#[clap(long, default_value_t = false)]
overlap_hard_clip_adapters: bool,

/// Quality value to use as a mask (should be lower than `clip_tail_quality`)
#[clap(long, default_value = "0")]
mask_quality: u8,

/// Order of operations
#[clap(value_enum, short = 'p', default_value = "overlap", num_args = 1..)]
operations: Vec<Operation>,

/// The paths for the 2 output FASTQs.
#[clap(long, short = 'o', required = true, num_args = 2)]
output: Vec<PathBuf>,

/// Fastqs file for Read1 and Read2
#[clap(long, short = 'i', required = true, num_args = 2)]
input: Vec<PathBuf>,
}

const BUFFER_SIZE: usize = 1024 * 1024;

/// Type alias to prevent clippy complaining about type complexity
type VecOfReaders = Vec<Box<dyn BufRead + Send>>;
type VecOfFqReaders = Vec<FastqReader<Box<dyn BufRead + Send>>>;

fn create_writer<P: AsRef<Path>>(name: P) -> Result<BufWriter<File>, Error> {
Ok(BufWriter::new(File::create(name)?))
}

fn check_extension(p: &Path) -> bool {
let ext = p.extension().map_or("", |v| v.to_str().unwrap_or(""));
["bgz", "gz"].contains(&ext)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

}

impl TrimmerOpts {
fn prepare(&self) -> Result<(Pool, Vec<PooledWriter>, VecOfFqReaders), Error> {
let fgio = Io::new(5, BUFFER_SIZE);
let fq_readers = self
.input
.iter()
.map(|p| fgio.new_reader(p))
.collect::<Result<VecOfReaders, fgoxide::FgError>>()?;

let fq_readers =
fq_readers.into_iter().map(|fq| FastqReader::with_capacity(fq, BUFFER_SIZE)).collect();

let output: Vec<_> = self
.output
.iter()
.map(|p| {
if !check_extension(p) {
log::warn!(
"Output file {} does not end with .gz or .bgz. Writing bgzipped output to {} instead.",
p.display(),
p.with_extension("gz").display()
);
p.with_extension("gz")
} else {
p.clone()
}
})
.collect();

let writers = vec![create_writer(&output[0])?, create_writer(&output[1])?];

let mut pool_builder = PoolBuilder::<_, BgzfCompressor>::new()
.threads(self.threads)
.queue_size(self.threads * 50)
.compression_level(u8::try_from(self.compression_level)?)?;

let pooled_writers =
writers.into_iter().map(|w| pool_builder.exchange(w)).collect::<Vec<_>>();

let pool = pool_builder.build()?;

Ok((pool, pooled_writers, fq_readers))
}
}

impl Command for TrimmerOpts {
fn execute(&self) -> Result<()> {
let (mut pool, mut writers, mut readers) = self.prepare()?;
let f1 = readers.remove(0);
let f2 = readers.remove(0);

let mut stats = stats::Stats::new();

'pair: for (r1, r2) in f1.into_records().zip(f2.into_records()) {
let mut r1 = r1?;
let mut r2 = r2?;

stats.update_length(r1.seq.len(), stats::When::Pre, stats::ReadI::R1);
stats.update_length(r2.seq.len(), stats::When::Pre, stats::ReadI::R2);

for operation in &self.operations {
match operation {
Operation::Clip => {
for r in [&mut r1, &mut r2].iter_mut() {
let hq_range = base_quality::find_high_quality_bases(
r.qual(),
self.clip_tail_quality,
self.clip_tail_window,
self.clip_tail_side,
);
// this is hard clip so we send None
base_quality::mask_read(r, hq_range, None);
}
}
Operation::Overlap => {
if let Some(overlap) = pair_overlap::find_overlap(
r1.seq(),
r2.seq(),
self.overlap_min_length,
self.overlap_max_error_rate,
) {
log::debug!(
"found overlap in pair: {} shift: {}, overlap: {}, adapter: {}",
r1.id().unwrap_or("read"),
overlap.shift,
overlap.overlap,
overlap.adapter
);
stats.overlap_stats.update(overlap);
let corrections = overlap.correct(
&mut r1,
&mut r2,
self.overlap_min_bq_delta,
if self.overlap_hard_clip_adapters {
None
} else {
Some(self.mask_quality)
},
);
log::debug!("corrections: {:?}", corrections);
stats.overlap_stats.update_corrections(corrections.0, stats::ReadI::R1);
stats.overlap_stats.update_corrections(corrections.1, stats::ReadI::R2);
}
}
Operation::Osc => {
if let Some(i) = base_quality::identify_trim_point(
r1.qual(),
self.osc_delta as i32,
self.osc_window,
self.osc_max_oscillations,
) {
base_quality::mask_read(&mut r1, 0usize..i, Some(self.mask_quality));
stats.update_oscillations(1, stats::ReadI::R1);
}
if let Some(i) = base_quality::identify_trim_point(
r2.qual(),
self.osc_delta as i32,
self.osc_window,
self.osc_max_oscillations,
) {
base_quality::mask_read(&mut r2, 0usize..i, Some(self.mask_quality));
stats.update_oscillations(1, stats::ReadI::R2);
}
}
Operation::FilterLen => {
if r1.seq().len().min(r2.seq().len()) < self.filter_shorter
|| r1.seq().len().max(r2.seq().len()) < self.filter_longer
{
info!("Skipping pair with short read");
stats.increment_length_filter();
continue 'pair;
}
}
}
}

stats.update_length(r1.seq.len(), stats::When::Post, stats::ReadI::R1);
stats.update_length(r2.seq.len(), stats::When::Post, stats::ReadI::R2);

r1.write(&mut writers[0])?;
r2.write(&mut writers[1])?;
}

writeln!(std::io::stderr(), "{}", stats)?;

writers.into_iter().try_for_each(|w| w.close())?;
pool.stop_pool()?;

Ok(())
}
}

#[cfg(test)]
mod tests {
use super::*;
use std::path::PathBuf;

#[test]
fn test_check_extension_gz() {
let path = PathBuf::from("test_file.gz");
assert_eq!(
check_extension(&path),
true,
"The function should return true for .gz extension"
);
}

#[test]
fn test_check_extension_txt() {
let path = PathBuf::from("test_file.txt");
assert_eq!(
check_extension(&path),
false,
"The function should return false for .txt extension"
);
}
}
3 changes: 2 additions & 1 deletion src/bin/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ pub mod commands;
use anyhow::Result;
use clap::Parser;
use commands::command::Command;
use commands::demux::Demux;
use commands::{demux::Demux, trimmer::TrimmerOpts};
use enum_dispatch::enum_dispatch;
use env_logger::Env;

Expand All @@ -23,6 +23,7 @@ struct Args {
#[command(version)]
enum Subcommand {
Demux(Demux),
Trimmer(TrimmerOpts),
}

fn main() -> Result<()> {
Expand Down
Loading
Loading