From 56442e4bc0b99ba72bacc8458a761a32ea0b0153 Mon Sep 17 00:00:00 2001 From: Matthias Zepper Date: Tue, 7 May 2024 15:03:42 +0200 Subject: [PATCH 01/15] If multiple output files need to be written, I obviously can't use the max_threads for each. --- README.md | 8 +++---- src/auxiliary.rs | 56 ++++++++++++++++++++++++++++++++++++++++++++- src/umi_external.rs | 17 +++++++------- 3 files changed, 67 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 55e6d75..8292e3c 100644 --- a/README.md +++ b/README.md @@ -17,12 +17,12 @@
-[![License:MIT](https://img.shields.io/badge/License-MIT-491f53.svg)](https://opensource.org/licenses/MIT) +[![License: MIT](https://img.shields.io/badge/License-MIT-491f53.svg)](https://opensource.org/licenses/MIT) ![GitHub Actions Tests](https://img.shields.io/github/actions/workflow/status/SciLifeLab/umi-transfer/.github%2Fworkflows%2Ftesting.yml?branch=dev&logo=github&label=Tests&color=%23a7c947) [![codecov](https://codecov.io/gh/SciLifeLab/umi-transfer/branch/dev/graph/badge.svg)](https://codecov.io/gh/SciLifeLab/umi-transfer) -![GitHub Actions Build](https://img.shields.io/github/actions/workflow/status/SciLifeLab/umi-transfer/.github%2Fworkflows%2Frelease.yml?branch=dev&label=Binary%20builds&logo=github&color=%23a7c947) -[![GitHub Actions Build](https://img.shields.io/github/actions/workflow/status/SciLifeLab/umi-transfer/.github%2Fworkflows%2Fcontainer.yml?branch=dev&label=Docker%20builds&logo=docker&color=%23a7c947)](https://hub.docker.com/r/mzscilifelab/umi-transfer) -[![install with Bioconda](https://img.shields.io/badge/Available%20via-Bioconda-045c64.svg)](https://bioconda.github.io/recipes/umi-transfer/README.html) +[![Build status](https://img.shields.io/github/actions/workflow/status/SciLifeLab/umi-transfer/.github%2Fworkflows%2Frelease.yml?branch=dev&label=Binary%20builds&logo=github&color=%23a7c947)](https://github.com/SciLifeLab/umi-transfer/releases/latest) +[![Docker container status](https://img.shields.io/github/actions/workflow/status/SciLifeLab/umi-transfer/.github%2Fworkflows%2Fcontainer.yml?branch=dev&label=Docker%20builds&logo=docker&color=%23a7c947)](https://hub.docker.com/r/mzscilifelab/umi-transfer) +[![Install with Bioconda](https://img.shields.io/badge/Available%20via-Bioconda-045c64.svg)](https://bioconda.github.io/recipes/umi-transfer/README.html) ## Background diff --git a/src/auxiliary.rs b/src/auxiliary.rs index 81d9099..a1a1d9e 100644 --- a/src/auxiliary.rs +++ b/src/auxiliary.rs @@ -1,4 +1,4 @@ -use std::time::Instant; +use 
std::{thread,time::Instant}; pub fn timedrun(msg: &str, func: F) -> R where @@ -9,3 +9,57 @@ where println!("{msg} after {:.1} seconds", start.elapsed().as_secs_f32()); measure } + + +pub fn threads_available() -> usize { + thread::available_parallelism() + .map(|cores| cores.get()) + .unwrap_or_else(|_| { + eprintln!( + "Failed to determine number of available threads. Please specify manually with --threads." + ); 1}) +} + +pub fn threads_per_task(available_threads: usize, num_tasks: usize) -> usize { + if available_threads <= 1 || available_threads <= num_tasks { + 1 + } else { + // Subtract 1 for the main thread + let threads_for_tasks = available_threads - 1; + // The result is already always rounded down towards zero for integer divisions using the / operator. + let threads_per_task = threads_for_tasks / num_tasks; + threads_per_task.max(1) + } +} + + + +#[cfg(test)] +mod tests { + + use super::*; + + #[test] + fn test_threads_available_returns_positive_number() { + let threads = threads_available(); + assert!(threads > 0); + } + + #[test] + fn test_threads_per_task_never_returns_less_than_one() { + let threads_per_task = threads_per_task(1,3); + assert!(threads_per_task == 1); + } + + #[test] + fn test_threads_per_task_splits_even_threads_correctly() { + let threads_per_task = threads_per_task(8,3); + assert!(threads_per_task == 2); + } + + #[test] + fn test_threads_per_task_splits_odd_threads_correctly() { + let threads_per_task = threads_per_task(10,3); + assert!(threads_per_task == 3); + } +} diff --git a/src/umi_external.rs b/src/umi_external.rs index f902c5b..8c43e61 100644 --- a/src/umi_external.rs +++ b/src/umi_external.rs @@ -1,9 +1,10 @@ use anyhow::{anyhow, Context, Result}; use clap::Parser; use itertools::izip; -use std::{path::PathBuf, thread}; +use std::path::PathBuf; use super::file_io; +use crate::auxiliary::{threads_available,threads_per_task}; use crate::umi_errors::RuntimeErrors; #[derive(Debug, Parser)] pub struct OptsExternal { @@ -101,14 
+102,12 @@ pub fn run(args: OptsExternal) -> Result { // Set the number of threads to max, unless manually specified. In case of failure, use only 1. let num_threads = args.num_threads.unwrap_or_else(|| { - thread::available_parallelism() - .map(|cores| cores.get()) - .unwrap_or_else(|_| { - eprintln!( - "Failed to determine number of available threads. Please specify manually with --threads." - ); 1}) + threads_available() }); + // Determine the number of threads available for output file compression. + let threads_per_task = threads_per_task(num_threads, 2); + // Read FastQ records from input files let r1 = file_io::read_fastq(&args.r1_in) .with_context(|| { @@ -157,14 +156,14 @@ pub fn run(args: OptsExternal) -> Result { let mut write_output_r1 = file_io::create_writer( output1, &args.gzip, - &num_threads, + &threads_per_task, &args.compression_level, None, )?; let mut write_output_r2 = file_io::create_writer( output2, &args.gzip, - &num_threads, + &threads_per_task, &args.compression_level, None, )?; From 764fe3819838f5d8a20d722eed43b6817f525949 Mon Sep 17 00:00:00 2001 From: Matthias Zepper Date: Wed, 22 May 2024 15:39:20 +0200 Subject: [PATCH 02/15] Alias bio::io::fastq::Reader as FastqReader analogous to FastqWriter. 
--- src/auxiliary.rs | 15 ++++++--------- src/file_io.rs | 4 ++-- src/umi_external.rs | 6 ++---- 3 files changed, 10 insertions(+), 15 deletions(-) diff --git a/src/auxiliary.rs b/src/auxiliary.rs index a1a1d9e..5fdeb11 100644 --- a/src/auxiliary.rs +++ b/src/auxiliary.rs @@ -1,4 +1,4 @@ -use std::{thread,time::Instant}; +use std::{thread, time::Instant}; pub fn timedrun(msg: &str, func: F) -> R where @@ -10,7 +10,6 @@ where measure } - pub fn threads_available() -> usize { thread::available_parallelism() .map(|cores| cores.get()) @@ -25,20 +24,18 @@ pub fn threads_per_task(available_threads: usize, num_tasks: usize) -> usize { 1 } else { // Subtract 1 for the main thread - let threads_for_tasks = available_threads - 1; + let threads_for_tasks = available_threads - 1; // The result is already always rounded down towards zero for integer divisions using the / operator. let threads_per_task = threads_for_tasks / num_tasks; threads_per_task.max(1) } } - - #[cfg(test)] mod tests { use super::*; - + #[test] fn test_threads_available_returns_positive_number() { let threads = threads_available(); @@ -47,19 +44,19 @@ mod tests { #[test] fn test_threads_per_task_never_returns_less_than_one() { - let threads_per_task = threads_per_task(1,3); + let threads_per_task = threads_per_task(1, 3); assert!(threads_per_task == 1); } #[test] fn test_threads_per_task_splits_even_threads_correctly() { - let threads_per_task = threads_per_task(8,3); + let threads_per_task = threads_per_task(8, 3); assert!(threads_per_task == 2); } #[test] fn test_threads_per_task_splits_odd_threads_correctly() { - let threads_per_task = threads_per_task(10,3); + let threads_per_task = threads_per_task(10, 3); assert!(threads_per_task == 3); } } diff --git a/src/file_io.rs b/src/file_io.rs index ec174ef..1a528da 100644 --- a/src/file_io.rs +++ b/src/file_io.rs @@ -1,6 +1,6 @@ use super::umi_errors::RuntimeErrors; use anyhow::{anyhow, Context, Result}; -use bio::io::fastq::{Record, Writer as FastqWriter}; 
+use bio::io::fastq::{Reader as FastqReader, Record, Writer as FastqWriter}; use dialoguer::{theme::ColorfulTheme, Confirm}; use file_format::FileFormat; use gzp::{deflate::Gzip, par::compress::Compression, ZBuilder, ZWriter}; @@ -46,7 +46,7 @@ pub fn read_fastq(path: &PathBuf) -> Result Result { } // Set the number of threads to max, unless manually specified. In case of failure, use only 1. - let num_threads = args.num_threads.unwrap_or_else(|| { - threads_available() - }); + let num_threads = args.num_threads.unwrap_or_else(|| threads_available()); // Determine the number of threads available for output file compression. let threads_per_task = threads_per_task(num_threads, 2); From 8bffbced3c4717285888c4eeb204cff0efeb8621 Mon Sep 17 00:00:00 2001 From: Matthias Zepper Date: Wed, 22 May 2024 19:32:15 +0200 Subject: [PATCH 03/15] Prepare the addition of binary test files to tests. --- tests/auxiliary.rs | 29 +++++++++++++++++++ ...integration_tests_external_filecontents.rs | 29 ++++++++++++++++++- 2 files changed, 57 insertions(+), 1 deletion(-) diff --git a/tests/auxiliary.rs b/tests/auxiliary.rs index c6720b3..b889524 100644 --- a/tests/auxiliary.rs +++ b/tests/auxiliary.rs @@ -4,6 +4,7 @@ use assert_fs::fixture::{NamedTempFile, TempDir}; use assert_fs::prelude::*; use predicates::prelude::*; use std::path::PathBuf; +use std::io::Read; // since those are just needed for the tests, I didn't put it in src. Therefore, using this module is not detected and dead_code warnings issued. #[derive()] @@ -124,3 +125,31 @@ pub fn verify_file_contents(test_file: &PathBuf, reference_file: &PathBuf) -> Re )) } } + + +// Function to compare two files, used to test if the program output matches the reference. 
+#[allow(dead_code)] +pub fn verify_file_binary(test_file: &PathBuf, reference_file: &PathBuf) -> Result { + + let mut test_file_buf: Vec = Vec::new(); + let mut reference_file_buf: Vec = Vec::new(); + + let mut test_file_handle = std::fs::File::open(&test_file) + .map_err(|err| anyhow!("Failed to read test file: {}", err))?; + let mut reference_file_handle = std::fs::File::open(&reference_file) + .map_err(|err| anyhow!("Failed to read reference file: {}", err))?; + + test_file_handle.read_to_end(&mut test_file_buf)?; + reference_file_handle.read_to_end(&mut reference_file_buf)?; + + if test_file_buf == reference_file_buf { + Ok(true) + } else { + Err(anyhow!( + "{} and {} did not match!", + reference_file.file_name().unwrap().to_string_lossy(), + test_file.file_name().unwrap().to_string_lossy() + )) + } +} + diff --git a/tests/integration_tests_external_filecontents.rs b/tests/integration_tests_external_filecontents.rs index 8ba514a..8178a8c 100644 --- a/tests/integration_tests_external_filecontents.rs +++ b/tests/integration_tests_external_filecontents.rs @@ -1,5 +1,5 @@ use assert_fs::prelude::*; -use auxiliary::verify_file_contents; +use auxiliary::{verify_file_binary,verify_file_contents}; use predicates::prelude::*; use std::error::Error; @@ -33,6 +33,33 @@ fn testing_file_verification_fails() { } // Yep, verify_file_contents() does its job. Ready to rumble! +// Do the same for binary files. 
+ +#[test] +fn testing_file_comparison_succeeds() -> TestResult { + let (mut _cmd, temp_dir, test_files, _test_output) = auxiliary::setup_integration_test(false); + + // the same file should be identical + verify_file_binary(&test_files.read1, &test_files.read1)?; + + temp_dir.close()?; + Ok(()) +} + +#[test] +#[should_panic(expected = "read2.fq and read1.fq did not match!")] +fn testing_file_comparison_fails() { + let (mut _cmd, temp_dir, test_files, _test_output) = auxiliary::setup_integration_test(false); + + // the same file should be identical + verify_file_binary(&test_files.read1, &test_files.read2).unwrap(); + + temp_dir.close().unwrap(); +} + +// Yep, verify_file_contents() does its job. Ready to rumble! + + #[test] fn external_produces_correct_output() -> TestResult { From 8d00d87c4ef4acdb3a23837cd11fc5226bbf424d Mon Sep 17 00:00:00 2001 From: Matthias Zepper Date: Wed, 22 May 2024 20:07:14 +0200 Subject: [PATCH 04/15] Test for compressed output files as well. --- tests/auxiliary.rs | 6 ++- ...integration_tests_external_filecontents.rs | 38 ++++++++++++++++++ tests/results/correct_read1.fq.gz | Bin 0 -> 647 bytes tests/results/correct_read2.fq.gz | Bin 0 -> 682 bytes 4 files changed, 43 insertions(+), 1 deletion(-) create mode 100644 tests/results/correct_read1.fq.gz create mode 100644 tests/results/correct_read2.fq.gz diff --git a/tests/auxiliary.rs b/tests/auxiliary.rs index b889524..54cd09b 100644 --- a/tests/auxiliary.rs +++ b/tests/auxiliary.rs @@ -32,6 +32,8 @@ pub struct TestOutput { // Struct to hold the paths to validated output files. 
pub correct_read1: PathBuf, pub correct_read2: PathBuf, + pub compressed_correct_read1: PathBuf, + pub compressed_correct_read2: PathBuf, pub corrected_read1: PathBuf, pub corrected_read2: PathBuf, pub delim_underscore_read1: PathBuf, @@ -65,7 +67,7 @@ pub fn setup_integration_test( std::env::current_dir() .expect("Failed to get directory") .join("./tests/results"), - &["*.fq"], + &["*.fq", "*.gz"], ) .expect("Failed to copy result data to temporary directory."); }; @@ -90,6 +92,8 @@ pub fn setup_integration_test( let temp = TestOutput { correct_read1: temp_dir.path().join("correct_read1.fq"), correct_read2: temp_dir.path().join("correct_read2.fq"), + compressed_correct_read1: temp_dir.path().join("correct_read1.fq.gz"), + compressed_correct_read2: temp_dir.path().join("correct_read2.fq.gz"), corrected_read1: temp_dir.path().join("corrected_read1.fq"), corrected_read2: temp_dir.path().join("corrected_read2.fq"), delim_underscore_read1: temp_dir.path().join("delim_underscore_read1.fq"), diff --git a/tests/integration_tests_external_filecontents.rs b/tests/integration_tests_external_filecontents.rs index 8178a8c..39bb68b 100644 --- a/tests/integration_tests_external_filecontents.rs +++ b/tests/integration_tests_external_filecontents.rs @@ -213,3 +213,41 @@ fn external_switch_umi_and_read2() -> TestResult { temp_dir.close()?; Ok(()) } + +#[test] +fn external_produces_correct_compressed_output() -> TestResult { + let (mut cmd, temp_dir, test_files, test_output) = auxiliary::setup_integration_test(true); + cmd.arg("external") + .arg("--in") + .arg(test_files.read1) + .arg("--in2") + .arg(test_files.read2) + .arg("--umi") + .arg(test_files.umi) + .arg("--gzip"); + + cmd.assert().success(); //further assertions have been tested in other tests + + temp_dir + .child("read1_with_UMIs.fq.gz") + .assert(predicate::path::exists()); + + temp_dir + .child("read2_with_UMIs.fq.gz") + .assert(predicate::path::exists()); + + let reference = test_output.unwrap(); + + 
verify_file_binary( + &temp_dir.child("read1_with_UMIs.fq.gz").to_path_buf(), + &reference.compressed_correct_read1, + )?; + + verify_file_binary( + &temp_dir.child("read2_with_UMIs.fq.gz").to_path_buf(), + &reference.compressed_correct_read2, + )?; + + temp_dir.close()?; + Ok(()) +} \ No newline at end of file diff --git a/tests/results/correct_read1.fq.gz b/tests/results/correct_read1.fq.gz new file mode 100644 index 0000000000000000000000000000000000000000..cfbfbacce36c207c2a45fa61ce2eb42a7d95fc81 GIT binary patch literal 647 zcmV;20(ku&iwFP!00000|IL=cP8&fGMDPBJNI7!Z@!AN_iB2mnArUV0|9?yLs_jTQ zutW|>SoX3W&kX7}Rn`0WnorM9=bz8$$1?6`W+@bz z-^P5I%T!inNUFOCsY0pRMN}$zhpHli@hWBbK2=g?Rh0OhSy9SNuVYOr8TU8wr+6_? zm0l(7?sn(z=g)ub9=GxAkH?46ct8`S*v119K?6iEfQo{}aEUT5Rd65a9TFPg^N36| zR4h=5)pc2D(2D$6Jo9Ee>sXfi#zU!;yp2c2X=j-PBY*UQCld;fzqn5EP_JWk+Q8ShY>*p4{i*`1Z^(N z%72Xd&+M6RkIcWnzKdu7uh<>5vX~1ILXBu6b*xIZf1-qi5kk&v%X!!C1;pvx=Np zo$qXnp1^byowp&}PUmnSnAb5a1%Y)% zj>A}db*lS6(#ad?oIjq#-3ULnZEaX?_Zv1d8?!SD008B6JRtx8 literal 0 HcmV?d00001 diff --git a/tests/results/correct_read2.fq.gz b/tests/results/correct_read2.fq.gz new file mode 100644 index 0000000000000000000000000000000000000000..4dcaf7638cd3bf1148dcfd2d26b0b9ff0e54600e GIT binary patch literal 682 zcmV;b0#*GViwFP!00000|IL=oP9rf8gztHZy|agLCW#WC=xr@6q%0Tu{hwv^RXd0c zFay~OM#}ha9Q?Veocm{gJU>4CKHIOppQip4kI#Aj(_3$CYQ3FKr(JJTJ9PI93v>T5 z_oqJf@JzRe$UC!$wD8D{rCXYrdAO&iS$afzFxQ=aEfYykP1Fk-b6yWOrtp2~G*TZL z)4Ca(yUoMRrN3$PxBXQ%_se*;`~9&M9?Ni9|y^YrC(M0OKla;_Bd}Zc<_RSEt@OYYY;3> zAS*#&0x0<@6B&`c;-oMFA<|Li@LBx|_e&5(a6H6Xd?uRv*N6Mh>yFiEp5?8%^&yE| z77ts6j4mmhMB(xxiujXXl8q{fsMWQ`T0pE|uPaSb8h+y20fyTCp>tyb^~H99TBWnw z5zKSzY+8}9oII+^11Lrpoy4G9`z3hA1c6*`BB?~K;^}&kRN_-nb+MLMewXCgZ+EAn zqf|y3D|9pmDaal5q~;VUK%7VV7$7QDq#15|7F9l18b|4m{GyZb+v)V7iqB16qtm9{ zd>q!&>BU;`U?Zh!ox3)vAB7w}x~j+_m81y`$c~yN+c8>CxeO219mfPv-GZkREZ4sW QS8TcY3*-h8Ml%Zl0J8f>VgLXD literal 0 HcmV?d00001 From f774bf1320082518114101523be5707df4775bcd Mon Sep 17 00:00:00 2001 From: Matthias 
Zepper Date: Wed, 22 May 2024 20:24:16 +0200 Subject: [PATCH 05/15] Create test for compression level setting. --- tests/auxiliary.rs | 4 + ...integration_tests_external_filecontents.rs | 80 ++++++++++++++++++ tests/results/correct_read1_lvl9.fq.gz | Bin 0 -> 650 bytes tests/results/correct_read2_lvl9.fq.gz | Bin 0 -> 679 bytes 4 files changed, 84 insertions(+) create mode 100644 tests/results/correct_read1_lvl9.fq.gz create mode 100644 tests/results/correct_read2_lvl9.fq.gz diff --git a/tests/auxiliary.rs b/tests/auxiliary.rs index 54cd09b..1a39d48 100644 --- a/tests/auxiliary.rs +++ b/tests/auxiliary.rs @@ -34,6 +34,8 @@ pub struct TestOutput { pub correct_read2: PathBuf, pub compressed_correct_read1: PathBuf, pub compressed_correct_read2: PathBuf, + pub more_compressed_correct_read1: PathBuf, + pub more_compressed_correct_read2: PathBuf, pub corrected_read1: PathBuf, pub corrected_read2: PathBuf, pub delim_underscore_read1: PathBuf, @@ -94,6 +96,8 @@ pub fn setup_integration_test( correct_read2: temp_dir.path().join("correct_read2.fq"), compressed_correct_read1: temp_dir.path().join("correct_read1.fq.gz"), compressed_correct_read2: temp_dir.path().join("correct_read2.fq.gz"), + more_compressed_correct_read1: temp_dir.path().join("correct_read1_lvl9.fq.gz"), + more_compressed_correct_read2: temp_dir.path().join("correct_read2_lvl9.fq.gz"), corrected_read1: temp_dir.path().join("corrected_read1.fq"), corrected_read2: temp_dir.path().join("corrected_read2.fq"), delim_underscore_read1: temp_dir.path().join("delim_underscore_read1.fq"), diff --git a/tests/integration_tests_external_filecontents.rs b/tests/integration_tests_external_filecontents.rs index 39bb68b..ef2d170 100644 --- a/tests/integration_tests_external_filecontents.rs +++ b/tests/integration_tests_external_filecontents.rs @@ -248,6 +248,86 @@ fn external_produces_correct_compressed_output() -> TestResult { &reference.compressed_correct_read2, )?; + temp_dir.close()?; + Ok(()) +} + +#[test] +fn 
external_produces_correct_compressed_output_mod_compression_level() -> TestResult { + let (mut cmd, temp_dir, test_files, test_output) = auxiliary::setup_integration_test(true); + cmd.arg("external") + .arg("--in") + .arg(test_files.read1) + .arg("--in2") + .arg(test_files.read2) + .arg("--umi") + .arg(test_files.umi) + .arg("--compression_level") + .arg("9") + .arg("--gzip"); + + cmd.assert().success(); //further assertions have been tested in other tests + + temp_dir + .child("read1_with_UMIs.fq.gz") + .assert(predicate::path::exists()); + + temp_dir + .child("read2_with_UMIs.fq.gz") + .assert(predicate::path::exists()); + + let reference = test_output.unwrap(); + + verify_file_binary( + &temp_dir.child("read1_with_UMIs.fq.gz").to_path_buf(), + &reference.more_compressed_correct_read1, + )?; + + verify_file_binary( + &temp_dir.child("read2_with_UMIs.fq.gz").to_path_buf(), + &reference.more_compressed_correct_read2, + )?; + + temp_dir.close()?; + Ok(()) +} + +#[test] +fn external_produces_correct_compressed_output_thread_limit() -> TestResult { + let (mut cmd, temp_dir, test_files, test_output) = auxiliary::setup_integration_test(true); + cmd.arg("external") + .arg("--in") + .arg(test_files.read1) + .arg("--in2") + .arg(test_files.read2) + .arg("--umi") + .arg(test_files.umi) + .arg("--threads") + .arg("3") + .arg("--gzip"); + + cmd.assert().success(); //further assertions have been tested in other tests + + temp_dir + .child("read1_with_UMIs.fq.gz") + .assert(predicate::path::exists()); + + temp_dir + .child("read2_with_UMIs.fq.gz") + .assert(predicate::path::exists()); + + let reference = test_output.unwrap(); + + verify_file_binary( + &temp_dir.child("read1_with_UMIs.fq.gz").to_path_buf(), + &reference.compressed_correct_read1, + )?; + + verify_file_binary( + &temp_dir.child("read2_with_UMIs.fq.gz").to_path_buf(), + &reference.compressed_correct_read2, + )?; + temp_dir.close()?; Ok(()) } \ No newline at end of file diff --git 
a/tests/results/correct_read1_lvl9.fq.gz b/tests/results/correct_read1_lvl9.fq.gz new file mode 100644 index 0000000000000000000000000000000000000000..91af50ee93816267cc6c2593f4002ac97f44942e GIT binary patch literal 650 zcmV;50(Jc#iwFP!00002|IL=oZsRZvgztWe0(nyqr5F?OfTa=?u!=Yb3{BpgVe_!pj9s0h#$K@lhZ*6G9&{wl8%iOAWZ0;Er=KeIa z_tv-YOt*;0XRA|Mcw}dm9*&~)d^?tJP6qDbIPCxsMm943Vn z$BF|oL`g4_SrG!T`Ug;Cz2-sGGY!YF7apv!HrQhyPcR9%|41%T(ho}AhPWyU!6VG^>#tn~C;C4Uc1NKD_N)9z5X=J_Z(ltL{jZfHl@;4~^%nPF~mc`V~B>nzyD zNLKC;dq|6#V}>Dk%T(6?kSrg?c@E>ehz{3+?YpHNErJ@NZfMd_9-LRu_^Bh8b%jzz z>#)=$%0lc1j6G#j!hUx;?ceAO{XCu4)WSmXnYNE-D@!a>mg~1*%8cnVEa*;?L8h?4 k8$B&(Ek@IUc9?w5%akFh5dLi0+DWmI!voi|-02O{dv;Y7A literal 0 HcmV?d00001 diff --git a/tests/results/correct_read2_lvl9.fq.gz b/tests/results/correct_read2_lvl9.fq.gz new file mode 100644 index 0000000000000000000000000000000000000000..86bcea9605daba67557196085aad43e2a2dd36cf GIT binary patch literal 679 zcmV;Y0$BYYiwFP!00002|IL<5ZsR}L_VGfpWWnYt9AZou@MII-|185QwgOB% zk!&Z62!THRi}+Mk?Uy$@U(ToB7yTvsu9H_kzs2Q`^wRfTo{q=kD0w%DS-85IJq_|I zow!Aqx_dlpUP9eGHkxgrd6?$}%iFN*%&r`3hITHK=WMBgoWMR)4pR)l|I2y#5InJu z>xOThH>ZaWxq6W6_Bu8%%XoJC{h=>BK!f*Q!J{aQXn@6AWf+hc@?@wsk_4$_Pu>S+ zlS&(wE!h;nmWYA1Ez*d83C|tJ8lIu=x~cHw8{^Zh;(-kKp&fe25;2OO-Ff3Im*OZf z<%}vGeYi)9I&Vk@D_L8FdTV&(8vw{>vbqt^wCf5F3SyOuc;E-8duSkv2UXEGoRRV% zH9S)|QN7xX2U%e{#Y+2mUEsAOhsU3zp6NI=b0Gxqu8Vk3D(Xf3#1n)?MbRk~1$~7q z;}YgAdtgW_q@lGU(KsGUmPWq7^WVS_`8-J06(sK$xe?FqFzjYL@It=kvbh|!x-(J? zbRL#~R%b$`w``vgfI#wTj zl@7-k_v`xZZ8J9SOaX?w@uH9CDa4u@tf zem!3c9<8{Uzq?fq@yA_lKovPgw;AQrf{LSPJM`Ao>K6dFdyqAn@u6)?5G)^m53bm9 N^B3d>5=Ju%007C)NlE|! literal 0 HcmV?d00001 From ec8852cf2d2b68b0215730b4071f4d8163361d28 Mon Sep 17 00:00:00 2001 From: Matthias Zepper Date: Wed, 22 May 2024 21:22:45 +0200 Subject: [PATCH 06/15] Readme updates to accomodate the new multi-threaded output compression. 
--- README.md | 122 ++++++++++++++++++++------------------------ src/umi_external.rs | 2 +- 2 files changed, 57 insertions(+), 67 deletions(-) diff --git a/README.md b/README.md index 8292e3c..728ddc1 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ - [Background on Unique Molecular Identifiers](#background) - [Installing `umi-transfer`](#installation) - [Using `umi-transfer` to integrate UMIs](#usage) -- [Improving performance with external multi-threaded compression](#high-performance-guide) +- [Chaining with other software](#chaining-with-other-software) - [Contributing bugfixes and new features](#contribution-guide-for-developers)
@@ -94,132 +94,122 @@ That should create an executable `target/release/umi-transfer` that can be place ```shell ./target/release/umi-transfer --version -umi-transfer 1.0.0 +umi-transfer 1.5.0 ``` ## Usage ->### Performance Note -> ->The decompression and compression used within umi-transfer is single-threaded, so to get the most reads per minute performance, see the [high performance guide](#high-performance-guide) +The tool requires three FastQ files as input. You can manually specify the names and location of the output files with `--out` and `--out2` or the tool will automatically append a `with_UMI` suffix to your input file names. It additionally accepts to choose a custom UMI delimiter with `--delim` and to set the flags `-f`, `-c` and `-z`. -The tool requires three FastQ files as input. You can manually specify the names and location of the output files with `--out` and `--out2` or the tool will append a `with_UMI` suffix to your input file names as output. It additionally accepts to choose a custom UMI delimiter with `--delim` and to set the flags `-f`, `-c` and `-z`. - -`-c` is used to ensure the canonical `1` and `2` of paired files as read numbers in the output, regardless of the read numbers of the input reads. `-f` / `--force` will overwrite existing output files without prompting the user and `-c` enables the internal single-threaded compression of the output files. Alternatively, you can also specify an output file name with `.gz` suffix to obtain compressed output. +`-c` is used to ensure the canonical `1` and `2` of paired files as read numbers in the output, regardless of the read numbers of the input reads. `-f` / `--force` will overwrite existing output files without prompting the user and `-z` enables the internal compression of the output files. Alternatively, you can also specify an output file name with `.gz` suffix to obtain compressed output. 
```raw $ umi-transfer external --help - umi-transfer-external + + Integrate UMIs from a separate FastQ file -USAGE: - umi-transfer external [OPTIONS] --in --in2 --umi +Usage: umi-transfer external [OPTIONS] --in --in2 --umi -OPTIONS: - -c, --correct_numbers Read numbers will be altered to ensure the canonical read numbers 1 and 2 in output file sequence headers. +Options: + -c, --correct_numbers + Read numbers will be altered to ensure the canonical read numbers 1 and 2 in output file sequence headers. - -d, --delim Delimiter to use when joining the UMIs to the read name. Defaults to `:`. - -f, --force Overwrite existing output files without further warnings or prompts. + -z, --gzip + Compress output files. Turned off by default. - -h, --help Print help information - --in [REQUIRED] Input file 1 with reads. + -l, --compression_level + Choose the compression level: Maximum 9, defaults to 3. Higher numbers result in smaller files but take longer to compress. - --in2 [REQUIRED] Input file 2 with reads. + -t, --threads + Number of threads to use for processing. Defaults to the number of logical cores available. - --out Path to FastQ output file for R1. + -f, --force + Overwrite existing output files without further warnings or prompts. - --out2 Path to FastQ output file for R2. + -d, --delim + Delimiter to use when joining the UMIs to the read name. Defaults to `:`. - -u, --umi [REQUIRED] Input file with UMI. - -z, --gzip Compress output files. By default, turned off in favour of external compression. -``` + --in + [REQUIRED] Input file 1 with reads. -### Example -A run with just the mandatory arguments may look like this: + --in2 + [REQUIRED] Input file 2 with reads. -```shell -umi-transfer external -fz -d '_' --in 'R1.fastq' --in2 'R3.fastq' --umi 'R2.fastq' -``` -`umi-transfer` warrants paired input files. To run on singletons, use the same input twice and redirect one output to `/dev/null`: + -u, --umi + [REQUIRED] Input file with UMI. 
-```shell -umi-transfer external --in read1.fastq --in2 read1.fastq --umi read2.fastq --out output1.fastq --out2 /dev/null -``` -### High Performance Guide + --out + Path to FastQ output file for R1. -The performance bottleneck of UMI integration is output file compression. [Parallel Gzip](https://github.com/madler/pigz) can be used on modern multi-processor, multi-core machines to significantly outclass the single-threaded compression that ships with `umi-transfer`. -We recommend using Unix FIFOs (First In, First Out buffered pipes) to combine `umi-transfer` and `pigz` on GNU/Linux and MacOS operating systems: + --out2 + Path to FastQ output file for R2. -```shell -mkfifo read1.fastq -mkfifo read2.fastq -mkfifo read3.fastq + + -h, --help + Print help + -V, --version + Print version ``` -Assuming your compressed input files are called `read1.fastq.gz` and `read2.fastq.gz` and `read3.fastq.gz`, each can be linked to its respective FIFO like so: +### Example + +A typical run may look like this: ```shell -$ pigz -dc read1.fastq.gz > read1.fastq & -[1] 233387 -$ pigz -dc read2.fastq.gz > read2.fastq & -[2] 233388 -$ pigz -dc read3.fastq.gz > read3.fastq & -[3] 233389 +umi-transfer external -fz -d '_' --in 'R1.fastq' --in2 'R3.fastq' --umi 'R2.fastq' ``` -Note the trailing `&` to leave these processes running in the background. Since multi-threading is hardly helpful for decompression, you could also use `zcat` or `gzip -dc` instead of `pigz -dc` here. - -We can inspect the directory with `ls` to list the compressed files and the created FIFOs: +`umi-transfer` warrants paired input files. To run on singletons, use the same input twice and redirect one output to `/dev/null`: ```shell -$ ls -lh -total 1.5K --rw-rw----. 1 alneberg ngisweden 4.5G Apr 13 12:18 read1.fastq.gz --rw-rw----. 1 alneberg ngisweden 1.1G Apr 13 12:18 read2.fastq.gz --rw-rw----. 1 alneberg ngisweden 4.5G Apr 13 12:18 read3.fastq.gz -prw-rw-r--. 
1 alneberg ngisweden 0 Apr 13 12:46 read1.fastq -prw-rw-r--. 1 alneberg ngisweden 0 Apr 13 12:46 read2.fastq -prw-rw-r--. 1 alneberg ngisweden 0 Apr 13 12:46 read3.fastq +umi-transfer external --in read1.fastq --in2 read1.fastq --umi read2.fastq --out output1.fastq --out2 /dev/null ``` -We continue to create FIFOs for the output files: +### Chaining with other software + +`umi-transfer` cannot be used with the pipe operator, because it neither supports writing output to `stdout` nor reading input from `stdin`. However, FIFOs (_First In, First Out buffered pipes_) can be used to elegantly combine `umi-transfer` with other software on GNU/Linux and MacOS operating systems. + +For example, we may want to use external compression software like [Parallel Gzip](https://github.com/madler/pigz) together with `umi-transfer`. For this purpose, it would be unfavorable to write the data uncompressed to disk before compressing it. Instead, we create named pipes with `mkfifo`, which can be provided to `umi-transfer` as if they were regular output file paths. In reality, the data is directly passed on to `pigz` via a buffered stream. + +First, the named pipes are created: ```shell -mkfifo output1.fastq -mkfifo output2.fastq +mkfifo output1 +mkfifo output2 ``` -and set-up a multi-threaded `pigz` compression process each: +Then a multi-threaded `pigz` compression is tied to the FIFO. Note the trailing `&` to leave these processes running in the background. ```shell -$ pigz -p 10 -c > output1.fastq.gz < output1.fastq & +$ pigz -p 10 -c > output1.fastq.gz < output1 & [4] 233394 -$ pigz -p 10 -c > output2.fastq.gz < output2.fastq & +$ pigz -p 10 -c > output2.fastq.gz < output2 & [5] 233395 ``` The argument `-p 10` specifies the number of threads that each `pigz` processes may use. The optimal setting is hardware-specific and will require some testing. 
-Finally, we can then run `umi-transfer` using the FIFOs like so: +Finally, we can run `umi-transfer` using the FIFOs as output paths: ```shell -umi-transfer external --in read1.fastq --in2 read3.fastq --umi read2.fastq --out output1.fastq --out2 output2.fastq +umi-transfer external --in read1.fastq --in2 read3.fastq --umi read2.fastq --out output1 --out2 output2 ``` It's good practice to remove the FIFOs after the program has finished: ```shell -rm read1.fastq read2.fastq read3.fastq output1.fastq output2.fastq +rm output1.fastq output2.fastq ``` ## Contribution guide for developers diff --git a/src/umi_external.rs b/src/umi_external.rs index 45c028b..ea17c97 100644 --- a/src/umi_external.rs +++ b/src/umi_external.rs @@ -18,7 +18,7 @@ pub struct OptsExternal { #[clap( short = 'z', long = "gzip", - help = "Compress output files. By default, turned off in favour of external compression. + help = "Compress output files. Turned off by default. \n " )] gzip: bool, From 690e31f7e741db3964d87f59a4133130096137b6 Mon Sep 17 00:00:00 2001 From: Matthias Zepper Date: Wed, 22 May 2024 21:23:33 +0200 Subject: [PATCH 07/15] Code formatting. 
--- .github/workflows/container.yml | 4 ++-- .github/workflows/testing.yml | 2 +- tests/auxiliary.rs | 5 +---- tests/integration_tests_external_filecontents.rs | 6 ++---- 4 files changed, 6 insertions(+), 11 deletions(-) diff --git a/.github/workflows/container.yml b/.github/workflows/container.yml index ead90ef..a233b27 100644 --- a/.github/workflows/container.yml +++ b/.github/workflows/container.yml @@ -27,13 +27,13 @@ jobs: echo "REPOTITLE_LOWERCASE=$(basename ${GITHUB_REPOSITORY,,})" >> ${GITHUB_ENV} - name: Log in to Docker Hub - uses: docker/login-action@v2 + uses: docker/login-action@v3 with: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_TOKEN }} - name: Login to GitHub Container Registry - uses: docker/login-action@v2 + uses: docker/login-action@v3 with: registry: ghcr.io username: ${{ github.repository_owner }} diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml index 9c08743..04b3d46 100644 --- a/.github/workflows/testing.yml +++ b/.github/workflows/testing.yml @@ -88,7 +88,7 @@ jobs: continue-on-error: true - name: Create an artifact from clippy results - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: ClippyResults path: rust-clippy-results.sarif diff --git a/tests/auxiliary.rs b/tests/auxiliary.rs index 1a39d48..936e100 100644 --- a/tests/auxiliary.rs +++ b/tests/auxiliary.rs @@ -3,8 +3,8 @@ use assert_cmd::Command; use assert_fs::fixture::{NamedTempFile, TempDir}; use assert_fs::prelude::*; use predicates::prelude::*; -use std::path::PathBuf; use std::io::Read; +use std::path::PathBuf; // since those are just needed for the tests, I didn't put it in src. Therefore, using this module is not detected and dead_code warnings issued. #[derive()] @@ -134,11 +134,9 @@ pub fn verify_file_contents(test_file: &PathBuf, reference_file: &PathBuf) -> Re } } - // Function to compare two files, used to test if the program output matches the reference. 
#[allow(dead_code)] pub fn verify_file_binary(test_file: &PathBuf, reference_file: &PathBuf) -> Result { - let mut test_file_buf: Vec = Vec::new(); let mut reference_file_buf: Vec = Vec::new(); @@ -160,4 +158,3 @@ pub fn verify_file_binary(test_file: &PathBuf, reference_file: &PathBuf) -> Resu )) } } - diff --git a/tests/integration_tests_external_filecontents.rs b/tests/integration_tests_external_filecontents.rs index ef2d170..eb32ad1 100644 --- a/tests/integration_tests_external_filecontents.rs +++ b/tests/integration_tests_external_filecontents.rs @@ -1,5 +1,5 @@ use assert_fs::prelude::*; -use auxiliary::{verify_file_binary,verify_file_contents}; +use auxiliary::{verify_file_binary, verify_file_contents}; use predicates::prelude::*; use std::error::Error; @@ -59,8 +59,6 @@ fn testing_file_comparison_fails() { // Yep, verify_file_contents() does its job. Ready to rumble! - - #[test] fn external_produces_correct_output() -> TestResult { let (mut cmd, temp_dir, test_files, test_output) = auxiliary::setup_integration_test(true); @@ -330,4 +328,4 @@ fn external_produces_correct_compressed_output_thread_limit() -> TestResult { temp_dir.close()?; Ok(()) -} \ No newline at end of file +} From e058ec6440bf701af2dfd6f3874a635601e85843 Mon Sep 17 00:00:00 2001 From: Matthias Zepper Date: Wed, 22 May 2024 21:52:49 +0200 Subject: [PATCH 08/15] Bump version in main. 
--- src/main.rs | 4 ++-- src/umi_external.rs | 2 +- tests/auxiliary.rs | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/main.rs b/src/main.rs index d8e0598..8b09ced 100644 --- a/src/main.rs +++ b/src/main.rs @@ -25,8 +25,8 @@ https://github.com/SciLifeLab/umi-transfer #[derive(clap::Parser)] #[clap( - version = "1.5.0dev", - author = "Written by Judit Hohenthal, Matthias Zepper & Johannes Alneberg", + version = "1.5.0", + author = "Written by Matthias Zepper, Judit Hohenthal & Johannes Alneberg", about = "A tool for transferring Unique Molecular Identifiers (UMIs).", long_about = "Most tools capable of using UMIs to increase the accuracy of quantitative DNA sequencing experiments expect the respective UMI sequence to be embedded into the reads' IDs. You can use `umi-transfer external` to retrieve UMIs from a separate FastQ file and embed them to the IDs of your paired FastQ files." )] diff --git a/src/umi_external.rs b/src/umi_external.rs index ea17c97..21ca7f1 100644 --- a/src/umi_external.rs +++ b/src/umi_external.rs @@ -101,7 +101,7 @@ pub fn run(args: OptsExternal) -> Result { } // Set the number of threads to max, unless manually specified. In case of failure, use only 1. - let num_threads = args.num_threads.unwrap_or_else(|| threads_available()); + let num_threads = args.num_threads.unwrap_or_else(threads_available); // Determine the number of threads available for output file compression. 
let threads_per_task = threads_per_task(num_threads, 2); diff --git a/tests/auxiliary.rs b/tests/auxiliary.rs index 936e100..13ab695 100644 --- a/tests/auxiliary.rs +++ b/tests/auxiliary.rs @@ -140,9 +140,9 @@ pub fn verify_file_binary(test_file: &PathBuf, reference_file: &PathBuf) -> Resu let mut test_file_buf: Vec = Vec::new(); let mut reference_file_buf: Vec = Vec::new(); - let mut test_file_handle = std::fs::File::open(&test_file) + let mut test_file_handle = std::fs::File::open(test_file) .map_err(|err| anyhow!("Failed to read test file: {}", err))?; - let mut reference_file_handle = std::fs::File::open(&reference_file) + let mut reference_file_handle = std::fs::File::open(reference_file) .map_err(|err| anyhow!("Failed to read reference file: {}", err))?; test_file_handle.read_to_end(&mut test_file_buf)?; From 2e575a79d980db3ec2f71ab587f9266d7ebd6073 Mon Sep 17 00:00:00 2001 From: Matthias Zepper Date: Thu, 23 May 2024 15:13:24 +0200 Subject: [PATCH 09/15] Bump Docker build action. --- .github/workflows/container.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/container.yml b/.github/workflows/container.yml index a233b27..cf3dc02 100644 --- a/.github/workflows/container.yml +++ b/.github/workflows/container.yml @@ -40,7 +40,7 @@ jobs: password: ${{ secrets.GITHUB_TOKEN }} - name: Push dev image - uses: docker/build-push-action@v3 + uses: docker/build-push-action@v5 if: github.event_name == 'push' with: push: true @@ -49,7 +49,7 @@ jobs: ghcr.io/${{ env.REPO_LOWERCASE }}:dev - name: Push release image - uses: docker/build-push-action@v3 + uses: docker/build-push-action@v5 if: github.event_name == 'release' with: push: true From d568f01dfc8ee06c6741aa38b17daaa75f279fae Mon Sep 17 00:00:00 2001 From: Matthias Zepper Date: Fri, 24 May 2024 21:40:26 +0200 Subject: [PATCH 10/15] Update Debian distro in Docker image from bullseye to bookworm (stable).
--- Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index de1b22b..3f8a522 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM rust:latest as buildenv +FROM rust:bookworm as buildenv WORKDIR /usr/app/src COPY ./ /usr/app/src @@ -11,7 +11,7 @@ RUN --mount=type=cache,target=/usr/local/cargo/registry \ --mount=type=cache,target=/rust/target \ cargo build --release -FROM debian:bullseye-slim as runner +FROM debian:bookworm-slim as runner WORKDIR /root COPY --from=buildenv /usr/app/src/target/release/ /usr/local/bin/ RUN chmod 755 /usr/local/bin/umi-transfer From caaaf2bb663d00181a8cd6a8b5c492855c25a90f Mon Sep 17 00:00:00 2001 From: Matthias Zepper Date: Fri, 24 May 2024 21:49:30 +0200 Subject: [PATCH 11/15] Push dev container image also when workflow is manually dispatched. --- .github/workflows/container.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/container.yml b/.github/workflows/container.yml index cf3dc02..39062f7 100644 --- a/.github/workflows/container.yml +++ b/.github/workflows/container.yml @@ -41,7 +41,7 @@ jobs: - name: Push dev image uses: docker/build-push-action@v5 - if: github.event_name == 'push' + if: github.event_name == 'push' || github.event_name == 'workflow_dispatch' with: push: true tags: | From 152c1b4b568b584c33d2ea5986f41cca864bbfb8 Mon Sep 17 00:00:00 2001 From: Matthias Zepper Date: Mon, 27 May 2024 20:04:55 +0200 Subject: [PATCH 12/15] Added benchmark results to readme. 
--- README.md | 19 + docs/img/benchmark_umi-transfer-threads.svg | 616 ++++++++++++++ docs/img/benchmark_umi-transfer-version.svg | 877 ++++++++++++++++++++ 3 files changed, 1512 insertions(+) create mode 100644 docs/img/benchmark_umi-transfer-threads.svg create mode 100644 docs/img/benchmark_umi-transfer-version.svg diff --git a/README.md b/README.md index 728ddc1..2460b72 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,7 @@ - [Background on Unique Molecular Identifiers](#background) - [Installing `umi-transfer`](#installation) - [Using `umi-transfer` to integrate UMIs](#usage) +- [Benchmarks and parameter recommendations](#benchmarks-and-parameter-recommendations) - [Chaining with other software](#chaining-with-other-software) - [Contributing bugfixes and new features](#contribution-guide-for-developers) @@ -176,6 +177,24 @@ umi-transfer external -fz -d '_' --in 'R1.fastq' --in2 'R3.fastq' --umi 'R2.fast umi-transfer external --in read1.fastq --in2 read1.fastq --umi read2.fastq --out output1.fastq --out2 /dev/null ``` +### Benchmarks and parameter recommendations + +A known shortcoming of version 1.0 of `umi-transfer` was the purely single-threaded output file compression, which significantly slowed down the tool. To mitigate this, we recommended using FIFOs and piping the uncompressed output to a dedicated compression tool like [`pigz`](https://github.com/madler/pigz). + +With the release of version 1.5, `umi-transfer` features internal multi-threaded output compression. As a result, `umi-transfer` 1.5 now runs approximately 25 times faster than version 1.0 when using internal compression and about twice as fast compared to using an external compression tool. This improvement is enabled by the outstanding [`gzp` crate](https://github.com/sstadick/gzp), which abstracts a lot of the underlying complexity away from the main software. 
+ +![Benchmark of different tool versions](docs/img/benchmark_umi-transfer-version.svg) + +In our first benchmark using 17 threads, version 1.5 of `umi-transfer` processed approximately 550,000 paired records per second with the default gzip compression level of 3. At the highest compression level of 9, the rate dropped to just below 200,000 records per second. While the exact numbers may vary depending on your storage, file system, and processors, we expect the relative performance rates to remain approximately constant. + +![Benchmark of thread numbers](docs/img/benchmark_umi-transfer-threads.svg) + +In a subsequent benchmark, we tested the effect of increasing the number of threads. For the default compression level, the maximum speed was achieved with 9 to 11 threads. Since umi-transfer writes two output files simultaneously, this configuration allows for 4 to 5 threads per file to handle the output compression. + +Adding more threads per file proved unhelpful, as other steps became the rate-limiting factors. These factors include file system I/O, input file decompression, and the actual editing of the file contents, which now determine the performance of umi-transfer. Only when increasing the compression level to higher settings did adding more threads continue to provide a performance benefit. For the highest compression setting, we did not reach the plateau phase during the benchmark, but it is likely to occur in the range of 53-55 total threads, or about 26 threads per output file. + +**In summary, we recommend running `umi-transfer` with 9 or 11 threads for compression. Odd numbers are favorable as they allow one dedicated main thread, while evenly splitting the remaining threads between the two output files. 
It's important to note that specifying more threads than the available physical or logical cores on your machine will result in a severe performance loss, since `umi-transfer` operates synchronously.** + ### Chaining with other software `umi-transfer` cannot be used with the pipe operator, because it neither supports writing output to `stdout` nor reading input from `stdin`. However, FIFOs (_First In, First Out buffered pipes_) can be used to elegantly combine `umi-transfer` with other software on GNU/Linux and MacOS operating systems. diff --git a/docs/img/benchmark_umi-transfer-threads.svg b/docs/img/benchmark_umi-transfer-threads.svg new file mode 100644 index 0000000..fef09e6 --- /dev/null +++ b/docs/img/benchmark_umi-transfer-threads.svg @@ -0,0 +1,616 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + diff --git a/docs/img/benchmark_umi-transfer-version.svg b/docs/img/benchmark_umi-transfer-version.svg new file mode 100644 index 0000000..82fe93d --- /dev/null +++ b/docs/img/benchmark_umi-transfer-version.svg @@ -0,0 +1,877 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + From 
bf57a713df2defb496492ccf5d5bfe4853f47b3a Mon Sep 17 00:00:00 2001 From: Matthias Zepper Date: Tue, 28 May 2024 15:06:18 +0200 Subject: [PATCH 13/15] Update CLI help text for maximum thread number. --- src/umi_external.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/umi_external.rs b/src/umi_external.rs index 21ca7f1..fec0983 100644 --- a/src/umi_external.rs +++ b/src/umi_external.rs @@ -32,7 +32,7 @@ pub struct OptsExternal { #[clap( short = 't', long = "threads", - help = "Number of threads to use for processing. Defaults to the number of logical cores available. + help = "Maximum number of threads to use for processing. Preferably pick odd numbers, 9 or 11 recommended. Defaults to the maximum number of cores available. \n " )] num_threads: Option, From 35856612c66b23c79ccb95c31c71ee9a43d17544 Mon Sep 17 00:00:00 2001 From: Matthias Zepper Date: Mon, 3 Jun 2024 21:06:47 +0200 Subject: [PATCH 14/15] Small fixes in the Readme. --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 2460b72..9588e18 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,7 @@ This tool efficiently integrates these separate UMIs into the headers and can al ### Binary Installation -Binaries for `umi-transfer` are available for most platforms and can be obtained from the [Releases page on GitHub](https://github.com/SciLifeLab/umi-transfer/releases). Simply navigate to the Releases page and download the appropriate binary of a release for your operating system. Once downloaded, you can place it in a directory of your choice and [optionally add the binary to your system's `$PATH`](https://astrobiomike.github.io/unix/modifying_your_path). +Binaries for `umi-transfer` are available for most platforms and can be obtained from the [_Releases_ page on GitHub](https://github.com/SciLifeLab/umi-transfer/releases). Simply navigate to the releases and download the appropriate binary for your operating system. 
Once downloaded, you can place it in a directory of your choice and [optionally add the binary to your system's `$PATH`](https://astrobiomike.github.io/unix/modifying_your_path). ### Bioconda @@ -85,7 +85,7 @@ alias umi-transfer="docker run -t -v `pwd`:`pwd` -w `pwd` mzscilifelab/umi-trans ### Compile from source -Given that you have [rust installed](https://www.rust-lang.org/tools/install) on your computer, download this repository and run +Given that you have [Rust installed](https://www.rust-lang.org/tools/install) on your computer, clone or download this repository and run ```shell cargo build --release From 4af3d9b65a4ec76c1eac556c4204f9a4137271b5 Mon Sep 17 00:00:00 2001 From: Matthias Zepper <6963520+MatthiasZepper@users.noreply.github.com> Date: Mon, 17 Jun 2024 18:03:37 +0200 Subject: [PATCH 15/15] Update README.md Co-authored-by: Johannes Alneberg --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 9588e18..06208b9 100644 --- a/README.md +++ b/README.md @@ -179,7 +179,6 @@ umi-transfer external --in read1.fastq --in2 read1.fastq --umi read2.fastq --out ### Benchmarks and parameter recommendations -A known shortcoming of version 1.0 of `umi-transfer` was the purely single-threaded output file compression, which significantly slowed down the tool. To mitigate this, we recommended using FIFOs and piping the uncompressed output to a dedicated compression tool like [`pigz`](https://github.com/madler/pigz). With the release of version 1.5, `umi-transfer` features internal multi-threaded output compression. As a result, `umi-transfer` 1.5 now runs approximately 25 times faster than version 1.0 when using internal compression and about twice as fast compared to using an external compression tool. This improvement is enabled by the outstanding [`gzp` crate](https://github.com/sstadick/gzp), which abstracts a lot of the underlying complexity away from the main software.