From a13b57d04a3f3741eedd1af10fd96a9bee126f55 Mon Sep 17 00:00:00 2001 From: Dries Schaumont <5946712+DriesSchaumont@users.noreply.github.com> Date: Tue, 26 Nov 2024 15:01:32 +0100 Subject: [PATCH] Add bases2fastq (#167) --- CHANGELOG.md | 2 + src/bases2fastq/config.vsh.yaml | 200 ++++++++++++++++++++++++++++++++ src/bases2fastq/help.txt | 40 +++++++ src/bases2fastq/script.sh | 111 ++++++++++++++++++ src/bases2fastq/test.sh | 200 ++++++++++++++++++++++++++++++++ 5 files changed, 553 insertions(+) create mode 100644 src/bases2fastq/config.vsh.yaml create mode 100644 src/bases2fastq/help.txt create mode 100644 src/bases2fastq/script.sh create mode 100644 src/bases2fastq/test.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index 4c71bb4b..02bf7439 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,6 +29,8 @@ * `sgedemux`: demultiplexing sequencing data generated on Singular Genomics' sequencing instruments (PR #166). +* `bases2fastq`: demultiplexing sequencing data generated by Element Biosciences instruments (PR #167). + ## BUG FIXES * `falco`: Fix a typo in the `--reverse_complement` argument (PR #157). diff --git a/src/bases2fastq/config.vsh.yaml b/src/bases2fastq/config.vsh.yaml new file mode 100644 index 00000000..b50e42fb --- /dev/null +++ b/src/bases2fastq/config.vsh.yaml @@ -0,0 +1,200 @@ +name: bases2fastq +description: | + Bases2Fastq demultiplexes sequencing data generated by Element Biosciences instruments and converts base calls into FASTQ files. 
+keywords: ["demultiplex", "fastq", "demux", "Element Biosciences"] +links: + documentation: https://docs.elembio.io/docs/bases2fastq/introduction/ +license: Proprietary +requirements: + commands: [bases2fastq] +authors: + - __merge__: /src/_authors/dries_schaumont.yaml + roles: [ author, maintainer ] + +argument_groups: + - name: Input + arguments: + - name: "--analysis_directory" + type: file + description: Location of analysis directory + required: true + example: "input/" + - name: --run_manifest + alternatives: [-r] + type: file + description: Location of run manifest to use instead of default RunManifest.csv found in analysis directory + required: false + + - name: Output + arguments: + - name: "--output_directory" + alternatives: ["-o"] + type: file + direction: output + required: true + description: Location to save output fastqs + example: fastq_dir + - name: "--report" + type: file + required: false + direction: output + description: Output location for the HTML report + - name: "--logs" + type: file + direction: output + required: false + description: Directory containing log files + example: logs_dir + - name: Arguments + arguments: + - name: --chemistry_version + type: string + required: false + description: Run parameters override, chemistry version. + - name: "--demux_only" + alternatives: [-d] + type: boolean_true + description: | + Generate demux files and indexing stats without generating FASTQ + - name: "--detect_adapters" + type: boolean_true + description: | + Detect adapter sequences, overriding any sequences present in run manifest. + - name: "--error_on_missing" + type: boolean_true + description: | + Terminate execution for a missing file (by default, missing files are + skipped and execution continues). Also set by --strict. + - name: "--exclude_tile" + alternatives: [-e] + multiple: true + type: string + description: | + Regex matching tile names to exclude. This flag can be specified multiple times. (e.g. L1.*C0[23]S.) 
+ - name: "--filter_mask" + type: string + description: | + Run parameters override, custom pass filter mask. + - name: "--flowcell_id" + type: string + description: | + Run parameters override, flowcell ID. + - name: "--force_index_orientation" + type: boolean_true + description: | + Do not attempt to find orientation for I1/I2 reads (reverse complement). + Use orientation given in run manifest. + - name: "--group_fastq" + type: boolean_true + description: | + Group all FASTQ/stats/metrics for a project in the project folder. + - name: "--i1_cycles" + type: integer + min: 1 + description: | + Run parameters override, I1 cycles. + - name: "--i2_cycles" + type: integer + min: 1 + description: | + Run parameters override, I2 cycles. + - name: "--include_tile" + alternatives: [-i] + type: string + multiple: true + description: | + Regex matching tile names to include. This flag + can be specified multiple times. (e.g. L1.*C0[23]S.) + - name: "--kit_configuration" + type: string + description: | + Run parameters override, kit configuration. + - name: "--legacy_fastq" + type: boolean_true + description: | + Legacy naming for FASTQ files (e.g. SampleName_S1_L001_R1_001.fastq.gz) + - name: "--log_level" + type: string + alternatives: [-l] + choices: ["DEBUG", "INFO", "WARNING", "ERROR"] + description: | + Severity level for logging. + example: INFO + - name: "--no_error_on_invalid" + type: boolean_true + description: | + Skip invalid files and continue execution. Overridden by --strict options + - name: "--no_projects" + type: boolean_true + description: | + Disable project directories + - name: "--num_unassigned" + type: integer + min: 0 + max: 1000 + example: 30 + description: | + Max Number of unassigned sequences to report. + - name: "--preparation_workflow" + type: string + description: | + Run parameters override, preparation workflow. + - name: --qc_only + type: boolean_true + description: | + Quickly generate run stats for single tile without generating FASTQ. 
+ Use --include_tile/--exclude_tile to define custom tile set. + - name: --r1_cycles + type: integer + min: 1 + description: | + Run parameters override, R1 cycles. + - name: --r2_cycles + type: integer + min: 1 + description: | + Run parameters override, R2 cycles. + - name: "--split_lanes" + type: boolean_true + description: | + Split FASTQ files by lane. + - name: --strict + type: boolean_true + description: | + In strict mode any invalid or missing input file will terminate execution + (overrides no_error_on_invalid and sets --error_on_missing) + + # --help, -h Display this usage statement + # --input-remote, NAME Rclone remote name for remote ANALYSIS_DIRECTORY + # --num-threads, -p NUMBER Number of threads (default 1) + # --output-remote, NAME Rclone remote name for remote OUTPUT_DIRECTORY + # --settings SELECTION Run manifest settings override. This option may be specified multiple times. + # --version, -v Display bases2fastq version + # --skip-qc-report SELECTION Do not generate HTML QC report. + +resources: + - type: bash_script + path: script.sh + +test_resources: + - type: bash_script + path: test.sh + +engines: +- type: docker + image: elembio/bases2fastq:2.1.0 + setup: + - type: apt + packages: + - procps + - tree + - type: docker + run: | + echo "bases2fastq: $(bases2fastq --version | cut -d' ' -f3)" > /var/software_versions.txt + test_setup: + - type: apt + packages: curl + +runners: + - type: executable + - type: nextflow diff --git a/src/bases2fastq/help.txt b/src/bases2fastq/help.txt new file mode 100644 index 00000000..2fcadf9b --- /dev/null +++ b/src/bases2fastq/help.txt @@ -0,0 +1,40 @@ + +Usage: bases2fastq [OPTIONS] ANALYSIS_DIRECTORY OUTPUT_DIRECTORY + +positional arguments: + ANALYSIS_DIRECTORY Location of analysis directory + OUTPUT_DIRECTORY Location to save output + +optional arguments: + --chemistry-version VERSION Run parameters override, chemistry version. 
+ --demux-only, -d Generate demux files and indexing stats without generating FASTQ + --detect-adapters Detect adapters sequences, overriding any sequences present in run manifest. + --error-on-missing Terminate execution for a missing file (by default, missing files are skipped and execution continues). Also set by --strict. + --exclude-tile, -e SELECTION Regex matching tile names to exclude. This flag can be specified multiple times. (e.g. L1.*C0[23]S.) + --filter-mask MASK Run parameters override, custom pass filter mask. + --flowcell-id FLOWCELL_ID Run parameters override, flowcell ID. + --force-index-orientation Do not attempt to find orientation for I1/I2 reads (reverse complement). Use orientation given in run manifest. + --group-fastq Group all FASTQ/stats/metrics for a project are in the project folder (default false) + --help, -h Display this usage statement + --i1-cycles NUM_CYCLES Run parameters override, I1 cycles. + --i2-cycles NUM_CYCLES Run parameters override, I2 cycles. + --include-tile, -i SELECTION Regex matching tile names to include. This flag can be specified multiple times. (e.g. L1.*C0[23]S.) + --input-remote, NAME Rclone remote name for remote ANALYSIS_DIRECTORY + --kit-configuration KIT_CONFIG Run parameters override, kit configuration. + --legacy-fastq Legacy naming for FASTQ files (e.g. SampleName_S1_L001_R1_001.fastq.gz) + --log-level, -l LEVEL Severity level for logging. i.e. DEBUG, INFO, WARNING, ERROR (default INFO) + --no-error-on-invalid Skip invalid files and continue execution (by default, execution is terminated for an invalid file). Overridden by --strict options. + --no-projects Disable project directories (default false) + --num-threads, -p NUMBER Number of threads (default 1) + --num-unassigned NUMBER Max Number of unassigned sequences to report. Must be <= 1000 (default 30) + --output-remote, NAME Rclone remote name for remote OUTPUT_DIRECTORY + --preparation-workflow WORKFLOW Run parameters override, preparation workflow. 
+ --qc-only Quickly generate run stats for single tile without generating FASTQ. Use --include-tile/--exclude-tile to define custom tile set. + --r1-cycles NUM_CYCLES Run parameters override, R1 cycles. + --r2-cycles NUM_CYCLES Run parameters override, R2 cycles. + --run-manifest, -r PATH Location of run manifest to use instead of default RunManifest.csv found in analysis directory + --settings SELECTION Run manifest settings override. This option may be specified multiple times. + --skip-qc-report SELECTION Do not generate HTML QC report. + --split-lanes Split FASTQ files by lane + --strict, -s In strict mode any invalid or missing input file will terminate execution (overrides no-error-on-invalid and sets --error-on-missing) + --version, -v Display bases2fastq version \ No newline at end of file diff --git a/src/bases2fastq/script.sh b/src/bases2fastq/script.sh new file mode 100644 index 00000000..988acbc4 --- /dev/null +++ b/src/bases2fastq/script.sh @@ -0,0 +1,111 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +# Exit on error +set -eo pipefail + +# Unset boolean flags that are "false" so the conditional expansions in args are empty +unset_if_false=( + par_demux_only + par_detect_adapters + par_error_on_missing + par_group_fastq + par_legacy_fastq + par_no_error_on_invalid + par_no_projects + par_qc_only + par_split_lanes + par_strict + par_force_index_orientation +) + +for par in "${unset_if_false[@]}"; do + test_val="${!par}" + [[ "$test_val" == "false" ]] && unset "$par" +done + +# NOTE: --preparation-workflow is bugged in bases2fastq +args=( + ${par_demux_only:+--demux-only} + ${par_detect_adapters:+--detect-adapters} + ${par_error_on_missing:+--error-on-missing} + ${par_group_fastq:+--group-fastq} + ${par_legacy_fastq:+--legacy-fastq} + ${par_no_error_on_invalid:+--no-error-on-invalid} + ${par_no_projects:+--no-projects} + ${par_qc_only:+--qc-only} + ${par_split_lanes:+--split-lanes} + ${par_strict:+--strict} + ${par_force_index_orientation:+--force-index-orientation} + ${par_chemistry_version:+--chemistry-version 
"$par_chemistry_version"} + ${par_filter_mask:+--filter-mask "$par_filter_mask"} + ${par_flowcell_id:+--flowcell-id "$par_flowcell_id"} + ${par_i1_cycles:+--i1-cycles "$par_i1_cycles"} + ${par_i2_cycles:+--i2-cycles "$par_i2_cycles"} + ${par_r1_cycles:+--r1-cycles "$par_r1_cycles"} + ${par_r2_cycles:+--r2-cycles "$par_r2_cycles"} + ${par_kit_configuration:+--kit-configuration "$par_kit_configuration"} + ${par_log_level:+--log-level "$par_log_level"} + ${par_num_unassigned:+--num-unassigned "$par_num_unassigned"} + ${par_preparation_workflow:+--preparation-workflow "$par_preparation_workflow"} + ${meta_cpus:+--num-threads "$meta_cpus"} + ${par_run_manifest:+--run-manifest "$par_run_manifest"} +) + +# Split the ';'-separated multi-value inputs into arrays +IFS=";" read -ra exclude_tile <<< "$par_exclude_tile" +IFS=";" read -ra include_tile <<< "$par_include_tile" + +if [ -z "$par_report" ]; then + args+=( --skip-qc-report ) +fi + +for arg_value in "${exclude_tile[@]}"; do + args+=( "--exclude-tile" "$arg_value" ) +done + +for arg_value in "${include_tile[@]}"; do + args+=( "--include-tile" "$arg_value" ) +done + +echo "> Creating temporary directory." +# create temporary directory and clean up on exit +TMPDIR=$(mktemp -d "$meta_temp_dir/$meta_name-XXXXXX") +echo "> Created $TMPDIR" +function clean_up { + [[ -d "$TMPDIR" ]] && rm -rf "$TMPDIR" +} +trap clean_up EXIT + +args+=( "$par_analysis_directory" "$TMPDIR") +echo "> Running bases2fastq with arguments: ${args[*]}" +bases2fastq "${args[@]}" +echo "> Done running bases2fastq" + +echo "> Output folder:" +tree "$TMPDIR" + +echo "> Moving FASTQ files into final output directory" +mkdir -p "$par_output_directory/" +mv "$TMPDIR"/Samples/* --target-directory="$par_output_directory" + +if [ ! -z "$par_report" ]; then + echo "> Moving HTML report to the output ($par_report)" + mv "$TMPDIR"/*.html "$par_report" +else + echo " > Leaving reports alone" +fi + +# Logs is everything else +if [ ! 
-z "$par_logs" ]; then + mkdir -p "$par_logs" + echo "> Moving logs to their own location ($par_logs)" + mv "$TMPDIR/"* "$par_logs/" +else + echo "> Not moving logs" +fi + + + diff --git a/src/bases2fastq/test.sh b/src/bases2fastq/test.sh new file mode 100644 index 00000000..9334e343 --- /dev/null +++ b/src/bases2fastq/test.sh @@ -0,0 +1,200 @@ +#!/bin/bash + +set -eou pipefail + +# Helper functions: each prints a message and exits 1 when its check fails +assert_file_exists() { + [ -f "$1" ] || { echo "File '$1' does not exist" && exit 1; } +} + +assert_file_not_exists() { + [ ! -f "$1" ] || { echo "File '$1' exists but shouldn't" && exit 1; } +} + +assert_directory_exists() { + [ -d "$1" ] || { echo "Directory '$1' does not exist" && exit 1; } +} + +assert_file_not_empty() { + [ -s "$1" ] || { echo "File '$1' is empty but shouldn't be" && exit 1; } +} +assert_file_contains() { + grep -q "$2" "$1" || { echo "File '$1' does not contain '$2'" && exit 1; } +} + +# Example output +# Note that the format of the fastq file names and organization into subfolders +# can differ based on the arguments provided to bases2fastq + +# |-- 20230404-Bases2Fastq-Sim_QC.html +# |-- IndexAssignment.csv +# |-- Metrics.csv +# |-- RunManifest.csv +# |-- RunManifest.json +# |-- RunParameters.json +# |-- RunStats.json +# |-- Samples +# | |-- DefaultProject +# | | |-- DefaultProject_IndexAssignment.csv +# | | |-- DefaultProject_Metrics.csv +# | | |-- DefaultProject_QC.html +# | | |-- DefaultProject_RunStats.json +# | | |-- sample_0 +# | | | |-- sample_0_L1_R1.fastq.gz +# | | | |-- sample_0_L1_R2.fastq.gz +# | | | |-- sample_0_L2_R1.fastq.gz +# | | | |-- sample_0_L2_R2.fastq.gz +# | | | `-- sample_0_stats.json +# | | |-- sample_1 +# | | | |-- sample_1_L1_R1.fastq.gz +# | | | |-- sample_1_L1_R2.fastq.gz +# | | | |-- sample_1_L2_R1.fastq.gz +# | | | |-- sample_1_L2_R2.fastq.gz +# | | | `-- sample_1_stats.json +# | | |-- sample_2 +# | | | |-- sample_2_L1_R1.fastq.gz +# | | | |-- sample_2_L1_R2.fastq.gz +# | | | |-- sample_2_L2_R1.fastq.gz +# | | | |-- 
sample_2_L2_R2.fastq.gz +# | | | `-- sample_2_stats.json +# | | |-- sample_3 +# | | | |-- sample_3_L1_R1.fastq.gz +# | | | |-- sample_3_L1_R2.fastq.gz +# | | | |-- sample_3_L2_R1.fastq.gz +# | | | |-- sample_3_L2_R2.fastq.gz +# | | | `-- sample_3_stats.json +# | | `-- sample_4 +# | | |-- sample_4_L1_R1.fastq.gz +# | | |-- sample_4_L1_R2.fastq.gz +# | | |-- sample_4_L2_R1.fastq.gz +# | | |-- sample_4_L2_R2.fastq.gz +# | | `-- sample_4_stats.json +# | `-- Unassigned +# | |-- Unassigned_L1_R1.fastq.gz +# | |-- Unassigned_L1_R2.fastq.gz +# | |-- Unassigned_L2_R1.fastq.gz +# | `-- Unassigned_L2_R2.fastq.gz +# |-- UnassignedSequences.csv +# `-- info +# |-- Bases2Fastq.log +# `-- RunManifestErrors.json + + +# create temporary directory and clean up on exit +TMPDIR=$(mktemp -d "$meta_temp_dir/$meta_name-XXXXXX") +function clean_up { + [[ -d "$TMPDIR" ]] && rm -rf "$TMPDIR" +} +trap clean_up EXIT + +# Unpack test input files +TAR_DIR="$TMPDIR/tar" +mkdir -p "$TAR_DIR" +curl http://element-public-data.s3.amazonaws.com/bases2fastq-share/bases2fastq-v2/20230404-bases2fastq-sim-151-151-9-9.tar.gz \ +-o "$TAR_DIR/20230404-bases2fastq-sim-151-151-9-9.tar.gz" + +BCL_DIR="$TMPDIR/bcl" +mkdir "$BCL_DIR" +tar -xvf "$TAR_DIR/20230404-bases2fastq-sim-151-151-9-9.tar.gz" -C "$BCL_DIR" + +mkdir "$TMPDIR/test1" && pushd "$TMPDIR/test1" > /dev/null +expected_out_dir="$TMPDIR/test1/out" +expected_report="$TMPDIR/report.html" +expected_logs="$TMPDIR/logs" +"$meta_executable" \ + --analysis_directory "$BCL_DIR/20230404-bases2fastq-sim-151-151-9-9" \ + --output_directory "$expected_out_dir" \ + --logs "$expected_logs" \ + --report "$expected_report" \ + --include_tile "L1R02C01S1;L2R21C01S1;L1R02C01S2;L2R21C01S2;L1R03C01S2;L2R20C01S2" \ + --exclude_tile "L1R04C01S1" \ + --chemistry_version 2 \ + --i1_cycles 10 \ + --i2_cycles 10 \ + --r1_cycles 152 \ + --r2_cycles 152 \ + --kit_configuration "300Cycles" \ + --detect_adapters \ + --error_on_missing \ + --flowcell_id foo \ + 
--force_index_orientation \ + --group_fastq \ + --legacy_fastq \ + --log_level DEBUG \ + --no_projects \ + --num_unassigned 30 \ + --strict \ + --run_manifest "$BCL_DIR/20230404-bases2fastq-sim-151-151-9-9/RunManifest.csv" + +assert_directory_exists "$expected_out_dir" +assert_directory_exists "$expected_logs" +assert_file_exists "$expected_report" +assert_file_not_empty "$expected_report" + +expected_samples=( + Undetermined_S0 + sample_0_S1 + sample_1_S2 + sample_2_S3 + sample_3_S4 + sample_4_S5 +) + +for sample in "${expected_samples[@]}"; do + for lane in "L001" "L002"; do + for orientation in "R1" "R2"; do + assert_file_exists "$expected_out_dir/${sample}_${lane}_${orientation}_001.fastq.gz" + done + done +done +popd > /dev/null + +mkdir "$TMPDIR/test3" && pushd "$TMPDIR/test3" > /dev/null +expected_out_dir="$TMPDIR/test3/out" +"$meta_executable" \ + --analysis_directory "$BCL_DIR/20230404-bases2fastq-sim-151-151-9-9" \ + --output_directory "$expected_out_dir" + +expected_samples=( + sample_0 + sample_1 + sample_2 + sample_3 + sample_4 +) +tree "$expected_out_dir" + +for sample in "${expected_samples[@]}"; do + for orientation in "R1" "R2"; do + assert_file_exists "$expected_out_dir/DefaultProject/${sample}/${sample}_${orientation}.fastq.gz" + done +done +assert_file_exists "$expected_out_dir/Unassigned/Unassigned_R1.fastq.gz" +assert_file_exists "$expected_out_dir/Unassigned/Unassigned_R2.fastq.gz" +popd > /dev/null + +mkdir "$TMPDIR/test4" && pushd "$TMPDIR/test4" > /dev/null +expected_out_dir="$TMPDIR/test4/out" +"$meta_executable" \ + --analysis_directory "$BCL_DIR/20230404-bases2fastq-sim-151-151-9-9" \ + --output_directory "$expected_out_dir" \ + --split_lanes + +expected_samples=( + "Unassigned/Unassigned" + DefaultProject/sample_0/sample_0 + DefaultProject/sample_1/sample_1 + DefaultProject/sample_2/sample_2 + DefaultProject/sample_3/sample_3 + DefaultProject/sample_4/sample_4 +) +tree "$expected_out_dir" + +for sample in "${expected_samples[@]}"; do + 
for lane in "L1" "L2"; do + for orientation in "R1" "R2"; do + assert_file_exists "$expected_out_dir/${sample}_${lane}_${orientation}.fastq.gz" + done + done +done +popd > /dev/null